In [1]:
import pandas as pd
import pathlib
import matplotlib.pyplot as plt

In [6]:
# Data folders are resolved two levels above the notebook's working directory.
interim = pathlib.Path.cwd().parent.parent.joinpath('data', 'interim')
ref = pathlib.Path.cwd().parent.parent.joinpath('reference')

In [90]:
# Case line list: read the whole file, then keep these five columns.
df = pd.read_csv(interim / 'public.csv', sep=',').loc[
    :, ['date_report', 'sex', 'yob', 'addr_dist_home', 'name_full']
]

# Reference tables; the id columns are forced to strings on load.
# `pop` holds the population figures ("pop by adh" in the original note).
pop = pd.read_csv(ref / 'pop_1.csv', dtype={'id_addiv': 'str'}, sep=',')
addiv = pd.read_csv(
    ref / 'addiv.csv',
    dtype={'id_addiv': 'str', 'of_addiv': 'str'},
    sep=',',
)

In [91]:
# Derive the analysis columns from the raw line list.
# Dates that do not match '%Y/%m/%d' become NaT (errors='coerce').
# NOTE(review): age uses a fixed reference year of 2021, and a missing yob
# gives age NaN, which makes `adult` False -- confirm that is intended.
df = df.assign(
    date_report=pd.to_datetime(df.date_report, errors='coerce', format='%Y/%m/%d'),
    age=lambda d: 2021 - d.yob.astype('float'),
    adult=lambda d: d.age > 18,
)

# Left-closed bins: [0, 17), [17, 45), [45, 65), [65, 200).
# NOTE(review): the last bin includes age 65 although its label reads '> 65'.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 17, 45, 65, 200],
    labels=['0 - 16', '17 - 44', '45 - 64', '> 65'],
    right=False,
)

In [169]:
# Display the population reference table read from pop_1.csv.
pop

Unnamed: 0,id_addiv,pop,pop_NAM,pop_NU,pop_TT,pop_TT_NAM,pop_TT_NU,pop_NT,pop_NT_NAM,pop_NT_NU
0,79,8993082,4381242,4611840,7125493,3448709,3676784,1867589,932533,935056
1,760,142625,65928,76697,142625,65928,76697,0,0,0
2,26734,23258,10718,12540,23258,10718,12540,0,0,0
3,26737,14970,6998,7972,14970,6998,7972,0,0,0
4,26740,10633,4999,5634,10633,4999,5634,0,0,0
...,...,...,...,...,...,...,...,...,...,...
342,27670,5637,2929,2708,0,0,0,5637,2929,2708
343,27673,13281,6727,6554,0,0,0,13281,6727,6554
344,27676,4512,2523,1989,0,0,0,4512,2523,1989
345,27679,10715,5430,5285,0,0,0,10715,5430,5285


In [185]:
def get_no_case(data_in, pop=10000000, rolling=7):
    """Build a daily case-count table with rates, rolling means and cumulative sums.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Line list with one row per case and a datetime ``date_report`` column.
    pop : int or float, default 10 000 000
        Population used as the denominator of the per-100 000 rate.
    rolling : int, default 7
        Window length (in rows, i.e. days) of the rolling means. It is also
        used to build the ``<rolling>d`` suffix of the rolling-mean columns.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day between the first and last ``date_report``
        (days without cases are filled with 0), with columns ``no_case``,
        ``no_case_ppop`` (cases per 100 000, rounded to 2 decimals),
        ``no_case_rollmean<rolling>d``, ``no_case_ppop_rmean<rolling>d``,
        ``no_case_cumsum`` and ``no_case_ppop_cumsum``.
    """
    # Every calendar day in the reporting period, so gaps show up as zero cases.
    all_days = pd.date_range(
        start=data_in.date_report.min(),
        end=data_in.date_report.max(),
        freq='D',
    )
    df = (
        data_in.groupby('date_report')
        .size()
        .to_frame(name='no_case')
        .reindex(all_days)
        .fillna(0)
    )
    df['no_case_ppop'] = (df['no_case'] / pop * 100000).round(2)

    name = str(rolling) + 'd'
    # The window follows the `rolling` argument; it used to be fixed at 7 even
    # when the column suffix said otherwise.
    df['no_case_rollmean' + name] = df['no_case'].rolling(rolling).mean()
    df['no_case_ppop_rmean' + name] = df['no_case_ppop'].rolling(rolling).mean()
    df['no_case_cumsum'] = df['no_case'].cumsum()
    df['no_case_ppop_cumsum'] = df['no_case_ppop'].cumsum()
    return df

def get_no_case_by_group(data_in, group=[], rolling=7, available_pop=False, pop_table=None):
    """Daily case counts per group value, in long form, for every calendar day.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Line list with one row per case, a datetime ``date_report`` column and
        the grouping column named in ``group``.
    group : list of str
        List holding exactly ONE column name to group by (the default empty
        list is never mutated and is rejected with a ValueError).
    rolling : int, default 7
        Window length (in days) of the per-group rolling means; also used for
        the ``<rolling>d`` suffix of the rolling-mean column names.
    available_pop : bool, default False
        When True, per-100 000 rates are added by matching the group value
        against the ``id_addiv`` column of the population table.
    pop_table : pandas.DataFrame, optional
        Population table with ``id_addiv`` and ``pop`` columns. When omitted,
        the notebook-level ``pop`` frame is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per (day, group value), ordered by day then group value, with
        ``date_report``, the group column and ``no_case``. With
        ``available_pop`` the columns ``no_case_ppop``,
        ``no_case_ppop_rollmean<rolling>d`` and ``no_case_ppop_cumsum`` follow.
        ``no_case_rollmean<rolling>d`` and ``no_case_cumsum`` always come last.
        Group values without a population match get NaN in the per-population
        columns.

    Raises
    ------
    ValueError
        If ``group`` does not contain exactly one column name.
    pandas.errors.MergeError
        If the population table lists the same ``id_addiv`` more than once.

    Notes
    -----
    District names can be attached afterwards by merging the result with
    ``addiv[['id_addiv', 'name_addiv_2']]`` on the district id.
    """
    if len(group) != 1:
        raise ValueError(
            'group must be a list with exactly one column name, got %r' % (group,)
        )
    key = group[0]
    suffix = str(rolling) + 'd'

    def _rolling_mean(values):
        # Trailing mean over `rolling` consecutive days of one group.
        return values.rolling(rolling).mean()

    # Wide table (days x group values); asfreq inserts days without any case.
    wide = (
        data_in.groupby(['date_report', key])
        .size()
        .unstack(fill_value=0)
        .asfreq('D', fill_value=0)
        .rename_axis(index='date_report', columns=key)
    )
    # Back to long form: one row per (day, group value), zeros included.
    out = wide.stack().rename('no_case').reset_index()

    if available_pop:
        population = (pop if pop_table is None else pop_table)[['id_addiv', 'pop']]
        out = (
            out.merge(
                population,
                how='left',
                left_on=key,
                right_on='id_addiv',
                # A duplicated id would silently multiply rows; fail loudly instead.
                validate='many_to_one',
            )
            .drop(columns=['id_addiv'])
        )
        out['no_case_ppop'] = out['no_case'] / out['pop'] * 100000
        out = out.drop(columns='pop')
        out['no_case_ppop_rollmean' + suffix] = (
            out.groupby(key)['no_case_ppop'].transform(_rolling_mean)
        )
        # Rows are already in date order, so a per-group cumsum is index-aligned
        # and does not rely on matching row positions after a reset_index.
        out['no_case_ppop_cumsum'] = out.groupby(key)['no_case_ppop'].cumsum()

    out['no_case_rollmean' + suffix] = (
        out.groupby(key)['no_case'].transform(_rolling_mean)
    )
    out['no_case_cumsum'] = out.groupby(key)['no_case'].cumsum()
    return out


In [102]:
# Total population: the 'pop' value of the reference row whose id_addiv is '79'.
# .iloc[0] takes the first matching row by position; the previous `[...][0]`
# was a label lookup that only worked while that row carried index label 0.
pop_total = int(pop.loc[pop.id_addiv == '79', 'pop'].iloc[0])

# Daily counts for all cases; get_no_case already adds the rolling-mean and
# cumulative-sum columns, so nothing is recomputed here.
no_case = get_no_case(df, pop=pop_total)

In [182]:
# Per-district series: rows whose home district is coded 'UNKN' are left out
# before counting, and per-population rates are requested.
data_in_get_inc_by_group_adh = df.loc[df['addr_dist_home'].ne('UNKN')]
no_case_by_adh = get_no_case_by_group(
    data_in_get_inc_by_group_adh, group=['addr_dist_home'], available_pop=True
)


In [186]:
# Per-sex series: rows whose sex is the literal code 'NAN' are left out before
# counting; no population table is used for this grouping.
data_in_get_inc_by_group_sex = df.loc[df['sex'].ne('NAN')]
no_case_by_sex = get_no_case_by_group(
    data_in_get_inc_by_group_sex, group=['sex'], available_pop=False
)

In [187]:
# Inspection cell: shows the per-sex table. The commented expressions below are
# alternative previews that were used while checking the other tables.
# no_case_by_adh
# pop.info()
# no_case_by_adh[no_case_by_adh.addr_dist_home == '764'].head(30)
# no_case
no_case_by_sex

Unnamed: 0,date_report,sex,no_case,no_case_rollmean7d,no_case_cumsum
0,2021-05-27,NAM,19,,19
1,2021-05-27,NU,17,,17
2,2021-05-28,NAM,14,,33
3,2021-05-28,NU,17,,34
4,2021-05-29,NAM,21,,54
...,...,...,...,...,...
157,2021-08-13,NU,2529,2092.142857,76370
158,2021-08-14,NAM,1979,1877.000000,71226
159,2021-08-14,NU,2205,2114.428571,78575
160,2021-08-15,NAM,1488,1784.714286,72714


In [7]:
# Number of cases per report date.
inc = df[['date_report']].groupby(['date_report']).apply(len).to_frame(name='case')

# Expand to one row per calendar day between the first and last report date;
# days without any case get 0.
inc = inc.reindex(
    pd.date_range(start=df.date_report.min(), end=df.date_report.max(), freq='D')
).fillna(0)

# 7-day rolling mean of the daily count (a 7-day rolling sum, `case_rs`, was
# tried earlier and is left disabled).
inc['case_rm'] = inc['case'].rolling(7).mean()

In [10]:
def calculate_incidence(df, group_col, roll_mean=False, pivot=True):
    """Count daily cases per value of ``group_col`` over a gap-free daily range.

    Parameters
    ----------
    df : pandas.DataFrame
        Line list with a datetime ``date_report`` column and ``group_col``.
    group_col : str
        Column whose values define the groups.
    roll_mean : bool, default False
        Add ``case_rm``, the 7-day rolling mean of ``case`` within each group.
    pivot : bool, default True
        Return a wide table (dates x groups) instead of the long table. Only
        ``case`` is pivoted, so ``case_rm`` is not part of the wide result.

    Returns
    -------
    pandas.DataFrame
        Long form: columns ``date_report``, ``group_col``, ``case`` (and
        ``case_rm`` when requested). Wide form: index ``date_report`` and
        two-level columns ``('case', <group value>)``.
    """
    keys = ['date_report', group_col]

    # Cases per (date, group), spread wide so asfreq can insert missing days.
    counts = df[keys].groupby(keys).apply(len).to_frame(name='case')
    daily_wide = counts.unstack(fill_value=0).asfreq('D', fill_value=0)

    # Long form again, keeping the date as index for the rolling step below.
    inc = daily_wide.stack().sort_index(level=0).reset_index(level=1)

    if roll_mean:
        per_group = inc.groupby(group_col)['case']
        inc['case_rm'] = per_group.transform(lambda s: s.rolling(7).mean())

    inc = inc.reset_index(level=0)

    if not pivot:
        return inc
    return inc.pivot(index='date_report', columns=group_col, values=['case'])

In [19]:
# Per-district daily incidence in long form, with the 7-day rolling mean.
inc_adh_rollmean = calculate_incidence(df, 'addr_dist_home', roll_mean=True, pivot=False)

# Cumulative case count per district: sum per (district, day), then a running
# total within each district (index level 0).
inc_adh_cumsum = (
    calculate_incidence(df, 'addr_dist_home', roll_mean=False, pivot=False)
    .groupby(['addr_dist_home', 'date_report'])
    .sum()
    .groupby(level=0)
    .cumsum()
    .reset_index()
)

# Columns are renamed by VALUE, not by position. The pivot sorts its columns,
# so 'NAM' (male) comes before 'NU'/'NỮ' (female); the former positional
# ['female', 'male'] assignment therefore swapped the two sexes. Any other code
# present in the data (e.g. 'NAN') keeps its own column name.
inc_sex = (
    calculate_incidence(df, 'sex', roll_mean=False, pivot=True)['case']
    .rename(columns={'NAM': 'male', 'NU': 'female', 'NỮ': 'female'})
    .rename_axis(columns=None)
)

# adult is True when age > 18, so False -> 'lte18' and True -> 'gt18'.
inc_adult = (
    calculate_incidence(df, 'adult', roll_mean=False, pivot=True)['case']
    .rename(columns={False: 'lte18', True: 'gt18'})
    .rename_axis(columns=None)
)

In [23]:
# inc_adh_cumsum[inc_adh_cumsum.addr_dist_home == 'QA08']
# inc_adult

In [None]:
# Daily case counts by sex, one column per sex ('male', 'female').
# - Codes are mapped by value: 'NAM' -> male, 'NU' / 'NỮ' -> female. The earlier
#   filter only matched 'NỮ', while the sex column displayed in this notebook
#   uses 'NU', which left a single column and broke the two-name assignment.
# - Rows with any other sex code (or a missing date) map to NaN and are dropped
#   by the groupby.
# - Days on which one sex has no case get 0 instead of NaN, so the stacked bar
#   chart can use the male series as `bottom`.
inc_sex = (
    df.assign(sex=df['sex'].map({'NAM': 'male', 'NU': 'female', 'NỮ': 'female'}))
    .groupby(['date_report', 'sex'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['male', 'female'], fill_value=0)
    .rename_axis(columns=None)
)

In [None]:
# Daily case counts by adult flag, one column per class ('gt18', 'lte18').
# The flag is mapped by value (True -> 'gt18', False -> 'lte18'). The earlier
# positional assignment ['gt18', 'lte18'] was applied to pivot columns sorted as
# [False, True], which labelled the <= 18 series as 'gt18' and vice versa.
# Days on which one class has no case get 0 instead of NaN.
inc_adult = (
    df.assign(adult=df['adult'].map({True: 'gt18', False: 'lte18'}))
    .groupby(['date_report', 'adult'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['gt18', 'lte18'], fill_value=0)
    .rename_axis(columns=None)
)

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Stacked daily bars: male counts at the bottom, female counts on top.
ax.bar(inc_sex.index, inc_sex['male'], color='blue', alpha=0.4, label='Nam')
ax.bar(
    inc_sex.index,
    inc_sex['female'],
    bottom=inc_sex['male'],
    color='red',
    alpha=0.4,
    label='Nữ',
)

# 7-day rolling mean of all cases, drawn over the bars.
ax.plot(
    inc.index,
    inc['case_rm'],
    color='black',
    linewidth=3,
    label='Biến động trung bình 7 ngày',
)

ax.set(
    xlabel='Ngày',
    ylabel='Số ca mới',
    title='Số ca mới theo ngày và biến động trung bình 7 ngày',
)
# One tick per calendar day, rotated so the dates stay readable.
ax.set_xticks(inc.index)
ax.tick_params(axis='x', labelrotation=90)
ax.legend()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Daily totals as bars with their 7-day rolling mean on top.
ax.bar(inc.index, inc['case'], color='gray', linewidth=3, label='Số ca mới theo ngày')
ax.plot(
    inc.index,
    inc['case_rm'],
    color='black',
    linewidth=3,
    label='Biến động trung bình 7 ngày',
)

# Daily counts split by the adult flag.
ax.plot(inc_adult.index, inc_adult['gt18'], color='blue', linewidth=2, label='> 18')
ax.plot(inc_adult.index, inc_adult['lte18'], color='red', linewidth=2, label='<= 18')

ax.set(
    xlabel='Ngày',
    ylabel='Số ca mới',
    title='Số ca mới theo ngày và biến động trung bình 7 ngày',
)
# One tick per calendar day, rotated so the dates stay readable.
ax.set_xticks(inc.index)
ax.tick_params(axis='x', labelrotation=90)
ax.legend()

In [None]:
# Display the daily incidence table (columns: case, case_rm).
inc