last update: 2021-09-06

In [1]:
import pandas as pd
import pathlib

In [2]:
# Project folders, located two directory levels above the current working directory.
src = pathlib.Path.cwd().parent.parent.joinpath('data', 'interim')
ref = pathlib.Path.cwd().parent.parent.joinpath('reference')

In [46]:
# Interim public data set used as the event line list.
df = pd.read_csv(src.joinpath('public-data.csv'))
# Reference tables; the administrative-division id columns are read as text
# rather than being parsed as numbers.
pop = pd.read_csv(
    ref.joinpath('pop_1.csv'),
    dtype={'id_addiv': 'str'},
)
addiv = pd.read_csv(
    ref.joinpath('addiv.csv'),
    dtype={'id_addiv': 'str', 'of_addiv': 'str'},
)


In [18]:
# Parse the report date so it can be used for daily grouping and date ranges.
df['date_report'] = df['date_report'].pipe(pd.to_datetime)

In [19]:
# Structural check of the loaded frame: column dtypes and non-null counts.
# Other ad-hoc looks kept for reference: df.head(), df.addr_prov_home.unique()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246257 entries, 0 to 246256
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   name_full        246257 non-null  object        
 1   sex              246257 non-null  object        
 2   yob              245863 non-null  float64       
 3   addr_prov_home   246257 non-null  int64         
 4   addr_dist_home   246257 non-null  object        
 5   addr_ward_home   239204 non-null  float64       
 6   addr_home        246247 non-null  object        
 7   date_report      246176 non-null  datetime64[ns]
 8   date_positive    242164 non-null  object        
 9   place_recognize  246257 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 18.8+ MB


In [29]:
def get_no_event(data, date_col='date_report', no_col='no_test', pop=10000000, rolling=7):
    """
    Count number of (no) event by date

    Args:
        data: Data frame, input data frame with one row per event
        date_col: String, name of date column, default 'date_report'
        no_col: String, name of event count column, default 'no_test'
        pop: Int, population used for the per-100,000 rates, default 10000000
        rolling: Int, rolling window (number of daily rows), default 7

    Return:
        A data frame indexed by day, covering every day from the earliest to
        the latest value of date_col, with columns (in this order):
            <no_col>                          events on that day
            <no_col>_ppop                     events per 100,000 population
            <no_col>_rollmean<rolling>d       rolling mean of <no_col>
            <no_col>_ppop_rollmean<rolling>d  rolling mean of <no_col>_ppop
            <no_col>_cumsum                   running total of <no_col>
            <no_col>_ppop_cumsum              running total of <no_col>_ppop
    """
    # name of columns to be created
    col_per_pop = no_col + '_ppop'
    col_rollmean = no_col + '_rollmean' + str(rolling) + 'd'
    col_per_pop_rollmean = no_col + '_ppop_rollmean' + str(rolling) + 'd'
    col_cumsum = no_col + '_cumsum'
    col_per_pop_cumsum = no_col + '_ppop_cumsum'

    # every calendar day in the observed range, so days without events get a row
    all_days = pd.date_range(start=data[date_col].min(), end=data[date_col].max(), freq='D')
    # number of events by date; rows whose date is missing are not counted.
    # fill_value=0 keeps the counts as integers whether or not there are gaps.
    # NOTE: only timestamps that fall exactly on a daily tick match all_days.
    daily = (
        data.groupby(date_col)
        .size()
        .to_frame(name=no_col)
        .reindex(all_days, fill_value=0)
    )

    # number of events per 100,000 population by date
    daily[col_per_pop] = (daily[no_col] / pop * 100000).round(3)
    # rolling mean number of events by date
    daily[col_rollmean] = daily[no_col].rolling(rolling).mean()
    # rolling mean number of events per population by date; uses the same
    # window as the column name advertises (this was hard-coded to 7 before)
    daily[col_per_pop_rollmean] = daily[col_per_pop].rolling(rolling).mean()
    # cumulative sum number of events by date
    daily[col_cumsum] = daily[no_col].cumsum()
    # cumulative sum number of events per population by date
    daily[col_per_pop_cumsum] = daily[col_per_pop].cumsum()
    return daily

In [32]:
# Population of the `pop` row whose id_addiv is '79'.
# .iloc[0] takes the first matching row by position; the previous `[0]` was a
# label lookup and only worked when that row happened to have index label 0.
pop_total = int(pop.loc[pop['id_addiv'] == '79', 'pop'].iloc[0])
# Daily case counts for the whole data set. The counting helper defined in this
# notebook is get_no_event (there is no get_no_case), so call that one.
no_case = get_no_event(data=df, pop=pop_total,
                       date_col='date_report', no_col='no_case', rolling=7)

In [34]:
# no_case

In [44]:
def get_no_case_by_group(data, group_col, date_col='date_report', no_col='no_case',
                         available_pop=False, getname=False, rolling=7):
    """
    Count number of (no) case by group and date

    Same computation as get_no_event_by_group, kept under this name with
    'no_case' as the default count column. All arguments are forwarded
    unchanged, so there is a single implementation to maintain.

    Args:
        data: Data frame, input data frame with one row per case
        group_col: String, name of the grouping column
        date_col: String, name of date column, default 'date_report'
        no_col: String, name of event count column, default 'no_case'
        available_pop: Boolean, if population data frame by group is available, default False
        getname: Boolean, to get group label if it exists, default False
        rolling: Int, rolling window, default 7

    Return:
        A data frame, as returned by get_no_event_by_group
    """
    return get_no_event_by_group(
        data=data,
        group_col=group_col,
        date_col=date_col,
        no_col=no_col,
        available_pop=available_pop,
        getname=getname,
        rolling=rolling,
    )

In [51]:
def get_no_event_by_group(data,
                          group_col,
                          date_col='date_report',
                          no_col='no_test',
                          available_pop=False,
                          getname=False, rolling=7):
    """
    Count number of (no) event by group and date

    Args:
        data: Data frame, input data frame with one row per event
        group_col: String, name of the grouping column
        date_col: String, name of date column, default 'date_report'
        no_col: String, name of event count column, default 'no_test'
        available_pop: Boolean, if population data frame by group is available, default False
        getname: Boolean, to get group label if it exists, default False
        rolling: Int, rolling window (number of daily rows per group), default 7

    Return:
        A data frame in long format with one row per (day, group) for every day
        between the first and last date and every group seen in the data.
        Columns: date_col, group_col, <no_col>, <no_col>_rollmean<rolling>d,
        <no_col>_cumsum, and when available_pop is True also <no_col>_ppop,
        <no_col>_ppop_rollmean<rolling>d and <no_col>_ppop_cumsum.
        When getname is True the group ids are replaced by their labels and
        group_col becomes the last column.

    Notes:
        Reads two notebook-level data frames instead of taking them as arguments:
        `pop` (columns 'id_addiv', 'pop') when available_pop is True, and
        `addiv` (columns 'id_addiv', 'name_addiv_2') when getname is True.
        Groups without a matching row in `pop` get NaN in all per-population
        columns, including the cumulative one.
    """
    # name of columns to be created
    col_per_pop = no_col + '_ppop'
    col_rollmean = no_col + '_rollmean' + str(rolling) + 'd'
    col_per_pop_rollmean = no_col + '_ppop_rollmean' + str(rolling) + 'd'
    col_cumsum = no_col + '_cumsum'
    col_per_pop_cumsum = no_col + '_ppop_cumsum'

    # number of events by date (rows) and group (columns); date/group pairs
    # without events and days missing from the data are filled with 0
    wide = (
        data.groupby([date_col, group_col])
        .size()
        .unstack(fill_value=0)
        .asfreq('D', fill_value=0)
    )
    # back to long format: one row per (date, group)
    out = wide.stack().rename(no_col).reset_index()

    # rolling mean number of events, computed within each group
    out[col_rollmean] = (
        out.groupby(group_col)[no_col]
        .transform(lambda s: s.rolling(rolling).mean())
    )
    # cumulative sum number of events within each group; the result keeps the
    # row index of `out`, so no positional re-alignment is needed
    out[col_cumsum] = out.groupby(group_col)[no_col].cumsum()

    # Get number of events per population
    if available_pop:
        # merge with population (notebook-level `pop` frame)
        out = (
            out.merge(
                pop[['id_addiv', 'pop']],
                how='left',
                left_on=group_col,
                right_on='id_addiv')
            .drop(columns=['id_addiv'])
        )
        # number of events per 100,000 population by group
        out[col_per_pop] = (out[no_col] / out['pop'] * 100000).round(3)
        out = out.drop(columns='pop')
        # rolling mean number of events per population by group
        out[col_per_pop_rollmean] = (
            out.groupby(group_col)[col_per_pop]
            .transform(lambda s: s.rolling(rolling).mean())
        )
        # cumulative sum number of events per population by group
        out[col_per_pop_cumsum] = out.groupby(group_col)[col_per_pop].cumsum()

    # Get group name (optional), from the notebook-level `addiv` frame
    if getname:
        out = (
            out.merge(
                addiv[['id_addiv', 'name_addiv_2']],
                how='left',
                left_on=group_col,
                right_on='id_addiv'
            )
            .drop(columns=[group_col, 'id_addiv'])
            .rename(columns={'name_addiv_2': group_col})
        )
    return out


In [52]:
# NOTE(review): `no_case_by_adh` is not assigned in any cell visible in this
# notebook, so this display depends on leftover kernel state. Judging by the
# columns shown in the output, it presumably came from get_no_case_by_group
# with group_col='addr_dist_home', available_pop=True, getname=True.
# TODO confirm and add the assigning cell.
no_case_by_adh

Unnamed: 0,date_report,no_case,no_case_rollmean7d,no_case_cumsum,no_case_ppop,no_case_ppop_rollmean7d,no_case_ppop_cumsum,addr_dist_home
0,2021-05-27,0,,0,0.0,,0.0,QUAN 01
1,2021-05-27,0,,0,0.0,,0.0,QUAN 12
2,2021-05-27,0,,0,0.0,,0.0,QUAN THU DUC
3,2021-05-27,0,,0,0.0,,0.0,QUAN 09
4,2021-05-27,0,,0,0.0,,0.0,QUAN GO VAP
...,...,...,...,...,...,...,...,...
2395,2021-09-03,0,0.0,0,0.0,0.0,0.0,HUYEN CU CHI
2396,2021-09-03,0,0.0,0,0.0,0.0,0.0,HUYEN HOC MON
2397,2021-09-03,0,0.0,0,0.0,0.0,0.0,HUYEN BINH CHANH
2398,2021-09-03,0,0.0,0,0.0,0.0,0.0,HUYEN NHA BE
