In [1]:
import pandas as pd
import numpy as np
import pathlib

In [2]:
# Both data folders hang off the directory two levels above the working directory.
project_root = pathlib.Path.cwd().parent.parent
src = project_root / 'data' / 'interim'
ref = project_root / 'reference'

In [3]:
# Load the main data set and the two reference tables. The `id_addiv` and
# `of_addiv` columns are read as strings instead of being inferred as numbers.
df = pd.read_csv(src / 'public-data.csv')
pop = pd.read_csv(ref / 'pop_1.csv', dtype={'id_addiv': 'str'})
addiv = pd.read_csv(
    ref / 'addiv.csv',
    dtype={'id_addiv': 'str', 'of_addiv': 'str'},
)

In [4]:
# read_csv leaves `date_report` as text; convert it to datetimes for the daily counts.
df = df.assign(date_report=pd.to_datetime(df['date_report']))

In [5]:
# Preview the first five rows of the loaded frame.
# NOTE(review): the rendered output includes full names and home addresses;
# consider clearing it before sharing the notebook.
df.head(n=5)

Unnamed: 0,name_full,sex,yob,addr_prov_home,addr_dist_home,addr_ward_home,addr_home,date_report,date_positive,place_recognize
0,BÙI THỊ THU,NU,1967.0,79,776,27427.0,"19D BINH DUC MEDIC , PHUONG 15",2021-07-10,2021-07-09,THIEU THONG TIN
1,BÙI VĂN TIẾN,NAM,1991.0,79,786,27643.0,2056/5A NHA BE,2021-07-10,2021-07-08,SANG LOC TAI BV
2,ĐẶNG HOÀNG PHƯỚC,NAM,1994.0,79,767,27028.0,307/43 THACH LAM,2021-07-10,2021-07-08,KHU CACH LY
3,ĐẶNG NGỌC,NU,1964.0,79,773,27292.0,88/38/7 NGUYEN KHOAI,2021-07-10,2021-07-08,KHU PHONG TOA
4,ĐẶNG THỊ CÚC,NU,1989.0,79,768,27073.0,102/35 HO BIEU CHANH,2021-07-11,2021-07-05,SANG LOC TAI BV


In [27]:
def get_no(data, date_col='date_report', no_col='no_case'):
    """
    Count the number of (no) events per calendar day.

    Timestamps in `date_col` are truncated to midnight, so all events on the
    same day are counted together. The result covers every day from the
    earliest to the latest event; days without any event get a count of 0.

    Args:
        data: Data frame, input data frame with one row per event
        date_col: String, name of date column, default 'date_report'.
            Values must be datetimes or parseable by `pd.to_datetime`;
            rows with a missing date are ignored.
        no_col: String, name of event count column, default 'no_case'

    Return:
        A data frame indexed by day (index name 'date') with a single
        integer column `no_col`. It is empty when `data` has no valid date.
    """
    days = pd.to_datetime(data[date_col]).dt.normalize().dropna()
    if days.empty:
        # pd.date_range cannot be built from NaT bounds, so return early
        return pd.Series(
            [], index=pd.DatetimeIndex([], name='date'), dtype='int64'
        ).to_frame(name=no_col)

    # number of events by date, reindex to fill missing date entry with 0
    idx = pd.date_range(start=days.min(), end=days.max(), freq='D')
    df = (
        days.groupby(days)
        .size()
        # fill_value keeps the counts integer; reindex + fillna would upcast to float
        .reindex(idx, fill_value=0)
        .rename_axis('date')
        .to_frame(name=no_col)
    )

    return df

def get_no_by_group(data, group_col=None, date_col='date_report', no_col='no_case'):
    """
    Count the number of (no) events by group and calendar day.

    Every observed group combination gets one row for every day between the
    earliest and the latest event in the whole data set; a group with no
    event on a given day gets a count of 0. Timestamps in `date_col` are
    truncated to midnight before counting.

    Args:
        data: Data frame, input data frame with one row per event
        group_col: list of group columns (a single column name is also
            accepted). At least one column is required.
        date_col: String, name of date column, default 'date_report'.
            Values must be datetimes or parseable by `pd.to_datetime`.
        no_col: String, name of event count column, default 'no_case'

    Return:
        A long-format data frame indexed by day (index name 'date') with the
        group columns followed by the integer column `no_col`. Rows are
        ordered by day, then by group. It is empty when `data` has no
        countable row.

    Raises:
        ValueError: if no group column is given.
    """
    if group_col is None:
        keys = []
    elif isinstance(group_col, str):
        keys = [group_col]
    else:
        keys = list(group_col)
    if not keys:
        raise ValueError('group_col must name at least one grouping column')

    # work on a narrow copy so the caller's frame is never modified
    events = data[keys].copy()
    events[date_col] = pd.to_datetime(data[date_col]).dt.normalize()
    counts = events.groupby(keys + [date_col]).size()
    if counts.empty:
        return pd.DataFrame(
            columns=keys + [no_col],
            index=pd.DatetimeIndex([], name='date'),
        )

    # one column per observed group combination, one row per observed day
    wide = counts.unstack(keys, fill_value=0)
    # span the full date range of the data set, not just each group's own range,
    # so days on which no group has an event are still present
    idx = pd.date_range(start=wide.index.min(), end=wide.index.max(), freq='D')
    df = (
        wide.reindex(idx, fill_value=0)
        .rename_axis(index='date')
        .stack(list(range(len(keys))))
        # stacking several column levels can upcast to float; counts are integers
        .astype('int64')
        .rename(no_col)
        .reset_index(level=keys)
    )
    return df

In [33]:
# Daily counts: overall, then split by `sex` and by `addr_dist_home`.
no_case = get_no(df)
no_case_by_sex, no_case_by_adh = (
    get_no_by_group(df, group_col=[column])
    for column in ('sex', 'addr_dist_home')
)

In [35]:
# Uncomment one of the lines below to display the corresponding result table:
# no_case_by_sex
# no_case
# no_case_by_adh