In [1]:
import pandas as pd
import pathlib
import unidecode
from datetime import date

In [2]:
src = pathlib.Path().cwd().parent.parent / 'data' / 'raw' / 'test'
ref = pathlib.Path().cwd().parent.parent / 'reference'

In [3]:
def preprocess_string(row):
    s = unidecode.unidecode(row).upper().strip()
    return s

def preprocess_addr(row):
    p = row.addr_prov_home
    d = row.addr_dist_home
    w = row.addr_ward_home

In [47]:
raw = pd.read_csv(src / 'test-merge-2021-08-05.csv')

In [48]:
with open(ref / 'col-name-test.txt', 'r') as file:
    raw.columns = file.read().split('\n')
    
raw = raw[['id', 'date_sample', 'sex', 'yob', 'reason', 'result',
         'addr_prov_home', 'addr_dist_home', 'addr_ward_home',
         'ct_e', 'ct_n', 'ct_rdrp']]

In [49]:
df = raw.assign(
    sex = raw.sex.astype('str').apply(preprocess_string),
    reason = raw.reason.astype('str').apply(preprocess_string),
    result = raw.result.astype('str').apply(preprocess_string),
    addr_ward_home = raw.addr_ward_home.astype('str').apply(preprocess_string),
    addr_dist_home = raw.addr_dist_home.astype('str').apply(preprocess_string),
    addr_prov_home = raw.addr_prov_home.astype('str').apply(preprocess_string),
    date_sample = raw.date_sample.astype('str').apply(lambda x: x[0:10]),
    age = date.today().year - raw.yob
)

df['age_group'] = pd.cut(
    df.age,
    bins = [0, 17, 45, 65, 200],
    labels = ['0 - 16', '17 - 44', '45 - 64', '> 65'],
    right = False
)

df.date_sample = pd.to_datetime(
    df.date_sample, 
    format='%d/%m/%Y',
    errors='coerce')

df['positive'] = df.result == 'DUONG TINH'

In [50]:
# df

In [52]:
df['adh'] = df.addr_dist_home

df.loc[df.addr_ward_home.isin([
    'PHUONG THAO DIEN', 'PHUONG THU THIEM', 'PHUONG THANH MY LOI', 'PHUONG CAT LAI', 'PHUONG BINH TRUNG TAY',
    'PHUONG BINH TRUNG DONG', 'PHUONG BINH KHANH', 'PHUONG BINH AN', 'PHUONG AN PHU', 'PHUONG AN LOI DONG',
    'PHUONG AN KHANH'
]), 'adh'] = 'QUAN 02'

df.loc[df.addr_ward_home.isin([
    'PHUONG BINH CHIEU', 'PHUONG BINH THO', 'PHUONG HIEP BINH CHANH', 'PHUONG HIEP BINH PHUOC',
    'PHUONG LINH CHIEU', 'PHUONG LINH DONG', 'PHUONG LINH TAY', 'PHUONG LINH TRUNG', 'PHUONG LINH XUAN',
    'PHUONG TAM BINH', 'PHUONG TAM PHU', 'PHUONG TRUONG THO'
]), 'adh'] = 'QUAN THU DUC'

df.loc[df.addr_ward_home.isin([
    'PHUONG HIEP PHU', 'PHUONG LONG BINH', 'PHUONG LONG PHUOC', 'PHUONG LONG THANH MY', 'PHUONG LONG TRUONG',
    'PHUONG PHU HUU', 'PHUONG PHUOC BINH', 'PHUONG PHUOC LONG A', 'PHUONG PHUOC LONG B', 'PHUONG TAN PHU',
    'PHUONG TANG NHON PHU A', 'PHUONG TANG NHON PHU B', 'PHUONG TRUONG THANH'
]), 'adh'] = 'QUAN 09'

df.loc[df.addr_dist_home == 'QUAN 1', 'adh'] = 'QUAN 01'
df.loc[df.addr_dist_home == 'QUAN 2', 'adh'] = 'QUAN 02'
df.loc[df.addr_dist_home == 'QUAN 3', 'adh'] = 'QUAN 03'
df.loc[df.addr_dist_home == 'QUAN 4', 'adh'] = 'QUAN 04'
df.loc[df.addr_dist_home == 'QUAN 5', 'adh'] = 'QUAN 05'
df.loc[df.addr_dist_home == 'QUAN 6', 'adh'] = 'QUAN 06'
df.loc[df.addr_dist_home == 'QUAN 7', 'adh'] = 'QUAN 07'
df.loc[df.addr_dist_home == 'QUAN 8', 'adh'] = 'QUAN 08'
df.loc[df.addr_dist_home == 'QUAN 9', 'adh'] = 'QUAN 09'

df = df.drop(columns=['addr_dist_home'])
df = df.rename(columns={'adh': 'addr_dist_home'})

In [53]:
mask_addr = (
    (df.addr_prov_home != 'THANH PHO HO CHI MINH')
    | (df.addr_dist_home == 'THANH PHO DI AN')
    | (df.addr_dist_home == 'THANH PHO THUAN AN')
)

In [54]:
df.loc[mask_addr, 'addr_prov_home'] = 'KHAC'
df.loc[mask_addr, 'addr_dist_home'] = 'KHAC'

In [55]:
# df[df.addr_dist_home == 'THANH PHO THU DUC']

In [67]:
def get_no_test(data):
    df = (
        data.groupby(['date_sample'])
        .size()
        .to_frame(name='no_test')
        .reindex(pd.date_range(start=data.date_sample.min(), end=data.date_sample.max(), freq='D'))
        .fillna(0)
    )
    return df

def get_no_positive(data):
    df = (
        data.groupby(['date_sample', 'positive'])
        .apply(lambda x: len(x))
        .to_frame(name='no_positive')
        .reset_index()
        .query('positive == True')
        .set_index('date_sample')
        [['no_positive']]
        .reindex(pd.date_range(start=data.date_sample.min(), end=data.date_sample.max(), freq='D'))
        .fillna(0)
    )
    return df

def get_ct_from_test_data(data_no_test, data_no_positive,
                          bins = [0, 0.02, 0.05, 0.20, 1.01], labels = [1, 2, 3, 4], right = False,
                          rolling = 7):
    df = data_no_test.join(data_no_positive, how='left')

    df['pct_positive'] = df.no_positive / df.no_test
    df['pct_positive'] = df.pct_positive.fillna(0)
    df['no_test_rollsum'] = df.no_test.rolling(rolling).sum()
    df['no_positive_rollsum'] = df.no_positive.rolling(rolling).sum()
    df[('pct_positive_per' + str(rolling) + 'd')] = df.no_positive_rollsum / df.no_test_rollsum
    df[('pct_positive_per' + str(rolling) + 'd')] = df[('pct_positive_per' + str(rolling) + 'd')].fillna(0)

    # calculate ct
    df['ct'] = pd.cut(
        df[('pct_positive_per' + str(rolling) + 'd')],
        bins = bins,
        labels = labels,
        right = right)
    
    return df
    
def get_no_test_by_group(data, group=[]):
    df_1 = (
        data.groupby(['date_sample'] + group)
        .size()
        .to_frame(name='no_test')
        .reset_index()
    )
    
    df_pv = df_1.pivot(
        index = 'date_sample',
        columns = group,
        values = 'no_test'
    ).fillna(0)
    
    df_2 = (
        df_pv.reindex(pd.date_range(
            start=df_pv.index.min(),
            end=df_pv.index.max(),
            freq='D'))
        .fillna(0)
        .stack(list(range(0, len(group))))
        .reset_index()
        .rename(columns={
            'level_0': 'date_sample',
            0: 'no_test'
        })
        .set_index(['date_sample'] + group)
    )
    
    return df_2

def get_no_positive_by_group(data, group=[]):
    df_1 = (
        data.groupby(['date_sample', 'positive'] + group)
        .apply(lambda x: len(x))
        .to_frame(name='no_positive')
        .reset_index()
        .query('positive == True')
        .drop(columns=['positive'])
    )
    
    df_pv = df_1.pivot(
        index = 'date_sample',
        columns = group,
        values = 'no_positive'
    ).fillna(0)
    
    df_2 = (
        df_pv
        .reindex(pd.date_range(
            start=df_pv.index.min(),
            end=df_pv.index.max(),
            freq='D'))
        .fillna(0)
        .stack(list(range(0, len(group))))
        .reset_index()
        .rename(columns={
            'level_0': 'date_sample',
            0: 'no_positive'
        })
        .set_index(['date_sample'] + group)
    )
    
    return df_2

def get_ct_by_group_from_test_data(data_no_test, data_no_positive,
                          bins = [0, 0.02, 0.05, 0.20, 1.01], labels = [1, 2, 3, 4], right = False,
                          rolling = 7, group = []):
    df = data_no_test.join(data_no_positive, how = 'left').reset_index()
    df['pct_positive'] = df.no_positive / df.no_test
    df['pct_positive'] = df['pct_positive'].fillna(0)

    df['no_test_rollsum'] = (
        df[['date_sample', 'no_test'] + group]
        .groupby(group)['no_test']
        .transform(lambda x: x.rolling(rolling).sum())
    )

    df['no_positive_rollsum'] = (
        df[['date_sample', 'no_positive'] + group]
        .groupby(group)['no_positive']
        .transform(lambda x: x.rolling(rolling).sum())
    )

    df[('pct_positive_per' +str(rolling) + 'd')] = df.no_positive_rollsum / df.no_test_rollsum
    df[('pct_positive_per' +str(rolling) + 'd')] = df[('pct_positive_per' +str(rolling) + 'd')].fillna(0)

    df['ct'] = pd.cut(
        df[('pct_positive_per' +str(rolling) + 'd')],
        bins = bins,
        labels = labels,
        right = right)
    
    return df

In [68]:
data_in_get_no_test = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    [['date_sample', 'id']]
)

data_in_get_no_positive = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    [['date_sample', 'positive']]
)

no_test = get_no_test(data_in_get_no_test)
no_positive = get_no_positive(data_in_get_no_positive)
ct = get_ct_from_test_data(no_test, no_positive)

In [74]:
# no_test
# ct

In [72]:
# Get ct by addr_ward_home from test data

data_in_get_no_test_by_group_awh = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'addr_dist_home', 'addr_ward_home']]
    [(df.addr_dist_home  != 'NAN') & (df.addr_prov_home  != 'NAN') & (df.addr_ward_home  != 'NAN')]
)

# Number of test by date_sample and addr_ward_home
no_test_by_awh = get_no_test_by_group(data_in_get_no_test_by_group_awh,
                                       group = ['addr_dist_home', 'addr_ward_home'])

data_in_get_no_positive_by_group_awh = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'addr_dist_home', 'addr_ward_home', 'positive']]
    [(df.addr_dist_home  != 'NAN') & (df.addr_prov_home  != 'NAN') & (df.addr_ward_home  != 'NAN')]
)

# Number of positive result by date_sample and addr_ward_home
no_positive_by_awh = get_no_positive_by_group(data_in_get_no_positive_by_group_awh,
                                               group = ['addr_dist_home', 'addr_ward_home'])

# CT by percentage of positive per 7d by addr_ward_home
ct_by_awh = get_ct_by_group_from_test_data(no_test_by_awh, no_positive_by_awh,
                                   group = ['addr_dist_home', 'addr_ward_home'])

  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH


In [60]:
# Get ct by addr_dist_home from test data

data_in_get_no_test_by_group_adh = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'addr_dist_home']]
    [(df.addr_dist_home  != 'NAN') & (df.addr_dist_home  != 'THANH PHO THU DUC') & (df.addr_prov_home  != 'NAN')]
)

# Number of test by date_sample and addr_dist_home
no_test_by_adh = get_no_test_by_group(data_in_get_no_test_by_group_adh,
                                       group = ['addr_dist_home'])

data_in_get_no_positive_by_group_adh = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'addr_dist_home', 'positive']]
    [(df.addr_dist_home  != 'NAN') & (df.addr_dist_home  != 'THANH PHO THU DUC') & (df.addr_prov_home  != 'NAN')]
)

# Number of positive result by date_sample and addr_dist_home
no_positive_by_adh = get_no_positive_by_group(data_in_get_no_positive_by_group_adh,
                                               group = ['addr_dist_home'])

# CT by percentage of positive per 7d by addr_dist_home
ct_by_adh = get_ct_by_group_from_test_data(no_test_by_adh, no_positive_by_adh,
                                   group = ['addr_dist_home'])

  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH


In [61]:
# Get ct by age_group from test data

data_in_get_no_test_by_group_ag = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'age_group']]
    [(df.age_group  != 'NAN')]
)

# Number of test by date_sample and age_group
no_test_by_ag = get_no_test_by_group(data_in_get_no_test_by_group_ag,
                                       group = ['age_group'])

data_in_get_no_positive_by_group_ag = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
    .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'age_group', 'positive']]
    [(df.age_group  != 'NAN')]
)

# Number of positive result by date_sample and age_group
no_positive_by_ag = get_no_positive_by_group(data_in_get_no_positive_by_group_ag,
                                               group = ['age_group'])

# CT by percentage of positive per 7d by age_group
ct_by_ag = get_ct_by_group_from_test_data(no_test_by_ag, no_positive_by_ag,
                                   group = ['age_group'])

  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH


In [62]:
# Get ct by sex from test data

data_in_get_no_test_by_group_sex = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
#     .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'sex']]
    [(df.sex  != 'NAN')]
)

# Number of test by date_sample and sex
no_test_by_sex = get_no_test_by_group(data_in_get_no_test_by_group_sex,
                                       group = ['sex'])

data_in_get_no_positive_by_group_sex = (
    df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
#     .query('addr_prov_home == "THANH PHO HO CHI MINH"')
    [['date_sample', 'sex', 'positive']]
    [(df.sex  != 'NAN')]
)

# Number of positive result by date_sample and sex
no_positive_by_sex = get_no_positive_by_group(data_in_get_no_positive_by_group_sex,
                                               group = ['sex'])

# CT by percentage of positive per 7d by sex
ct_by_sex = get_ct_by_group_from_test_data(no_test_by_sex, no_positive_by_sex,
                                   group = ['sex'])

  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH
  df[~df.reason.str.startswith('KIEM DICH')] # remove reason KIEM DICH


In [77]:
# no_adh.tail()
# no_test_by_awh.tail()
# no_positive_by_awh.tail()
# ct_by_awh.tail(20)
# ct_by_adh.addr_dist_home.unique()
# ct_by_ag[ct_by_ag.age_group == '> 65'].tail(10)
# ct_by_sex.tail()
df.columns

Index(['id', 'date_sample', 'sex', 'yob', 'reason', 'result', 'addr_prov_home',
       'addr_ward_home', 'ct_e', 'ct_n', 'ct_rdrp', 'age', 'age_group',
       'positive', 'addr_dist_home'],
      dtype='object')

In [81]:
df[['sex', 'ct_e', 'ct_n', 'ct_rdrp']].groupby('sex').describe().transpose()

Unnamed: 0,sex,NAM,NU
ct_e,count,57972.0,70360.0
ct_e,mean,43.478138,45.67899
ct_e,std,213.757582,434.649149
ct_e,min,0.0,0.0
ct_e,25%,19.0,18.0
ct_e,50%,24.85,24.0
ct_e,75%,30.76,30.28
ct_e,max,3786.0,44467.0
ct_n,count,9703.0,10327.0
ct_n,mean,34.542574,34.152738
