In [11]:
import os
import pandas as pd

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn

%load_ext autoreload
%autoreload 2

pd.set_option("display.max_column", 200)
pd.set_option("display.max_row", 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
df_shine = pd.read_csv('../pipeline/output/shine_v3_3_kt/df_all.csv')
print(df_shine.shape, df_shine.columns)

(15376, 26) Index(['patient_id', 'selfcheck_date', 'cough', 'fever', 'sore_throat',
       'shortness_of_breath', 'head_ache', 'runny_nose', 'muscle_pain',
       'chills', 'loss_of_taste', 'loss_of_smell', 'sputum', 'chest_pain',
       'indication_other', 'indication_abroad', 'indication_contact',
       'global_confirmed_ratio', 'confirmed_ratio', 'sigungu_confirmed_ratio',
       'mask', 'gender', 'age_ratio', 'weekday', 'pcr_result', 'split'],
      dtype='object')


In [13]:
selfcheck_col = 'selfcheck_date'
print(df_shine[selfcheck_col].min(), df_shine[selfcheck_col].max())

2021-10-21 2023-03-09


## Load owid data

In [14]:
df_owid = pd.read_csv('./opendataset/owid-data-new.csv')
df_owid.shape

(291946, 67)

In [15]:
date_col_on_owid = 'date'
df_owid[date_col_on_owid].min(), df_owid[date_col_on_owid].max()

('2020-01-01', '2023-03-06')

In [16]:
# Filter same date of shine data
df_owid = df_owid[df_owid[date_col_on_owid] < df_shine[selfcheck_col].max()]
df_owid.shape

(291946, 67)

## Add special features

In [22]:
df_owid.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [25]:
from datetime import timedelta

df_owid['date_index'] = df_owid['date'] + "__" + df_owid['iso_code']
df_owid['date_index'].sample(10, random_state=1212)

145355    2020-01-23__LTU
76962     2022-05-14__ERI
89023     2020-08-05__GUF
189652    2022-05-19__MKD
197413    2022-12-08__PAK
266355    2022-05-05__TUR
264128    2022-07-23__TTO
693       2021-11-26__AFG
51520     2022-01-07__CHN
221938    2020-07-25__SPM
Name: date_index, dtype: object

In [56]:
df_owid['date_index_6months_ago'] = (pd.to_datetime(df_owid['date']) - timedelta(days=182)).dt.strftime("%Y-%m-%d")
df_owid['date_index_6months_ago'] = df_owid['date_index_6months_ago'] + "__" + df_owid['iso_code']
df_owid['date_index_6months_ago'].sample(10, random_state=1212)

145355    2019-07-25__LTU
76962     2021-11-13__ERI
89023     2020-02-05__GUF
189652    2021-11-18__MKD
197413    2022-06-09__PAK
266355    2021-11-04__TUR
264128    2022-01-22__TTO
693       2021-05-28__AFG
51520     2021-07-09__CHN
221938    2020-01-25__SPM
Name: date_index_6months_ago, dtype: object

In [None]:
def get_6months_ago_values(df, target_col):
    return pd.merge(
        df[['date_index', 'date_index_6months_ago', target_col]], 
        df[['date_index', target_col]],
        how='left',
        left_on='date_index_6months_ago', 
        right_on='date_index',
        suffixes=("", '_6months_ago'),
    )[f'{target_col}_6months_ago']


### 6개월 동안 확진자 수

In [64]:

df_owid['total_cases_per_million_6months_ago'] = get_6months_ago_values(df_owid, 'total_cases_per_million')
df_owid['total_cases_per_million_6months_ago'].sample(10, random_state=1212)

145355           NaN
76962       1910.674
89023            NaN
189652    100753.437
197413      6491.589
266355     95161.791
264128     67930.816
693         1680.818
51520         83.556
221938           NaN
Name: total_cases_per_million_6months_ago, dtype: float64

In [65]:
df_owid['total_cases_per_million_for_6months'] = df_owid['total_cases_per_million'] - df_owid['total_cases_per_million_6months_ago']
df_owid[
    [
        'date_index',
        'date_index_6months_ago',
        'total_cases_per_million', 
        'total_cases_per_million_6months_ago', 
        'total_cases_per_million_for_6months'
    ]
].sample(10, random_state=1212)

Unnamed: 0,date_index,date_index_6months_ago,total_cases_per_million,total_cases_per_million_6months_ago,total_cases_per_million_for_6months
145355,2020-01-23__LTU,2019-07-25__LTU,,,
76962,2022-05-14__ERI,2021-11-13__ERI,2645.193,1910.674,734.519
89023,2020-08-05__GUF,2020-02-05__GUF,26260.146,,
189652,2022-05-19__MKD,2021-11-18__MKD,148601.504,100753.437,47848.067
197413,2022-12-08__PAK,2022-06-09__PAK,6680.305,6491.589,188.716
266355,2022-05-05__TUR,2021-11-04__TUR,176201.337,95161.791,81039.546
264128,2022-07-23__TTO,2022-01-22__TTO,110908.054,67930.816,42977.238
693,2021-11-26__AFG,2021-05-28__AFG,3820.78,1680.818,2139.962
51520,2022-01-07__CHN,2021-07-09__CHN,93.559,83.556,10.003
221938,2020-07-25__SPM,2020-01-25__SPM,679.694,,


### 6개월 동안 백명당 백신접종자 수

In [66]:

df_owid['total_vaccinations_per_hundred_6months_ago'] = get_6months_ago_values(df_owid, 'total_vaccinations_per_hundred')
df_owid['total_vaccinations_per_hundred_6months_ago'].sample(10, random_state=1212)

145355       NaN
76962        NaN
89023        NaN
189652       NaN
197413       NaN
266355    137.11
264128     95.77
693          NaN
51520      95.76
221938       NaN
Name: total_vaccinations_per_hundred_6months_ago, dtype: float64

In [67]:
df_owid['total_vaccinations_per_hundred_for_6months'] = df_owid['total_vaccinations_per_hundred'] - df_owid['total_vaccinations_per_hundred_6months_ago']
df_owid[
    [
        'date_index',
        'date_index_6months_ago',
        'total_vaccinations_per_hundred', 
        'total_vaccinations_per_hundred_6months_ago', 
        'total_vaccinations_per_hundred_for_6months'
    ]
].sample(10, random_state=1212)

Unnamed: 0,date_index,date_index_6months_ago,total_vaccinations_per_hundred,total_vaccinations_per_hundred_6months_ago,total_vaccinations_per_hundred_for_6months
145355,2020-01-23__LTU,2019-07-25__LTU,,,
76962,2022-05-14__ERI,2021-11-13__ERI,,,
89023,2020-08-05__GUF,2020-02-05__GUF,,,
189652,2022-05-19__MKD,2021-11-18__MKD,,,
197413,2022-12-08__PAK,2022-06-09__PAK,,,
266355,2022-05-05__TUR,2021-11-04__TUR,172.86,137.11,35.75
264128,2022-07-23__TTO,2022-01-22__TTO,102.97,95.77,7.2
693,2021-11-26__AFG,2021-05-28__AFG,,,
51520,2022-01-07__CHN,2021-07-09__CHN,202.52,95.76,106.76
221938,2020-07-25__SPM,2020-01-25__SPM,,,


### 6개월동안 사망자 수

In [68]:
df_owid['total_deaths_per_million_6months_ago'] = get_6months_ago_values(df_owid, 'total_deaths_per_million')
df_owid['total_deaths_per_million_6months_ago'].sample(10, random_state=1212)

145355         NaN
76962       13.029
89023          NaN
189652    3532.183
197413     128.820
266355     833.091
264128    2126.655
693         70.048
51520        3.904
221938         NaN
Name: total_deaths_per_million_6months_ago, dtype: float64

In [69]:
df_owid['total_deaths_per_hundred_for_6months'] = df_owid['total_vaccinations_per_hundred'] - df_owid['total_vaccinations_per_hundred_6months_ago']
df_owid[
    [
        'date_index',
        'date_index_6months_ago',
        'total_deaths_per_million', 
        'total_deaths_per_million_6months_ago', 
        'total_deaths_per_hundred_for_6months'
    ]
].sample(10, random_state=1212)

Unnamed: 0,date_index,date_index_6months_ago,total_deaths_per_million,total_deaths_per_million_6months_ago,total_deaths_per_hundred_for_6months
145355,2020-01-23__LTU,2019-07-25__LTU,,,
76962,2022-05-14__ERI,2021-11-13__ERI,27.958,13.029,
89023,2020-08-05__GUF,2020-02-05__GUF,147.75,,
189652,2022-05-19__MKD,2021-11-18__MKD,4442.097,3532.183,
197413,2022-12-08__PAK,2022-06-09__PAK,129.901,128.82,
266355,2022-05-05__TUR,2021-11-04__TUR,1158.162,833.091,35.75
264128,2022-07-23__TTO,2022-01-22__TTO,2639.377,2126.655,7.2
693,2021-11-26__AFG,2021-05-28__AFG,177.662,70.048,
51520,2022-01-07__CHN,2021-07-09__CHN,3.997,3.904,106.76
221938,2020-07-25__SPM,2020-01-25__SPM,,,


# 한국 수치 확인

In [72]:
df_kr_tmp = df_owid[df_owid['iso_code'] == 'KOR']
df_kr_tmp[[
    'date_index', 
    'total_vaccinations_per_hundred', 
    'total_vaccinations_per_hundred_6months_ago', 
    'total_vaccinations_per_hundred_for_6months',
            'total_deaths_per_million', 
        'total_deaths_per_million_6months_ago', 
        'total_deaths_per_hundred_for_6months',
            'total_vaccinations_per_hundred', 
        'total_vaccinations_per_hundred_6months_ago', 
        'total_vaccinations_per_hundred_for_6months',
]].sample(10, random_state=1).sort_values('date_index')

Unnamed: 0,date_index,total_vaccinations_per_hundred,total_vaccinations_per_hundred_6months_ago,total_vaccinations_per_hundred_for_6months,total_deaths_per_million,total_deaths_per_million_6months_ago,total_deaths_per_hundred_for_6months,total_vaccinations_per_hundred.1,total_vaccinations_per_hundred_6months_ago.1,total_vaccinations_per_hundred_for_6months.1
243669,2020-03-24__KOR,,,,2.316,,,,,
243790,2020-07-23__KOR,,,,5.732,,,,,
243808,2020-08-10__KOR,,,,5.886,,,,,
244025,2021-03-15__KOR,1.26,,,32.326,7.006,,1.26,,
244141,2021-07-09__KOR,39.69,,,39.293,20.862,,39.69,,
244350,2022-02-03__KOR,221.97,52.68,169.29,131.466,40.702,169.29,221.97,52.68,169.29
244360,2022-02-13__KOR,225.91,60.59,165.32,136.657,41.609,165.32,225.91,60.59,165.32
244418,2022-04-12__KOR,234.2,135.62,98.58,383.088,50.062,98.58,234.2,135.62,98.58
244518,2022-07-21__KOR,244.46,215.19,29.27,478.503,125.058,29.27,244.46,215.19,29.27
244560,2022-09-01__KOR,249.04,230.69,18.35,518.683,161.997,18.35,249.04,230.69,18.35


## Preprocess owid data

In [73]:
per_pop_cols = [x for x in df_owid.columns if "per_million" in x or 'per_hundred' in x or 'per_thousand' in x]
print(per_pop_cols)
df_owid_target = df_owid[['iso_code', 'date'] + per_pop_cols]
df_owid_target.shape

['total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'icu_patients_per_million', 'hosp_patients_per_million', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions_per_million', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed_per_thousand', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed_per_hundred', 'hospital_beds_per_thousand', 'excess_mortality_cumulative_per_million', 'total_cases_per_million_6months_ago', 'total_vaccinations_per_hundred_6months_ago', 'total_cases_for_6months_per_million', 'total_vaccinations_for_6months_per_hundred', 'total_deaths_per_million_6months_ago', 'total_cases_per_million_for_6months', 'total_vaccinations_per_hundred_for_6months', 'total_death

(291946, 31)

In [74]:
df_owid_nm = df_owid_target[['iso_code', 'date']].copy()
dict_max_vals = dict()
for col in per_pop_cols:
    norm_col = col+'_norm'
    max_val = df_owid_target[col].max()        
    df_owid_nm[norm_col] = df_owid_target[col] / max_val
    print(f"max value of {col}: {max_val}")
    dict_max_vals[col] = max_val
    
    col_mask = col + '_mask'
    df_owid_nm[col_mask] = df_owid_nm[norm_col].isnull().astype(int)
    df_owid_nm[norm_col] = df_owid_nm[norm_col].fillna(0)
    
df_owid_nm.sample(10)

max value of total_cases_per_million: 724428.492
max value of new_cases_per_million: 228872.025
max value of new_cases_smoothed_per_million: 37241.781
max value of total_deaths_per_million: 6444.395
max value of new_deaths_per_million: 603.656
max value of new_deaths_smoothed_per_million: 148.641
max value of icu_patients_per_million: 180.675
max value of hosp_patients_per_million: 1526.846
max value of weekly_icu_admissions_per_million: 222.9
max value of weekly_hosp_admissions_per_million: 706.023
max value of total_tests_per_thousand: 32925.826
max value of new_tests_per_thousand: 531.062
max value of new_tests_smoothed_per_thousand: 147.603
max value of total_vaccinations_per_hundred: 406.43
max value of people_vaccinated_per_hundred: 129.07
max value of people_fully_vaccinated_per_hundred: 126.89
max value of total_boosters_per_hundred: 150.47
max value of new_vaccinations_smoothed_per_million: 117113.0
max value of new_people_vaccinated_smoothed_per_hundred: 11.711
max value of h

Unnamed: 0,iso_code,date,total_cases_per_million_norm,total_cases_per_million_mask,new_cases_per_million_norm,new_cases_per_million_mask,new_cases_smoothed_per_million_norm,new_cases_smoothed_per_million_mask,total_deaths_per_million_norm,total_deaths_per_million_mask,new_deaths_per_million_norm,new_deaths_per_million_mask,new_deaths_smoothed_per_million_norm,new_deaths_smoothed_per_million_mask,icu_patients_per_million_norm,icu_patients_per_million_mask,hosp_patients_per_million_norm,hosp_patients_per_million_mask,weekly_icu_admissions_per_million_norm,weekly_icu_admissions_per_million_mask,weekly_hosp_admissions_per_million_norm,weekly_hosp_admissions_per_million_mask,total_tests_per_thousand_norm,total_tests_per_thousand_mask,new_tests_per_thousand_norm,new_tests_per_thousand_mask,new_tests_smoothed_per_thousand_norm,new_tests_smoothed_per_thousand_mask,total_vaccinations_per_hundred_norm,total_vaccinations_per_hundred_mask,people_vaccinated_per_hundred_norm,people_vaccinated_per_hundred_mask,people_fully_vaccinated_per_hundred_norm,people_fully_vaccinated_per_hundred_mask,total_boosters_per_hundred_norm,total_boosters_per_hundred_mask,new_vaccinations_smoothed_per_million_norm,new_vaccinations_smoothed_per_million_mask,new_people_vaccinated_smoothed_per_hundred_norm,new_people_vaccinated_smoothed_per_hundred_mask,hospital_beds_per_thousand_norm,hospital_beds_per_thousand_mask,excess_mortality_cumulative_per_million_norm,excess_mortality_cumulative_per_million_mask,total_cases_per_million_6months_ago_norm,total_cases_per_million_6months_ago_mask,total_vaccinations_per_hundred_6months_ago_norm,total_vaccinations_per_hundred_6months_ago_mask,total_cases_for_6months_per_million_norm,total_cases_for_6months_per_million_mask,total_vaccinations_for_6months_per_hundred_norm,total_vaccinations_for_6months_per_hundred_mask,total_deaths_per_million_6months_ago_norm,total_deaths_per_million_6months_ago_mask,total_cases_per_million_for_6months_norm,total_cases_per_million_for_6months_mask,total_vaccinations_per_hundred_for_6months_norm,total_vaccinations_per_hundred_for_6months_mask,total_deaths_per_hundred_for_6months_norm,total_deaths_per_hundred_for_6months_mask
176355,NRU,2020-10-01,0.0,1,0.0,0,0.0,0,0.0,1,0.0,0,0.0,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.362319,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1
162238,MUS,2023-02-25,0.313838,0,0.0,0,0.0,0,0.124666,0,0.0,0,0.0,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.000102,0,0.0,0,0.246377,0,0.0,1,0.304226,0,0.0,1,0.045535,0,0.0,1,0.124219,0,0.045535,0,0.0,1,0.0,1
141752,LSO,2022-10-28,0.020648,0,0.0,0,0.0,0,0.047511,0,0.0,0,0.0,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.004722,0,0.002135,0,0.0,1,0.0,1,0.022023,0,0.0,1,0.000928,0,0.0,1,0.047697,0,0.000928,0,0.0,1,0.0,1
250907,SWE,2021-02-02,0.075229,0,0.000482,0,0.007511,0,0.17988,0,0.006124,0,0.035441,0,0.125917,0,0.108771,0,0.0,1,0.0,1,0.014925,0,0.005223,0,0.018502,0,0.0,1,0.0,1,0.0,1,0.0,1,0.010887,0,0.006319,0,0.16087,0,0.0,1,0.011228,0,0.0,1,0.074453,0,0.0,1,0.085646,0,0.074453,0,0.0,1,0.0,1
113061,HKG,2020-02-19,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.00084,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1
24981,BEL,2022-01-09,0.268635,0,0.008924,0,0.046866,0,0.379842,0,0.0027,0,0.010805,0,0.209885,0,0.10603,0,0.0,1,0.156148,0,0.073328,0,0.007449,0,0.05025,0,0.477352,0,0.596421,0,0.598235,0,0.2978,0,0.092287,0,0.003159,0,0.408696,0,0.170818,0,0.14313,0,0.28657,0,0.159753,0,0.374719,0,0.343131,0,0.159753,0,0.374719,0,0.374719,0
44496,CAN,2021-09-29,0.057879,0,0.000486,0,0.003038,0,0.110659,0,0.001981,0,0.006573,0,0.109822,0,0.042324,0,0.0,1,0.0,1,0.034399,0,0.004476,0,0.019126,0,0.360062,0,0.586194,0,0.553235,0,0.00319,0,0.022952,0,0.00871,0,0.181159,0,0.0,1,0.038728,0,0.040714,0,0.026295,0,0.557745,0,0.093145,0,0.026295,0,0.557745,0,0.557745,0
36037,VGB,2020-09-12,0.00282,0,0.000139,0,0.000122,0,0.004953,0,0.0,0,0.0,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1
234938,SGP,2021-08-05,0.016018,0,7.4e-05,0,0.000559,0,0.001074,0,0.000293,0,0.000343,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.347957,0,0.603161,0,0.495705,0,0.0,1,0.104608,0,0.014687,0,0.173913,0,0.0,1,0.016205,0,0.009887,0,0.001626,0,0.585185,0,0.0,1,0.001626,0,0.585185,0,0.585185,0
254394,TWN,2021-03-06,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.000226,0,2.8e-05,0,0.000122,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1


In [75]:
df_kr = df_owid_nm[df_owid_nm['iso_code'] == 'KOR']
df_kr = df_kr.sort_values('date')
print(df_kr.shape)
df_kr.sample(10)

(1159, 60)


Unnamed: 0,iso_code,date,total_cases_per_million_norm,total_cases_per_million_mask,new_cases_per_million_norm,new_cases_per_million_mask,new_cases_smoothed_per_million_norm,new_cases_smoothed_per_million_mask,total_deaths_per_million_norm,total_deaths_per_million_mask,new_deaths_per_million_norm,new_deaths_per_million_mask,new_deaths_smoothed_per_million_norm,new_deaths_smoothed_per_million_mask,icu_patients_per_million_norm,icu_patients_per_million_mask,hosp_patients_per_million_norm,hosp_patients_per_million_mask,weekly_icu_admissions_per_million_norm,weekly_icu_admissions_per_million_mask,weekly_hosp_admissions_per_million_norm,weekly_hosp_admissions_per_million_mask,total_tests_per_thousand_norm,total_tests_per_thousand_mask,new_tests_per_thousand_norm,new_tests_per_thousand_mask,new_tests_smoothed_per_thousand_norm,new_tests_smoothed_per_thousand_mask,total_vaccinations_per_hundred_norm,total_vaccinations_per_hundred_mask,people_vaccinated_per_hundred_norm,people_vaccinated_per_hundred_mask,people_fully_vaccinated_per_hundred_norm,people_fully_vaccinated_per_hundred_mask,total_boosters_per_hundred_norm,total_boosters_per_hundred_mask,new_vaccinations_smoothed_per_million_norm,new_vaccinations_smoothed_per_million_mask,new_people_vaccinated_smoothed_per_hundred_norm,new_people_vaccinated_smoothed_per_hundred_mask,hospital_beds_per_thousand_norm,hospital_beds_per_thousand_mask,excess_mortality_cumulative_per_million_norm,excess_mortality_cumulative_per_million_mask,total_cases_per_million_6months_ago_norm,total_cases_per_million_6months_ago_mask,total_vaccinations_per_hundred_6months_ago_norm,total_vaccinations_per_hundred_6months_ago_mask,total_cases_for_6months_per_million_norm,total_cases_for_6months_per_million_mask,total_vaccinations_for_6months_per_hundred_norm,total_vaccinations_for_6months_per_hundred_mask,total_deaths_per_million_6months_ago_norm,total_deaths_per_million_6months_ago_mask,total_cases_per_million_for_6months_norm,total_cases_per_million_for_6months_mask,total_vaccinations_per_hundred_for_6months_norm,total_vaccinations_per_hundred_for_6months_mask,total_deaths_per_hundred_for_6months_norm,total_deaths_per_hundred_for_6months_mask
243837,KOR,2020-09-08,0.000571,0,1.146929e-05,0,9.3e-05,0,0.001021,0,0.000159,0,0.000316,0,0.016023,0,0.0,1,0.0,1,0.0,1,0.00121,0,0.000537,0,0.001999,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.88913,0,0.0,1,0.000222,0,0.0,1,0.000424,0,0.0,1,0.000164,0,0.000424,0,0.0,1,0.0,1
244710,KOR,2023-01-29,0.8032,0,0.001591256,0,0.01111,0,0.099994,0,0.000928,0,0.003357,0,0.044865,0,0.0,1,0.0,1,0.017413,0,0.0,1,0.0,1,0.0,1,0.0,1,0.670256,0,0.675467,0,0.0,1,0.0,1,0.0,0,0.88913,0,0.0,1,0.584935,0,0.665592,0,0.315982,0,0.0,1,0.076274,0,0.315982,0,0.0,1,0.0,1
244558,KOR,2022-08-30,0.616526,0,0.009747377,0,0.051313,0,0.079926,0,0.00227,0,0.009795,0,0.06313,0,0.043279,0,0.051691,0,0.06596,0,0.0,1,0.0,1,0.0,1,0.612529,0,0.669792,0,0.674836,0,0.522297,0,0.005038,0,8.5e-05,0,0.88913,0,0.0,1,0.096822,0,0.624038,0,0.605216,0,0.078919,0,0.024879,0,0.605216,0,0.078919,0,0.078919,0
244661,KOR,2022-12-11,0.7387,0,0.004580328,0,0.031122,0,0.093043,0,0.003005,0,0.006304,0,0.047002,0,0.0,1,0.0,1,0.036711,0,0.0,1,0.0,1,0.0,1,0.615629,0,0.670101,0,0.675309,0,0.530006,0,0.000299,0,0.0,0,0.88913,0,0.0,1,0.539071,0,0.657709,0,0.289465,0,0.031474,0,0.074215,0,0.289465,0,0.031474,0,0.031474,0
244202,KOR,2021-09-08,0.007071,0,0.0001726948,0,0.000887,0,0.00699,0,0.000128,0,0.00078,0,0.041339,0,0.0,1,0.0,1,0.0,1,0.015246,0,0.005491,0,0.020731,0,0.237212,0,0.473464,0,0.290724,0,0.0,0,0.118057,0,0.053027,0,0.88913,0,0.0,1,0.002772,0,0.002844,0,0.00523,0,0.405046,0,0.005019,0,0.00523,0,0.405046,0,0.405046,0
244506,KOR,2022-07-09,0.492621,0,0.001708632,0,0.008283,0,0.073742,0,0.000608,0,0.00115,0,0.006514,0,0.0,1,0.0,1,0.021158,0,0.0,1,0.0,1,0.0,1,0.59907,0,0.669482,0,0.674521,0,0.486542,0,0.002425,0,8.5e-05,0,0.88913,0,0.0,1,0.019551,0,0.560272,0,0.54312,0,0.155673,0,0.018229,0,0.54312,0,0.155673,0,0.155673,0
243698,KOR,2020-04-22,0.000285,0,9.262818e-07,0,8e-06,0,0.000713,0,3.1e-05,0,0.000242,0,0.005557,0,0.0,1,0.0,1,0.0,1,0.000338,0,0.000252,0,0.000813,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.88913,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1
244736,KOR,2023-02-24,0.811996,0,0.0008475304,0,0.005263,0,0.101608,0,0.000639,0,0.001796,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.670334,0,0.675546,0,0.0,1,0.0,1,0.0,0,0.88913,0,0.0,1,0.674465,0,0.673963,0,0.233837,0,0.0,1,0.080433,0,0.233837,0,0.0,1,0.0,1
243853,KOR,2020-09-24,0.000622,0,1.053864e-05,0,5.1e-05,0,0.001177,0,0.000159,0,0.00039,0,0.013461,0,0.0,1,0.0,1,0.0,1,0.001329,0,0.000439,0,0.001443,0,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.0,1,0.88913,0,0.0,1,0.000273,0,0.0,1,0.000429,0,0.0,1,0.000399,0,0.000429,0,0.0,1,0.0,1
244351,KOR,2022-02-04,0.0249,0,0.002313564,0,0.010443,0,0.020472,0,0.000767,0,0.002933,0,0.027453,0,0.0,1,0.0,1,0.216166,0,0.030847,0,0.007063,0,0.031334,0,0.548089,0,0.664523,0,0.667271,0,0.359274,0,0.029168,0,0.000854,0,0.88913,0,0.0,1,0.006135,0,0.144869,0,0.022152,0,0.719025,0,0.006435,0,0.022152,0,0.719025,0,0.719025,0


In [76]:
df_kr['date'].nunique()

1159

In [77]:
df_kr['date'].min(), df_kr['date'].max()

('2020-01-03', '2023-03-06')

In [78]:
df_new = pd.merge(df_shine, df_kr, how='left', left_on=selfcheck_col, right_on='date')
df_new.shape

(15376, 86)

In [79]:
df_new.sample(10)

Unnamed: 0,patient_id,selfcheck_date,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday,pcr_result,split,iso_code,date,total_cases_per_million_norm,total_cases_per_million_mask,new_cases_per_million_norm,new_cases_per_million_mask,new_cases_smoothed_per_million_norm,new_cases_smoothed_per_million_mask,total_deaths_per_million_norm,total_deaths_per_million_mask,new_deaths_per_million_norm,new_deaths_per_million_mask,new_deaths_smoothed_per_million_norm,new_deaths_smoothed_per_million_mask,icu_patients_per_million_norm,icu_patients_per_million_mask,hosp_patients_per_million_norm,hosp_patients_per_million_mask,weekly_icu_admissions_per_million_norm,weekly_icu_admissions_per_million_mask,weekly_hosp_admissions_per_million_norm,weekly_hosp_admissions_per_million_mask,total_tests_per_thousand_norm,total_tests_per_thousand_mask,new_tests_per_thousand_norm,new_tests_per_thousand_mask,new_tests_smoothed_per_thousand_norm,new_tests_smoothed_per_thousand_mask,total_vaccinations_per_hundred_norm,total_vaccinations_per_hundred_mask,people_vaccinated_per_hundred_norm,people_vaccinated_per_hundred_mask,people_fully_vaccinated_per_hundred_norm,people_fully_vaccinated_per_hundred_mask,total_boosters_per_hundred_norm,total_boosters_per_hundred_mask,new_vaccinations_smoothed_per_million_norm,new_vaccinations_smoothed_per_million_mask,new_people_vaccinated_smoothed_per_hundred_norm,new_people_vaccinated_smoothed_per_hundred_mask,hospital_beds_per_thousand_norm,hospital_beds_per_thousand_mask,excess_mortality_cumulative_per_million_norm,excess_mortality_cumulative_per_million_mask,total_cases_per_million_6months_ago_norm,total_cases_per_million_6months_ago_mask,total_vaccinations_per_hundred_6months_ago_norm,total_vaccinations_per_hundred_6months_ago_mask,total_cases_for_6months_per_million_norm,total_cases_for_6months_per_million_mask,total_vaccinations_for_6months_per_hundred_norm,total_vaccinations_for_6months_per_hundred_mask,total_deaths_per_million_6months_ago_norm,total_deaths_per_million_6months_ago_mask,total_cases_per_million_for_6months_norm,total_cases_per_million_for_6months_mask,total_vaccinations_per_hundred_for_6months_norm,total_vaccinations_per_hundred_for_6months_mask,total_deaths_per_hundred_for_6months_norm,total_deaths_per_hundred_for_6months_mask
11713,7484,2022-08-16,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.155895,0.192499,0.204133,1,0.0,0.33,5,0,VALIDATE,KOR,2022-08-16,0.572824,0.0,0.007081,0.0,0.059804,0.0,0.076994,0.0,0.001183,0.0,0.00701,0.0,0.060136,0.0,0.048803,0.0,0.038964,0.0,0.102533,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.609921,0.0,0.669714,0.0,0.674758,0.0,0.515385,0.0,0.008112,0.0,8.5e-05,0.0,0.88913,0.0,0.0,1.0,0.043255,0.0,0.613934,0.0,0.610413,0.0,0.09026,0.0,0.021813,0.0,0.610413,0.0,0.09026,0.0,0.09026,0.0
13539,38735,2022-03-25,1,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0.436953,0.546253,0.110339,1,0.0,0.18,3,1,TEST,KOR,2022-03-25,0.297367,0.0,0.028629,0.0,0.185418,0.0,0.042807,0.0,0.012532,0.0,0.046595,0.0,0.115899,0.0,0.0,1.0,0.0,1.0,0.341414,0.0,0.049505,0.0,0.021472,0.0,0.07281,0.0,0.573826,0.0,0.667855,0.0,0.672315,0.0,0.421612,0.0,0.007719,0.0,0.000256,0.0,0.88913,0.0,0.0,1.0,0.008729,0.0,0.312195,0.0,0.331015,0.0,0.501083,0.0,0.007433,0.0,0.331015,0.0,0.501083,0.0,0.501083,0.0
2900,41226,2022-03-20,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.5483,1.0,0.081375,1,1.0,0.35,2,0,TRAIN,KOR,2022-03-20,0.249719,0.0,0.028224,0.0,0.208559,0.0,0.037218,0.0,0.010455,0.0,0.037708,0.0,0.110342,0.0,0.0,1.0,0.0,1.0,0.347646,0.0,0.047791,0.0,0.013814,0.0,0.085161,0.0,0.572448,0.0,0.6677,0.0,0.672157,0.0,0.418156,0.0,0.009017,0.0,0.000427,0.0,0.88913,0.0,0.040512,0.0,0.008457,0.0,0.301387,0.0,0.276815,0.0,0.515652,0.0,0.007321,0.0,0.276815,0.0,0.515652,0.0,0.515652,0.0
8971,29434,2023-01-09,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.085469,0.096635,0.303237,1,0.0,0.31,0,0,TRAIN,KOR,2023-01-09,0.786952,0.0,0.001611,0.0,0.029625,0.0,0.097703,0.0,0.001118,0.0,0.006546,0.0,0.056826,0.0,0.0,1.0,0.0,1.0,0.030724,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.670179,0.0,0.675388,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.88913,0.0,0.0,1.0,0.547916,0.0,0.659633,0.0,0.335527,0.0,0.0,1.0,0.075098,0.0,0.335527,0.0,0.0,1.0,0.0,1.0
1198,34968,2022-03-11,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.43475,0.429472,0.03793,1,1.0,0.64,3,1,TRAIN,KOR,2022-03-11,0.155117,0.0,0.023861,0.0,0.138014,0.0,0.029573,0.0,0.007322,0.0,0.024018,0.0,0.119209,0.0,0.0,1.0,0.0,1.0,0.35582,0.0,0.044127,0.0,0.041937,0.0,0.107674,0.0,0.570381,0.0,0.667312,0.0,0.671763,0.0,0.413239,0.0,0.010716,0.0,0.000769,0.0,0.88913,0.0,0.0,1.0,0.007967,0.0,0.270479,0.0,0.169154,0.0,0.560549,0.0,0.00715,0.0,0.169154,0.0,0.560549,0.0,0.560549,0.0
4203,47311,2022-03-29,1,1,1,0,1,1,1,1,1,1,1,0,1,0,0,0.436953,0.546253,0.109542,1,1.0,0.31,3,1,TRAIN,KOR,2022-03-29,0.329018,0.0,0.029297,0.0,0.178705,0.0,0.046187,0.0,0.007577,0.0,0.04233,0.0,0.12978,0.0,0.0,1.0,0.0,1.0,0.342562,0.0,0.050615,0.0,0.025334,0.0,0.067451,0.0,0.574392,0.0,0.667855,0.0,0.672472,0.0,0.423008,0.0,0.007232,0.0,0.000256,0.0,0.88913,0.0,0.0,1.0,0.009046,0.0,0.326065,0.0,0.366879,0.0,0.480313,0.0,0.007503,0.0,0.366879,0.0,0.480313,0.0,0.480313,0.0
13310,42034,2022-03-21,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0.5483,1.0,0.066961,1,0.0,0.35,2,1,TEST,KOR,2022-03-21,0.255284,0.0,0.017616,0.0,0.201092,0.0,0.038204,0.0,0.010518,0.0,0.040103,0.0,0.120703,0.0,0.287115,0.0,0.084419,0.0,0.343792,0.0,0.048019,0.0,0.014147,0.0,0.081889,0.0,0.572719,0.0,0.6677,0.0,0.672236,0.0,0.418887,0.0,0.008846,0.0,0.000342,0.0,0.88913,0.0,0.0,1.0,0.008505,0.0,0.301631,0.0,0.28313,0.0,0.515737,0.0,0.007336,0.0,0.28313,0.0,0.515737,0.0,0.515737,0.0
4609,33059,2022-03-31,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0.430921,0.539948,0.130209,1,1.0,0.34,4,1,TRAIN,KOR,2022-03-31,0.348872,0.0,0.027039,0.0,0.168258,0.0,0.048604,0.0,0.011989,0.0,0.043178,0.0,0.140462,0.0,0.0,1.0,0.0,1.0,0.332831,0.0,0.051141,0.0,0.015011,0.0,0.063183,0.0,0.574785,0.0,0.66801,0.0,0.672551,0.0,0.423872,0.0,0.006788,0.0,0.000342,0.0,0.88913,0.0,0.0,1.0,0.009207,0.0,0.334652,0.0,0.389413,0.0,0.467528,0.0,0.007555,0.0,0.389413,0.0,0.467528,0.0,0.467528,0.0
12792,24119,2022-03-13,1,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0.458205,0.455445,0.062425,1,1.0,0.32,3,0,TEST,KOR,2022-03-13,0.174667,0.0,0.029527,0.0,0.155476,0.0,0.03113,0.0,0.008024,0.0,0.026675,0.0,0.11472,0.0,0.0,1.0,0.0,1.0,0.364212,0.0,0.045118,0.0,0.025583,0.0,0.113114,0.0,0.570627,0.0,0.66739,0.0,0.671842,0.0,0.41377,0.0,0.010323,0.0,0.000769,0.0,0.88913,0.0,0.032569,0.0,0.008074,0.0,0.273513,0.0,0.191396,0.0,0.556216,0.0,0.007184,0.0,0.191396,0.0,0.556216,0.0,0.556216,0.0
5428,23531,2022-04-07,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.313069,0.330409,0.139433,1,1.0,0.42,3,0,TRAIN,KOR,2022-04-07,0.393703,0.0,0.018954,0.0,0.124579,0.0,0.055046,0.0,0.011126,0.0,0.039895,0.0,0.119209,0.0,0.222159,0.0,0.0,1.0,0.294754,0.0,0.052651,0.0,0.011227,0.0,0.048116,0.0,0.575745,0.0,0.668397,0.0,0.672787,0.0,0.425932,0.0,0.00473,0.0,0.000598,0.0,0.88913,0.0,0.0,1.0,0.009636,0.0,0.355591,0.0,0.44023,0.0,0.436351,0.0,0.007747,0.0,0.44023,0.0,0.436351,0.0,0.436351,0.0


In [80]:
save_filepath = '../pipeline/output/shine_v3_3_kt/df_all_added_owid.csv'
df_new.to_csv(save_filepath, index=False)


In [81]:
df_load = pd.read_csv(save_filepath)
df_load.sample(10)

Unnamed: 0,patient_id,selfcheck_date,cough,fever,sore_throat,shortness_of_breath,head_ache,runny_nose,muscle_pain,chills,loss_of_taste,loss_of_smell,sputum,chest_pain,indication_other,indication_abroad,indication_contact,global_confirmed_ratio,confirmed_ratio,sigungu_confirmed_ratio,mask,gender,age_ratio,weekday,pcr_result,split,iso_code,date,total_cases_per_million_norm,total_cases_per_million_mask,new_cases_per_million_norm,new_cases_per_million_mask,new_cases_smoothed_per_million_norm,new_cases_smoothed_per_million_mask,total_deaths_per_million_norm,total_deaths_per_million_mask,new_deaths_per_million_norm,new_deaths_per_million_mask,new_deaths_smoothed_per_million_norm,new_deaths_smoothed_per_million_mask,icu_patients_per_million_norm,icu_patients_per_million_mask,hosp_patients_per_million_norm,hosp_patients_per_million_mask,weekly_icu_admissions_per_million_norm,weekly_icu_admissions_per_million_mask,weekly_hosp_admissions_per_million_norm,weekly_hosp_admissions_per_million_mask,total_tests_per_thousand_norm,total_tests_per_thousand_mask,new_tests_per_thousand_norm,new_tests_per_thousand_mask,new_tests_smoothed_per_thousand_norm,new_tests_smoothed_per_thousand_mask,total_vaccinations_per_hundred_norm,total_vaccinations_per_hundred_mask,people_vaccinated_per_hundred_norm,people_vaccinated_per_hundred_mask,people_fully_vaccinated_per_hundred_norm,people_fully_vaccinated_per_hundred_mask,total_boosters_per_hundred_norm,total_boosters_per_hundred_mask,new_vaccinations_smoothed_per_million_norm,new_vaccinations_smoothed_per_million_mask,new_people_vaccinated_smoothed_per_hundred_norm,new_people_vaccinated_smoothed_per_hundred_mask,hospital_beds_per_thousand_norm,hospital_beds_per_thousand_mask,excess_mortality_cumulative_per_million_norm,excess_mortality_cumulative_per_million_mask,total_cases_per_million_6months_ago_norm,total_cases_per_million_6months_ago_mask,total_vaccinations_per_hundred_6months_ago_norm,total_vaccinations_per_hundred_6months_ago_mask,total_cases_for_6months_per_million_norm,total_cases_for_6months_per_million_mask,total_vaccinations_for_6months_per_hundred_norm,total_vaccinations_for_6months_per_hundred_mask,total_deaths_per_million_6months_ago_norm,total_deaths_per_million_6months_ago_mask,total_cases_per_million_for_6months_norm,total_cases_per_million_for_6months_mask,total_vaccinations_per_hundred_for_6months_norm,total_vaccinations_per_hundred_for_6months_mask,total_deaths_per_hundred_for_6months_norm,total_deaths_per_hundred_for_6months_mask
460,31161,2022-03-05,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.405141,0.267506,0.0,1,1.0,0.28,4,0,TRAIN,KOR,2022-03-05,0.112227,0.0,0.021444,0.0,0.102263,0.0,0.026341,0.0,0.006906,0.0,0.016711,0.0,0.095708,0.0,0.230665,0.0,0.0,1.0,0.333295,0.0,0.041239,0.0,0.03042,0.0,0.097058,0.0,0.568536,0.0,0.666925,0.0,0.67129,0.0,0.408985,0.0,0.012697,0.0,0.000342,0.0,0.88913,0.0,0.0,1.0,0.007658,0.0,0.245178,0.0,0.120432,0.0,0.597035,0.0,0.00705,0.0,0.120432,0.0,0.597035,0.0,0.597035,0.0
3687,44961,2022-03-25,1,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0.436953,0.546253,0.105201,1,1.0,0.44,3,1,TRAIN,KOR,2022-03-25,0.297367,0.0,0.028629,0.0,0.185418,0.0,0.042807,0.0,0.012532,0.0,0.046595,0.0,0.115899,0.0,0.0,1.0,0.0,1.0,0.341414,0.0,0.049505,0.0,0.021472,0.0,0.07281,0.0,0.573826,0.0,0.667855,0.0,0.672315,0.0,0.421612,0.0,0.007719,0.0,0.000256,0.0,0.88913,0.0,0.0,1.0,0.008729,0.0,0.312195,0.0,0.331015,0.0,0.501083,0.0,0.007433,0.0,0.331015,0.0,0.501083,0.0,0.501083,0.0
12455,30914,2022-03-04,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0.406505,0.319969,0.03886,1,1.0,0.32,2,1,TEST,KOR,2022-03-04,0.105452,0.0,0.022502,0.0,0.09574,0.0,0.025695,0.0,0.005947,0.0,0.014781,0.0,0.085131,0.0,0.0,1.0,0.0,1.0,0.330999,0.0,0.040748,0.0,0.031652,0.0,0.096766,0.0,0.568216,0.0,0.666847,0.0,0.671211,0.0,0.408321,0.0,0.014362,0.0,0.000427,0.0,0.88913,0.0,0.0,1.0,0.007605,0.0,0.243255,0.0,0.11274,0.0,0.599499,0.0,0.007028,0.0,0.11274,0.0,0.599499,0.0,0.599499,0.0
11754,33160,2022-08-20,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.140678,0.178563,0.0,1,0.0,0.36,5,0,VALIDATE,KOR,2022-08-20,0.589538,0.0,0.010907,0.0,0.066111,0.0,0.077803,0.0,0.002685,0.0,0.007676,0.0,0.054584,0.0,0.046881,0.0,0.0,1.0,0.105158,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.611102,0.0,0.669792,0.0,0.674758,0.0,0.518575,0.0,0.007258,0.0,8.5e-05,0.0,0.88913,0.0,0.0,1.0,0.054956,0.0,0.617943,0.0,0.617473,0.0,0.086013,0.0,0.022395,0.0,0.617473,0.0,0.086013,0.0,0.086013,0.0
9542,33964,2022-03-08,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0.43475,0.429472,0.043824,1,0.0,0.29,3,1,VALIDATE,KOR,2022-03-08,0.12973,0.0,0.017092,0.0,0.118169,0.0,0.027797,0.0,0.005947,0.0,0.020627,0.0,0.107563,0.0,0.242685,0.0,0.0,1.0,0.328183,0.0,0.042505,0.0,0.039818,0.0,0.098528,0.0,0.569299,0.0,0.66708,0.0,0.671448,0.0,0.41078,0.0,0.012364,0.0,0.000512,0.0,0.88913,0.0,0.0,1.0,0.00779,0.0,0.257585,0.0,0.140309,0.0,0.578898,0.0,0.007095,0.0,0.140309,0.0,0.578898,0.0,0.578898,0.0
14311,57848,2022-04-11,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.334387,0.460704,0.140487,1,0.0,0.36,1,1,TEST,KOR,2022-04-11,0.410917,0.0,0.007661,0.0,0.105364,0.0,0.058933,0.0,0.008248,0.0,0.041287,0.0,0.117393,0.0,0.0,1.0,0.0,1.0,0.265204,0.0,0.053267,0.0,0.006447,0.0,0.04235,0.0,0.576163,0.0,0.668552,0.0,0.672866,0.0,0.426796,0.0,0.003894,0.0,0.000427,0.0,0.88913,0.0,0.0,1.0,0.009844,0.0,0.363663,0.0,0.459698,0.0,0.424415,0.0,0.007866,0.0,0.459698,0.0,0.424415,0.0,0.424415,0.0
1344,35311,2022-03-12,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0.406505,0.319969,0.039596,1,1.0,0.43,2,1,TRAIN,KOR,2022-03-12,0.165338,0.0,0.032351,0.0,0.147589,0.0,0.030378,0.0,0.008599,0.0,0.025,0.0,0.113867,0.0,0.0,1.0,0.0,1.0,0.359756,0.0,0.044706,0.0,0.035881,0.0,0.110486,0.0,0.570603,0.0,0.66739,0.0,0.671842,0.0,0.413704,0.0,0.010349,0.0,0.000769,0.0,0.88913,0.0,0.0,1.0,0.008022,0.0,0.273269,0.0,0.180783,0.0,0.556556,0.0,0.007181,0.0,0.180783,0.0,0.556556,0.0,0.556556,0.0
3697,44871,2022-03-25,1,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0.443125,0.644961,0.083925,1,1.0,0.21,1,1,TRAIN,KOR,2022-03-25,0.297367,0.0,0.028629,0.0,0.185418,0.0,0.042807,0.0,0.012532,0.0,0.046595,0.0,0.115899,0.0,0.0,1.0,0.0,1.0,0.341414,0.0,0.049505,0.0,0.021472,0.0,0.07281,0.0,0.573826,0.0,0.667855,0.0,0.672315,0.0,0.421612,0.0,0.007719,0.0,0.000256,0.0,0.88913,0.0,0.0,1.0,0.008729,0.0,0.312195,0.0,0.331015,0.0,0.501083,0.0,0.007433,0.0,0.331015,0.0,0.501083,0.0,0.501083,0.0
75,21303,2021-12-21,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.185797,0.011991,0.0,1,1.0,0.25,1,0,TRAIN,KOR,2021-12-21,0.015335,0.0,0.000438,0.0,0.003477,0.0,0.014458,0.0,0.001663,0.0,0.008181,0.0,0.109168,0.0,0.0,1.0,0.0,1.0,0.137631,0.0,0.025275,0.0,0.010116,0.0,0.033306,0.0,0.463942,0.0,0.650035,0.0,0.638585,0.0,0.168804,0.0,0.142871,0.0,0.013321,0.0,0.88913,0.0,0.0,1.0,0.004493,0.0,0.096841,0.0,0.012906,0.0,0.649068,0.0,0.006109,0.0,0.012906,0.0,0.649068,0.0,0.649068,0.0
12177,47517,2023-01-03,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.113337,0.126465,0.285625,1,1.0,0.3,1,0,VALIDATE,KOR,2023-01-03,0.778451,0.0,0.006835,0.0,0.033206,0.0,0.096732,0.0,0.000928,0.0,0.00777,0.0,0.066224,0.0,0.02672,0.0,0.0,1.0,0.036082,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.670179,0.0,0.675388,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.88913,0.0,0.0,1.0,0.544648,0.0,0.659118,0.0,0.329173,0.0,0.0,1.0,0.074839,0.0,0.329173,0.0,0.0,1.0,0.0,1.0


In [82]:
df_load.isnull().sum(axis=0)

patient_id                                          0
selfcheck_date                                      0
cough                                               0
fever                                               0
sore_throat                                         0
shortness_of_breath                                 0
head_ache                                           0
runny_nose                                          0
muscle_pain                                         0
chills                                              0
loss_of_taste                                       0
loss_of_smell                                       0
sputum                                              0
chest_pain                                          0
indication_other                                    0
indication_abroad                                   0
indication_contact                                  0
global_confirmed_ratio                              0
confirmed_ratio             

In [83]:
df_load['hosp_patients_per_million_norm']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
15371    0.0
15372    0.0
15373    0.0
15374    0.0
15375    NaN
Name: hosp_patients_per_million_norm, Length: 15376, dtype: float64