In [3]:
import pandas as pd
import os

# Root data directory: raw CSVs are read from its 'raw' subfolder and
# cleaned CSVs are written to its 'clean' subfolder.
DATA_DIR = './data'

In [4]:
# Read both raw tables from DATA_DIR/raw, in the order they are unpacked.
p_info, p_route = (
    pd.read_csv(os.path.join(DATA_DIR, 'raw', csv_name))
    for csv_name in ('PatientInfo.csv', 'PatientRoute.csv')
)

## Quick Exploratory Analysis of Data
### Patient Info

In [5]:
# Preview the first rows of the patient info table.
p_info.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2.0,male,1964.0,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75.0,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5.0,male,1987.0,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31.0,,2020-01-30,2020-03-02,,released
2,1000000003,6.0,male,1964.0,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000000.0,17.0,,2020-01-30,2020-02-19,,released
3,1000000004,7.0,male,1991.0,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1.0,,9.0,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,9.0,female,1992.0,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2.0,1000000000.0,2.0,,2020-01-31,2020-02-24,,released


In [6]:
# Show each column's dtype and non-null count to spot incomplete columns.
p_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3326 entries, 0 to 3325
Data columns (total 18 columns):
patient_id            3326 non-null int64
global_num            2237 non-null float64
sex                   3260 non-null object
birth_year            2889 non-null float64
age                   3252 non-null object
country               3326 non-null object
province              3326 non-null object
city                  3247 non-null object
disease               18 non-null object
infection_case        2559 non-null object
infection_order       31 non-null float64
infected_by           806 non-null float64
contact_number        614 non-null float64
symptom_onset_date    471 non-null object
confirmed_date        3323 non-null object
released_date         1218 non-null object
deceased_date         60 non-null object
state                 3326 non-null object
dtypes: float64(5), int64(1), object(12)
memory usage: 467.8+ KB


In [7]:
# Number of patients per value of `state`.
p_info['state'].value_counts()

released    1637
isolated    1622
deceased      67
Name: state, dtype: int64

In [8]:
# Number of patients per value of `country`.
p_info['country'].value_counts()

Korea            3300
China              11
United States       6
Thailand            2
France              1
Canada              1
Foreign             1
Mongolia            1
Indonesia           1
Spain               1
Switzerland         1
Name: country, dtype: int64

In [9]:
# Number of patients per recorded `sex`; rows with a missing value are not counted.
p_info['sex'].value_counts()

female    1831
male      1429
Name: sex, dtype: int64

#### Summary
From the patient data set info output above we can see that several columns are incomplete and contain NaN values. For example, out of 3326 patients only 3260 have their sex recorded. We must remove every row that has NaN in a column we need for the analysis. We will also drop columns that are of no use in this analysis or that can be derived from the values of other columns.

### Cleaning Process

In [11]:
def clean_patient_info(df, to_file=False):
    """Drop unneeded columns and rows with a missing `sex` or `age`.

    The input frame is left untouched; a new, cleaned frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Patient info table. Must contain the columns `global_num`,
        `birth_year`, `city`, `province`, `country`, `sex` and `age`.
    to_file : str or False, optional
        File name to write the cleaned frame to, inside the 'clean'
        subfolder of DATA_DIR. Nothing is written when falsy (the default).

    Returns
    -------
    pandas.DataFrame
        New frame without the dropped columns and incomplete rows.
        The surviving rows keep their original index labels.
    """
    # Non-inplace calls return new frames, so the caller's `df` is never modified.
    cleaned = (
        df
        .drop(['global_num', 'birth_year', 'city', 'province', 'country'], axis=1)
        .dropna(subset=['sex', 'age'])
    )

    if to_file:
        clean_dir = os.path.join(DATA_DIR, 'clean')
        # The output folder may not exist yet (e.g. on a fresh checkout).
        os.makedirs(clean_dir, exist_ok=True)
        cleaned.to_csv(os.path.join(clean_dir, to_file), index=False)

    return cleaned

# Clean a copy so the raw `p_info` frame stays available to later cells.
clean_p_info = clean_patient_info(
    p_info.copy(),
    to_file='PatientInfoClean.csv',
)

### Patient Route

In [12]:
# Preview the first rows of the patient route table.
p_route.head()

Unnamed: 0,patient_id,global_num,date,province,city,type,latitude,longitude
0,1000000001,2.0,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632
1,1000000001,2.0,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659
2,1000000002,5.0,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048
3,1000000002,5.0,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822
4,1000000002,5.0,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534


In [13]:
# Show each column's dtype and non-null count for the route table.
p_route.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5963 entries, 0 to 5962
Data columns (total 8 columns):
patient_id    5963 non-null int64
global_num    3154 non-null float64
date          5963 non-null object
province      5963 non-null object
city          5963 non-null object
type          5963 non-null object
latitude      5963 non-null float64
longitude     5963 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 372.8+ KB


In [14]:
# Number of distinct patient ids appearing in the route table.
p_route['patient_id'].nunique(dropna=False)

1109

IMPORTANT: We don't have patient route info for all patients in the PatientInfo.csv file! Only 1109 of the 3326 patients appear in PatientRoute.csv. We could assume that patients with no route data have not moved and are still at the location given in the PatientInfo.csv file.

In [15]:
# Patient ids present in the raw patient table but absent from the cleaned one.
# A membership test (rather than a symmetric difference) stays correct even
# when a `patient_id` is repeated in the raw table.
ids_to_drop = p_info.loc[
    ~p_info['patient_id'].isin(clean_p_info['patient_id']),
    'patient_id',
]

Above we calculate the IDs that we removed from the PatientInfo.csv data when cleaning it. We need to remove these IDs (if they exist) from the PatientRoute.csv data as well.

### Cleaning Process

In [18]:
def clean_patient_route(df, remove_ids, to_file=False):
    """Drop the `global_num` column and all rows of the given patient ids.

    The input frame is left untouched; a new, cleaned frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Patient route table. Must contain the columns `global_num` and
        `patient_id`.
    remove_ids : list-like
        Patient ids whose route rows are removed.
    to_file : str or False, optional
        File name to write the cleaned frame to, inside the 'clean'
        subfolder of DATA_DIR. Nothing is written when falsy (the default).

    Returns
    -------
    pandas.DataFrame
        New frame without `global_num` and without rows whose
        `patient_id` is in `remove_ids`.
    """
    # Non-inplace drop returns a new frame, so the caller's `df` is never modified.
    cleaned = df.drop('global_num', axis=1)

    # Keep only rows whose patient id is NOT listed in remove_ids.
    cleaned = cleaned[~cleaned['patient_id'].isin(remove_ids)]

    if to_file:
        clean_dir = os.path.join(DATA_DIR, 'clean')
        # The output folder may not exist yet (e.g. on a fresh checkout).
        os.makedirs(clean_dir, exist_ok=True)
        cleaned.to_csv(os.path.join(clean_dir, to_file), index=False)

    return cleaned

# Clean a copy so the raw `p_route` frame stays available for inspection.
clean_p_route = clean_patient_route(
    p_route.copy(),
    remove_ids=ids_to_drop,
    to_file='PatientRouteClean.csv',
)

## Data Storage Explanation

To reduce redundancy, we have decided not to combine the two data sets. This way we have two files: one with info about the patients (the PatientInfoClean.csv file) and another with the possible routes taken by patients listed in the patients file.

In [17]:
# Route patient ids with no matching row in the cleaned patient info (produces empty set).
set(clean_p_route['patient_id']).difference(clean_p_info['patient_id'])

set()

Above we can see that there are no patients in the PatientRoute data who are not in the PatientInfo data; however, the reverse is not true.