In [66]:
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [67]:
# Load every table of the dataset into its own DataFrame.
# DATA_DIR is the single place to edit if the CSV files live somewhere else.
DATA_DIR = 'data'

PatientInfo = pd.read_csv(f'{DATA_DIR}/PatientInfo.csv')
Time = pd.read_csv(f'{DATA_DIR}/Time.csv')
TimeProvince = pd.read_csv(f'{DATA_DIR}/TimeProvince.csv')
Region = pd.read_csv(f'{DATA_DIR}/Region.csv')
SearchTrend = pd.read_csv(f'{DATA_DIR}/SearchTrend.csv')
Weather = pd.read_csv(f'{DATA_DIR}/Weather.csv')
Case = pd.read_csv(f'{DATA_DIR}/Case.csv')
Policy = pd.read_csv(f'{DATA_DIR}/Policy.csv')
SeoulFloating = pd.read_csv(f'{DATA_DIR}/SeoulFloating.csv')
TimeAge = pd.read_csv(f'{DATA_DIR}/TimeAge.csv')
TimeGender = pd.read_csv(f'{DATA_DIR}/TimeGender.csv')

In [68]:
# Preview the first five rows of the patient table
PatientInfo.head(n=5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,2,,2020-01-31,2020-02-24,,released


In [69]:
# Summarise the patient table: column names, non-null counts and dtypes
PatientInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   patient_id          5165 non-null   int64 
 1   sex                 4043 non-null   object
 2   age                 3785 non-null   object
 3   country             5165 non-null   object
 4   province            5165 non-null   object
 5   city                5071 non-null   object
 6   infection_case      4246 non-null   object
 7   infected_by         1346 non-null   object
 8   contact_number      791 non-null    object
 9   symptom_onset_date  690 non-null    object
 10  confirmed_date      5162 non-null   object
 11  released_date       1587 non-null   object
 12  deceased_date       66 non-null     object
 13  state               5165 non-null   object
dtypes: int64(1), object(13)
memory usage: 565.1+ KB


In [70]:
# Dimensions of the patient table as (rows, columns)
PatientInfo.shape

(5165, 14)

In [71]:
# Number of missing values in each column of the patient table
PatientInfo.isna().sum()

patient_id               0
sex                   1122
age                   1380
country                  0
province                 0
city                    94
infection_case         919
infected_by           3819
contact_number        4374
symptom_onset_date    4475
confirmed_date           3
released_date         3578
deceased_date         5099
state                    0
dtype: int64

In [72]:
# Impute missing 'city' values with the modal (most common) city of the table.
# NOTE(review): the mode is taken over all rows, so the filled-in city may not
# belong to the row's 'province' — confirm this is acceptable for the analysis.
most_frequent_city = PatientInfo['city'].mode()[0]
PatientInfo.loc[PatientInfo['city'].isna(), 'city'] = most_frequent_city

In [73]:
# Impute missing 'infection_case' values with the modal (most common) case.
# NOTE(review): every previously-missing row is assigned to the modal category,
# which inflates that category in any later frequency count — confirm intended.
most_frequent_infection_case = PatientInfo['infection_case'].mode()[0]
PatientInfo['infection_case'] = PatientInfo['infection_case'].where(
    PatientInfo['infection_case'].notna(),
    most_frequent_infection_case,
)

In [74]:
# Remove the 'contact_number' column from the patient table
PatientInfo = PatientInfo.drop(columns='contact_number')

In [75]:
# Impute missing release dates from the average confirmed -> released interval.
#
# Steps:
#   1. Parse the three date columns (unparseable values become NaT).
#   2. Compute the mean number of whole days between confirmation and release
#      over the rows where both dates are known.
#   3. Fill each missing release date with confirmed_date + that mean, truncated
#      to an integer number of days; rows without a confirmed_date stay missing.
#   4. Where a recorded death precedes the release date, cap the release date
#      at the deceased_date.
#
# NOTE(review): step 3 is applied to every row regardless of 'state', so
# patients who are deceased or still isolated also receive an estimated
# release date — confirm that this is intended.
confirmed = pd.to_datetime(PatientInfo['confirmed_date'], errors='coerce')
released = pd.to_datetime(PatientInfo['released_date'], errors='coerce')
deceased = pd.to_datetime(PatientInfo['deceased_date'], errors='coerce')

# .mean() skips NaN, i.e. rows that lack either of the two dates.
mean_difference = (released - confirmed).dt.days.mean()

# If no row has both dates the mean is NaN: there is nothing to impute from,
# and int(NaN) would raise, so the fill is skipped in that case.
if pd.notna(mean_difference):
    released = released.fillna(confirmed + timedelta(days=int(mean_difference)))

# Comparisons involving NaT evaluate to False, so rows missing either date
# are left unchanged by the cap.
released = released.mask(released > deceased, deceased)

# Store plain datetime.date values (no time component) back on the frame.
PatientInfo['confirmed_date'] = confirmed.dt.date
PatientInfo['released_date'] = released.dt.date
PatientInfo['deceased_date'] = deceased.dt.date

In [76]:
# Re-inspect non-null counts and dtypes after the cleaning steps above
PatientInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   patient_id          5165 non-null   int64 
 1   sex                 4043 non-null   object
 2   age                 3785 non-null   object
 3   country             5165 non-null   object
 4   province            5165 non-null   object
 5   city                5165 non-null   object
 6   infection_case      5165 non-null   object
 7   infected_by         1346 non-null   object
 8   symptom_onset_date  690 non-null    object
 9   confirmed_date      5162 non-null   object
 10  released_date       5162 non-null   object
 11  deceased_date       66 non-null     object
 12  state               5165 non-null   object
dtypes: int64(1), object(12)
memory usage: 524.7+ KB


In [77]:
# Render 'confirmed_date' and 'released_date' as 'YYYY-MM-DD' text:
# parse each column to datetime first (invalid values become NaT), then format.
for date_column in ('confirmed_date', 'released_date'):
    parsed = pd.to_datetime(PatientInfo[date_column], errors='coerce')
    PatientInfo[date_column] = parsed.dt.strftime('%Y-%m-%d')

# Write the cleaned table to a new CSV file, without the index column
PatientInfo.to_csv('Updated_PatientInfo.csv', index=False)

print("The time component has been successfully removed, leaving only the date in 'YYYY-MM-DD' format.")


The time component has been successfully removed, leaving only the date in 'YYYY-MM-DD' format.


In [78]:
# Preview the first five rows after the date columns were reformatted
PatientInfo.head(n=5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,2020-01-22,2020-01-23,2020-02-05,NaT,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,,2020-01-30,2020-03-02,NaT,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,,2020-01-30,2020-02-19,NaT,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,2020-01-26,2020-01-30,2020-02-15,NaT,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,,2020-01-31,2020-02-24,NaT,released


In [79]:
# Reload the cleaned patient table from the CSV file written above.
# No date parsing is requested here, so the date columns come back as text.
updated_patient_info = pd.read_csv('Updated_PatientInfo.csv')


In [80]:
# Preview the first five rows of the table reloaded from CSV
updated_patient_info.head(n=5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,,2020-01-31,2020-02-24,,released


In [81]:
# Select the patients whose state is 'deceased', then show how many
# non-null entries each column has within that subset
deceased_patients = updated_patient_info.loc[updated_patient_info['state'].eq('deceased')]
deceased_patients.count()

patient_id            78
sex                   75
age                   75
country               78
province              78
city                  78
infection_case        78
infected_by            3
symptom_onset_date     6
confirmed_date        78
released_date         78
deceased_date         66
state                 78
dtype: int64

In [82]:
# Preview the first five deceased patients
deceased_patients.head(n=5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,symptom_onset_date,confirmed_date,released_date,deceased_date,state
12,1000000013,male,80s,Korea,Seoul,Jongno-gu,contact with patient,1000000017.0,,2020-02-16,2020-03-11,,deceased
108,1000000109,male,90s,Korea,Seoul,etc,contact with patient,,2020-03-02,2020-03-07,2020-03-31,,deceased
284,1000000285,male,40s,Korea,Seoul,Mapo-gu,Guro-gu Call Center,,,2020-03-19,2020-04-12,,deceased
472,1000000473,male,60s,Korea,Seoul,etc,contact with patient,1000000443.0,,2020-03-31,2020-04-24,,deceased
996,1000000997,,,Korea,Seoul,Seodaemun-gu,contact with patient,,,2020-06-08,2020-07-02,,deceased


In [83]:
# Keep only the deceased patients that have a recorded 'deceased_date'
deceased_patients_no_null = deceased_patients.dropna(subset=['deceased_date'])

In [85]:
# Preview the last five deceased patients with a recorded 'deceased_date'
deceased_patients_no_null.tail(n=5)

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,symptom_onset_date,confirmed_date,released_date,deceased_date,state
4902,6020100104,male,60s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-20,2020-02-19,2020-02-19,deceased
4903,6020100107,male,60s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-21,2020-02-24,2020-02-24,deceased
4904,6020100205,female,50s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-21,2020-02-21,deceased
4905,6020100286,male,60s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-23,2020-02-23,deceased
4906,6020100298,male,50s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-25,2020-02-25,deceased


In [86]:
# For deceased patients whose recorded death precedes the release date, set
# 'released_date' to the 'deceased_date'. The 'deceased_date' column itself is
# never modified.
#
# The table was reloaded from CSV without date parsing, so both columns hold
# text; parse them for the comparison instead of relying on string ordering.
deceased_on = pd.to_datetime(updated_patient_info['deceased_date'], errors='coerce')
released_on = pd.to_datetime(updated_patient_info['released_date'], errors='coerce')

# Comparisons involving NaT evaluate to False, so rows missing either date
# are excluded from the mask.
died_before_release = (updated_patient_info['state'] == 'deceased') & (deceased_on < released_on)

# Copy the original (text) values so the column keeps a uniform representation.
updated_patient_info.loc[died_before_release, 'released_date'] = (
    updated_patient_info.loc[died_before_release, 'deceased_date']
)

In [87]:
# Display every row whose state is 'deceased'
updated_patient_info.loc[updated_patient_info['state'].eq('deceased')]

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,symptom_onset_date,confirmed_date,released_date,deceased_date,state
12,1000000013,male,80s,Korea,Seoul,Jongno-gu,contact with patient,1000000017,,2020-02-16,2020-03-11,,deceased
108,1000000109,male,90s,Korea,Seoul,etc,contact with patient,,2020-03-02,2020-03-07,2020-03-31,,deceased
284,1000000285,male,40s,Korea,Seoul,Mapo-gu,Guro-gu Call Center,,,2020-03-19,2020-04-12,,deceased
472,1000000473,male,60s,Korea,Seoul,etc,contact with patient,1000000443,,2020-03-31,2020-04-24,,deceased
996,1000000997,,,Korea,Seoul,Seodaemun-gu,contact with patient,,,2020-06-08,2020-07-02,,deceased
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,6020100107,male,60s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-21,2020-02-24,2020-02-24,deceased
4904,6020100205,female,50s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-21,2020-02-21,deceased
4905,6020100286,male,60s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-23,2020-02-23,deceased
4906,6020100298,male,50s,Korea,Gyeongsangbuk-do,Cheongdo-gun,Cheongdo Daenam Hospital,,,2020-02-22,2020-02-25,2020-02-25,deceased


In [88]:
# Count how many patient rows carry each distinct 'infection_case' value
# (a per-value frequency table, not the number of distinct values)
infection_cases = updated_patient_info['infection_case'].value_counts()

In [None]:
# Display the per-value frequency of 'infection_case'
# NOTE(review): this cell has no execution count and its saved output is an
# AttributeError about a `.distinct` attribute that the current code does not
# use — re-run the cell so the stored output matches the code.
infection_cases

AttributeError: 'Series' object has no attribute 'distinct'