In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the data
PatientInfo = pd.read_csv('data/PatientInfo.csv')
Time = pd.read_csv('data/Time.csv')
TimeProvince = pd.read_csv('data/TimeProvince.csv')
Region = pd.read_csv('data/Region.csv')
SearchTrend = pd.read_csv('data/SearchTrend.csv')
Weather = pd.read_csv('data/Weather.csv')
Case = pd.read_csv('data/Case.csv')
Policy = pd.read_csv('data/Policy.csv')
SeoulFloating = pd.read_csv('data/SeoulFloating.csv')
TimeAge = pd.read_csv('data/TimeAge.csv')
TimeGender = pd.read_csv('data/TimeGender.csv')

In [3]:
# patient info data
PatientInfo.head()

Unnamed: 0,patient_id,sex,age,country,province,city,infection_case,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,male,50s,Korea,Seoul,Gangseo-gu,overseas inflow,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,male,30s,Korea,Seoul,Jungnang-gu,overseas inflow,,31,,2020-01-30,2020-03-02,,released
2,1000000003,male,50s,Korea,Seoul,Jongno-gu,contact with patient,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,male,20s,Korea,Seoul,Mapo-gu,overseas inflow,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,female,20s,Korea,Seoul,Seongbuk-gu,contact with patient,1000000002.0,2,,2020-01-31,2020-02-24,,released


In [4]:
# columns of patient info data
PatientInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   patient_id          5165 non-null   int64 
 1   sex                 4043 non-null   object
 2   age                 3785 non-null   object
 3   country             5165 non-null   object
 4   province            5165 non-null   object
 5   city                5071 non-null   object
 6   infection_case      4246 non-null   object
 7   infected_by         1346 non-null   object
 8   contact_number      791 non-null    object
 9   symptom_onset_date  690 non-null    object
 10  confirmed_date      5162 non-null   object
 11  released_date       1587 non-null   object
 12  deceased_date       66 non-null     object
 13  state               5165 non-null   object
dtypes: int64(1), object(13)
memory usage: 565.1+ KB


In [5]:
PatientInfo.shape

(5165, 14)

In [6]:
# check for null values
PatientInfo.isnull().sum()

patient_id               0
sex                   1122
age                   1380
country                  0
province                 0
city                    94
infection_case         919
infected_by           3819
contact_number        4374
symptom_onset_date    4475
confirmed_date           3
released_date         3578
deceased_date         5099
state                    0
dtype: int64

In [7]:
# Replace null values in the 'city' column with the most frequent city
most_frequent_city = PatientInfo['city'].mode()[0]
PatientInfo['city'].fillna(most_frequent_city, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  PatientInfo['city'].fillna(most_frequent_city, inplace=True)


In [8]:
# Replace null values in the 'infection_case' column with the most frequent infection case
most_frequent_infection_case = PatientInfo['infection_case'].mode()[0]
PatientInfo['infection_case'].fillna(most_frequent_infection_case, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  PatientInfo['infection_case'].fillna(most_frequent_infection_case, inplace=True)


In [9]:
# drop contact_number column
PatientInfo.drop('contact_number', axis=1, inplace=True)

In [10]:
from datetime import timedelta

# Convert 'confirmed_date' and 'released_date' to datetime.date
PatientInfo['confirmed_date'] = pd.to_datetime(PatientInfo['confirmed_date'], errors='coerce').dt.date
PatientInfo['released_date'] = pd.to_datetime(PatientInfo['released_date'], errors='coerce').dt.date

# Calculate the difference in days
PatientInfo['difference_in_days'] = (
    PatientInfo['released_date'] - PatientInfo['confirmed_date']
).apply(lambda x: x.days if pd.notnull(x) else None)

# Calculate the mean difference, ignoring NaN values
mean_difference = PatientInfo['difference_in_days'].mean()

# Fill missing 'released_date' values by adding the mean difference to 'confirmed_date'
PatientInfo['released_date'] = PatientInfo.apply(
    lambda row: row['released_date'] if pd.notnull(row['released_date']) else (
        row['confirmed_date'] + timedelta(days=int(mean_difference)) if pd.notnull(row['confirmed_date']) else None
    ),
    axis=1
)

# Drop the temporary column used for calculation
PatientInfo.drop(columns=['difference_in_days'], inplace=True)


In [12]:
# Ensure 'confirmed_date' and 'released_date' contain only the date (no time)
PatientInfo['confirmed_date'] = pd.to_datetime(PatientInfo['confirmed_date'], errors='coerce').dt.date
PatientInfo['released_date'] = pd.to_datetime(PatientInfo['released_date'], errors='coerce').dt.date

# Save the updated dataset to a new file
PatientInfo.to_csv('Updated_PatientInfo.csv', index=False)

print("The time component has been removed, leaving only the date.")


The time component has been removed, leaving only the date.


In [14]:
PatientInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5165 entries, 0 to 5164
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   patient_id          5165 non-null   int64 
 1   sex                 4043 non-null   object
 2   age                 3785 non-null   object
 3   country             5165 non-null   object
 4   province            5165 non-null   object
 5   city                5165 non-null   object
 6   infection_case      5165 non-null   object
 7   infected_by         1346 non-null   object
 8   symptom_onset_date  690 non-null    object
 9   confirmed_date      5162 non-null   object
 10  released_date       5162 non-null   object
 11  deceased_date       66 non-null     object
 12  state               5165 non-null   object
dtypes: int64(1), object(12)
memory usage: 524.7+ KB


In [16]:
# Convert 'confirmed_date' and 'released_date' to datetime, then to strings in 'YYYY-MM-DD' format
PatientInfo['confirmed_date'] = pd.to_datetime(PatientInfo['confirmed_date'], errors='coerce').dt.strftime('%Y-%m-%d')
PatientInfo['released_date'] = pd.to_datetime(PatientInfo['released_date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Save the updated dataset to a new file
PatientInfo.to_csv('Updated_PatientInfo.csv', index=False)

print("The time component has been successfully removed, leaving only the date in 'YYYY-MM-DD' format.")


The time component has been successfully removed, leaving only the date in 'YYYY-MM-DD' format.
