In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# read case.csv
case = pd.read_csv('data/Case.csv')
case.shape

(174, 8)

In [3]:
case.head()

Unnamed: 0,case_id,province,city,group,infection_case,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652
1,1000002,Seoul,Gwanak-gu,True,Richway,119,37.48208,126.901384
2,1000003,Seoul,Guro-gu,True,Guro-gu Call Center,95,37.508163,126.884387
3,1000004,Seoul,Yangcheon-gu,True,Yangcheon Table Tennis Club,43,37.546061,126.874209
4,1000005,Seoul,Dobong-gu,True,Day Care Center,43,37.679422,127.044374


In [4]:
# check for null values
case.isnull().sum()

 case_id          0
province          0
city              0
group             0
infection_case    0
confirmed         0
latitude          0
longitude         0
dtype: int64

In [5]:
# info 
case.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0    case_id        174 non-null    int64 
 1   province        174 non-null    object
 2   city            174 non-null    object
 3   group           174 non-null    bool  
 4   infection_case  174 non-null    object
 5   confirmed       174 non-null    int64 
 6   latitude        174 non-null    object
 7   longitude       174 non-null    object
dtypes: bool(1), int64(2), object(5)
memory usage: 9.8+ KB


In [6]:
# region data 
region = pd.read_csv('data/Region.csv')
region.shape

(244, 12)

In [7]:
region.head()

Unnamed: 0,code,province,city,latitude,longitude,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,10000,Seoul,Seoul,37.566953,126.977977,607,830,48,1.44,15.38,5.8,22739
1,10010,Seoul,Gangnam-gu,37.518421,127.047222,33,38,0,4.18,13.17,4.3,3088
2,10020,Seoul,Gangdong-gu,37.530492,127.123837,27,32,0,1.54,14.55,5.4,1023
3,10030,Seoul,Gangbuk-gu,37.639938,127.025508,14,21,0,0.67,19.49,8.5,628
4,10040,Seoul,Gangseo-gu,37.551166,126.849506,36,56,1,1.17,14.39,5.7,1080


In [8]:
# region info
region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   code                      244 non-null    int64  
 1   province                  244 non-null    object 
 2   city                      244 non-null    object 
 3   latitude                  244 non-null    float64
 4   longitude                 244 non-null    float64
 5   elementary_school_count   244 non-null    int64  
 6   kindergarten_count        244 non-null    int64  
 7   university_count          244 non-null    int64  
 8   academy_ratio             244 non-null    float64
 9   elderly_population_ratio  244 non-null    float64
 10  elderly_alone_ratio       244 non-null    float64
 11  nursing_home_count        244 non-null    int64  
dtypes: float64(5), int64(5), object(2)
memory usage: 23.0+ KB


In [11]:
# merge case and region data
data = pd.merge(case, region, on='province')
data.head()

Unnamed: 0,case_id,province,city_x,group,infection_case,confirmed,latitude_x,longitude_x,code,city_y,latitude_y,longitude_y,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652,10000,Seoul,37.566953,126.977977,607,830,48,1.44,15.38,5.8,22739
1,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652,10010,Gangnam-gu,37.518421,127.047222,33,38,0,4.18,13.17,4.3,3088
2,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652,10020,Gangdong-gu,37.530492,127.123837,27,32,0,1.54,14.55,5.4,1023
3,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652,10030,Gangbuk-gu,37.639938,127.025508,14,21,0,0.67,19.49,8.5,628
4,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,139,37.538621,126.992652,10040,Gangseo-gu,37.551166,126.849506,36,56,1,1.17,14.39,5.7,1080


In [13]:
# check for null values
data.isnull().sum()


 case_id                    0
province                    0
city_x                      0
group                       0
infection_case              0
confirmed                   0
latitude_x                  0
longitude_x                 0
code                        0
city_y                      0
latitude_y                  0
longitude_y                 0
elementary_school_count     0
kindergarten_count          0
university_count            0
academy_ratio               0
elderly_population_ratio    0
elderly_alone_ratio         0
nursing_home_count          0
dtype: int64

In [14]:
data.shape

(3247, 19)

In [15]:
# check for duplicates
data.duplicated().sum()

np.int64(0)

In [16]:
# save the data
data.to_csv('data/data.csv', index=False)

In [17]:
# check redundant columns
data.columns

Index([' case_id', 'province', 'city_x', 'group', 'infection_case',
       'confirmed', 'latitude_x', 'longitude_x', 'code', 'city_y',
       'latitude_y', 'longitude_y', 'elementary_school_count',
       'kindergarten_count', 'university_count', 'academy_ratio',
       'elderly_population_ratio', 'elderly_alone_ratio',
       'nursing_home_count'],
      dtype='object')

In [19]:
# Select relevant columns for mapping and analysis
columns_to_keep = [
    'code',                     # Unique identifier for the region
    'province',                 # Province or state
    'city_y',                   # City (will rename this to 'city')
    'latitude_y',               # Latitude (will rename this to 'latitude')
    'longitude_y',              # Longitude (will rename this to 'longitude')
    'confirmed',                # Number of confirmed COVID-19 cases
    # Optional columns for filters or additional insights
    'elementary_school_count', 
    'elderly_population_ratio', 
    'elderly_alone_ratio', 
    'nursing_home_count',
    'academy_ratio'
]

# Select the columns
cleaned_data = data[columns_to_keep]

# Rename the columns for clarity
cleaned_data.rename(columns={
    'city_y': 'city',
    'latitude_y': 'latitude',
    'longitude_y': 'longitude'
}, inplace=True)

# Save the cleaned dataset to a new file
cleaned_data.to_csv('data/cleaned_data.csv', index=False)

print("Cleaned dataset saved to 'cleaned_data.csv'.")


Cleaned dataset saved to 'cleaned_data.csv'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data.rename(columns={
