In [4]:
import os 
import requests

for filename in ['time_series_covid19_confirmed_global.csv', 'time_series_covid19_deaths_global.csv', 'time_series_covid19_recovered_global.csv', 'time_series_covid19_confirmed_US.csv', 'time_series_covid19_deaths_US.csv']:
    print(f'Downloading {filename}')
    url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
    myfile = requests.get(url)
    open(filename, 'wb').write(myfile.content)

Downloading time_series_covid19_confirmed_global.csv
Downloading time_series_covid19_deaths_global.csv
Downloading time_series_covid19_recovered_global.csv
Downloading time_series_covid19_confirmed_US.csv
Downloading time_series_covid19_deaths_US.csv


In [5]:
from datetime import datetime
import pandas as pd 

def _convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]

confirmed_global_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
_convert_date_str(confirmed_global_df)

deaths_global_df = pd.read_csv('time_series_covid19_deaths_global.csv')
_convert_date_str(deaths_global_df)

recovered_global_df = pd.read_csv('time_series_covid19_recovered_global.csv')
_convert_date_str(recovered_global_df)

In [6]:
recovered_global_df

Unnamed: 0,Province/State,Country/Region,Lat,Long,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2021-07-18,2021-07-19,2021-07-20,2021-07-21,2021-07-22,2021-07-23,2021-07-24,2021-07-25,2021-07-26,2021-07-27
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,82586,82586,82586,82586,82586,82586,82586,82586,82586,82586
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,130074,130081,130086,130097,130109,130118,130125,130139,130152,130166
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,105604,106337,107041,107776,108537,109349,109951,110577,111322,112050
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,13844,13897,13930,13930,13988,13988,13988,13988,14077,14113
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,34755,34790,34857,34893,35082,35284,35423,35474,35686,35742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,,Vietnam,14.058324,108.277199,0,0,0,0,0,0,...,10667,11047,11443,11971,13421,15536,17583,19342,21344,22946
260,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,310525,310601,310601,310961,310961,311380,311380,311526,311526,311560
261,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,4159,4162,4162,4162,4162,4163,4166,4166,4166,4168
262,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,171699,173320,174728,175429,177419,179171,180535,181638,182665,183534


## 데이터 클렌징

In [7]:
import numpy as np 

removed_states = "Recovered|Grand Princess|Diamond Princess"
removed_countries = "US|The West Bank and Gaza"

confirmed_global_df.rename(columns={"Province/State": "Province_State", "Country/Region" : "Country_Region"}, inplace=True)
deaths_global_df.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)
recovered_global_df.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)

confirmed_global_df = confirmed_global_df[~confirmed_global_df["Province_State"].replace(np.nan, "nan").str.match(removed_states)]
deaths_global_df = deaths_global_df[~deaths_global_df["Province_State"].replace(np.nan, 'nan').str.match(removed_states)]
recovered_global_df = recovered_global_df[~recovered_global_df["Province_State"].replace(np.nan, 'nan').str.match(removed_states)]

confirmed_global_df = confirmed_global_df[~confirmed_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]
deaths_global_df = deaths_global_df[~deaths_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]
recovered_global_df = recovered_global_df[~recovered_global_df["Country_Region"].replace(np.nan, "nan").str.match(removed_countries)]

In [8]:
confirmed_global_melt_df = confirmed_global_df.melt(id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='ConfirmedCases')
deaths_global_melt_df = deaths_global_df.melt(id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='Deaths')
recovered_global_melt_df = deaths_global_df.melt(id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='Recovered')

In [9]:
train = confirmed_global_melt_df.merge(deaths_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
train = train.merge(recovered_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])

train

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,ConfirmedCases,Deaths,Recovered
0,Afghanistan,,33.939110,67.709953,2020-01-22,0,0,0
1,Albania,,41.153300,20.168300,2020-01-22,0,0,0
2,Algeria,,28.033900,1.659600,2020-01-22,0,0,0
3,Andorra,,42.506300,1.521800,2020-01-22,0,0,0
4,Angola,,-11.202700,17.873900,2020-01-22,0,0,0
...,...,...,...,...,...,...,...,...
152623,Vietnam,,14.058324,108.277199,2021-07-27,117121,524,524
152624,West Bank and Gaza,,31.952200,35.233200,2021-07-27,316189,3600,3600
152625,Yemen,,15.552727,48.516388,2021-07-27,7022,1374,1374
152626,Zambia,,-13.133897,27.849332,2021-07-27,192956,3316,3316


In [10]:
confirmed_us_df = pd.read_csv('time_series_covid19_confirmed_US.csv') 
deaths_us_df = pd.read_csv('time_series_covid19_deaths_US.csv') 
confirmed_us_df.drop(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'], inplace=True, axis=1) 
deaths_us_df.drop(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key', 'Population'], inplace=True, axis=1) 
confirmed_us_df.rename({'Long_': 'Long'}, axis=1, inplace=True) 
deaths_us_df.rename({'Long_': 'Long'}, axis=1, inplace=True) 
_convert_date_str(confirmed_us_df)
_convert_date_str(deaths_us_df)

In [11]:
confirmed_us_df = confirmed_us_df[~confirmed_us_df.Province_State.str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")] 
deaths_us_df = deaths_us_df[~deaths_us_df.Province_State.str.match("Diamond Princess|Grand Princess|Recovered|Northern Mariana Islands|American Samoa")]

In [12]:
confirmed_us_df = confirmed_us_df.groupby(['Country_Region', 'Province_State']).sum().reset_index() 
deaths_us_df = deaths_us_df.groupby(['Country_Region', 'Province_State']).sum().reset_index()

In [13]:
confirmed_us_df.drop(['Lat', 'Long'], inplace=True, axis=1)
deaths_us_df.drop(['Lat', 'Long'], inplace=True, axis=1)

In [14]:
confirmed_us_melt_df = confirmed_us_df.melt( id_vars=['Country_Region', 'Province_State'], value_vars=confirmed_us_df.columns[2:], var_name='Date', value_name='ConfirmedCases') 
deaths_us_melt_df = deaths_us_df.melt( id_vars=['Country_Region', 'Province_State'], value_vars=deaths_us_df.columns[2:], var_name='Date', value_name='Deaths') 
train_us = confirmed_us_melt_df.merge(deaths_us_melt_df, on=['Country_Region', 'Province_State', 'Date'])



In [15]:
train = pd.concat([train, train_us], axis=0, sort=False) 
train_us.rename({'Country_Region': 'country', 'Province_State': 'province', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities'}, axis=1, inplace=True) 
train_us['country_province'] = train_us['country'].fillna('') + '/' + train_us['province'].fillna('')

In [16]:
train.rename({'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities', 'Recovered': 'recovered'}, axis=1, inplace=True) 
train['country_province'] = train['country'].fillna('') + '/' + train['province'].fillna('')

In [17]:
train

Unnamed: 0,country,province,Lat,Long,date,confirmed,fatalities,recovered,country_province
0,Afghanistan,,33.93911,67.709953,2020-01-22,0,0,0.0,Afghanistan/
1,Albania,,41.15330,20.168300,2020-01-22,0,0,0.0,Albania/
2,Algeria,,28.03390,1.659600,2020-01-22,0,0,0.0,Algeria/
3,Andorra,,42.50630,1.521800,2020-01-22,0,0,0.0,Andorra/
4,Angola,,-11.20270,17.873900,2020-01-22,0,0,0.0,Angola/
...,...,...,...,...,...,...,...,...,...
29857,US,Virginia,,,2021-07-27,691018,11515,,US/Virginia
29858,US,Washington,,,2021-07-27,470333,6097,,US/Washington
29859,US,West Virginia,,,2021-07-27,166297,2936,,US/West Virginia
29860,US,Wisconsin,,,2021-07-27,684119,8184,,US/Wisconsin
