In [3]:
import pandas as pd
import numpy as np
import pycountry as pc

In [4]:
source_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
files = [
    'time_series_covid19_confirmed_global.csv',
    'time_series_covid19_deaths_global.csv',
    'time_series_covid19_recovered_global.csv'
]

In [7]:
# fetching csv files from github

dfs_raw = [pd.read_csv(source_url + f) for f in files]
dfs_raw[0].head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,84,94,110,110,120,170,174,237,273,281
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,146,174,186,197,212,223,243,259,277,304
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,302,367,409,454,511,584,716,847,986,1171
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,188,224,267,308,334,370,376,390,428,439
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,3,4,4,5,7,7,7,8,8,8


In [8]:
# deleting unnecessary columns

drop_columns = ['Province/State', 'Lat', 'Long']
dfs_dropped = [df.drop(drop_columns, axis=1) for df in dfs_raw]

sample = dfs_dropped[0]
sample[sample['Country/Region'] == 'Australia']

Unnamed: 0,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20
8,Australia,0,0,0,0,0,0,0,0,0,...,39,53,62,71,77,78,80,84,87,91
9,Australia,0,0,0,0,3,4,4,4,4,...,1029,1219,1405,1617,1791,2032,2032,2182,2298,2389
10,Australia,0,0,0,0,0,0,0,0,0,...,6,12,12,15,15,15,17,19,21,22
11,Australia,0,0,0,0,0,0,0,1,3,...,443,493,555,625,656,689,743,781,835,873
12,Australia,0,0,0,0,0,0,0,0,0,...,170,235,257,287,299,305,337,367,367,396
13,Australia,0,0,0,0,0,0,0,0,0,...,36,47,47,62,66,66,69,69,72,74
14,Australia,0,0,0,0,1,1,1,1,2,...,466,520,574,685,769,821,917,968,1036,1085
15,Australia,0,0,0,0,0,0,0,0,0,...,175,231,231,278,311,355,364,392,400,400


In [10]:
%%time

# converting countries names to alpha 2 codes

manual_coding = {
    'Congo (Brazzaville)': 'CD',
    'Congo (Kinshasa)': 'CD',
    'Korea, South': 'KR',
    'Taiwan*': 'TW',
    'Laos': 'LA',
    'Burma': 'MM',
}

missed_names = []

def get_country_code(name):
    try:
        country_data = pc.countries.search_fuzzy(name)        
        return country_data[0].alpha_2
    except LookupError:
        manual_lookup = manual_coding.get(name)
        if manual_lookup is None:
            missed_names.append(name)
            
        return manual_lookup

def add_codes_column(df):
    codes = [get_country_code(record['Country/Region']) for _, record in df.iterrows()]
    df['Alpha2'] = codes
    return df

dfs_with_codes = [add_codes_column(df) for df in dfs_dropped]
dfs_with_codes = [df.drop(['Country/Region'], axis=1) for df in dfs_with_codes]

print(f'missed names: {missed_names}')
dfs_with_codes[0].head(10)

missed names: ['Diamond Princess', 'West Bank and Gaza', 'MS Zaandam', 'Diamond Princess', 'West Bank and Gaza', 'MS Zaandam', 'Diamond Princess', 'West Bank and Gaza', 'MS Zaandam']
CPU times: user 39 s, sys: 3.76 ms, total: 39 s
Wall time: 39 s


Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20,Alpha2
0,0,0,0,0,0,0,0,0,0,0,...,94,110,110,120,170,174,237,273,281,AF
1,0,0,0,0,0,0,0,0,0,0,...,174,186,197,212,223,243,259,277,304,AL
2,0,0,0,0,0,0,0,0,0,0,...,367,409,454,511,584,716,847,986,1171,DZ
3,0,0,0,0,0,0,0,0,0,0,...,224,267,308,334,370,376,390,428,439,AD
4,0,0,0,0,0,0,0,0,0,0,...,4,4,5,7,7,7,8,8,8,AO
5,0,0,0,0,0,0,0,0,0,0,...,7,7,7,7,7,7,7,9,15,AG
6,0,0,0,0,0,0,0,0,0,0,...,502,589,690,745,820,1054,1054,1133,1265,AR
7,0,0,0,0,0,0,0,0,0,0,...,290,329,407,424,482,532,571,663,736,AM
8,0,0,0,0,0,0,0,0,0,0,...,53,62,71,77,78,80,84,87,91,AU
9,0,0,0,0,3,4,4,4,4,4,...,1219,1405,1617,1791,2032,2032,2182,2298,2389,AU


In [11]:
# grouping records by country code

dfs_grouped = [df.groupby(['Alpha2'], as_index=False).sum() for df in dfs_with_codes]
dfs_grouped[0].head()

Unnamed: 0,Alpha2,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20
0,AD,0,0,0,0,0,0,0,0,0,...,188,224,267,308,334,370,376,390,428,439
1,AE,0,0,0,0,0,0,0,4,4,...,333,333,405,468,570,611,664,814,1024,1264
2,AF,0,0,0,0,0,0,0,0,0,...,84,94,110,110,120,170,174,237,273,281
3,AG,0,0,0,0,0,0,0,0,0,...,3,7,7,7,7,7,7,7,9,15
4,AL,0,0,0,0,0,0,0,0,0,...,146,174,186,197,212,223,243,259,277,304


In [13]:
# saving clean files

dest_dir = './who-data/'

for df, file in zip(dfs_grouped, files):
    df.to_csv(f'{dest_dir}/CLEAN-{file}', index=False, header=True)