In [1]:
import pandas as pd
import os
from pathlib import Path

In [2]:
# Make sure the output directory exists before any CSV is written into it.
print('Creating ´data´ directory...')
try:
    os.mkdir('data')
except FileExistsError:
    # A previous run already created it; nothing else to do.
    print('Directory ´data´ already exists.')
else:
    print('´data´ directory created.')

Creating ´data´ directory...
Directory ´data´ already exists.


In [3]:
# Directory, relative to the working directory, that holds the global
# time-series CSV files read in the next cell.
# NOTE(review): the layout looks like a local checkout of the JHU CSSE
# COVID-19 repository — confirm where this folder is expected to come from.
DATA_PATH = Path('COVID-19/csse_covid_19_data/csse_covid_19_time_series')

## READ DATA

In [4]:
# Load the three global time-series tables in one pass. Every cell equal to
# 'Viet Nam' is rewritten to 'Vietnam' right after reading.
confirmed_df, death_df, recovered_df = (
    pd.read_csv(DATA_PATH / csv_name).replace('Viet Nam', 'Vietnam')
    for csv_name in (
        'time_series_covid19_confirmed_global.csv',
        'time_series_covid19_deaths_global.csv',
        'time_series_covid19_recovered_global.csv',
    )
)

In [5]:
# Working copies of the three source frames, keyed by the label that later
# becomes the column name of the world totals.
# NOTE(review): the 'Active cases' label is attached to the *confirmed* series
# — confirm that this naming is intended.
all_data = {'Active cases': confirmed_df.copy(), 'Deceased': death_df.copy(), 'Recovered cases': recovered_df.copy()}

## CREATE DATA FOR WORLD

#### HANDLE MISSING VALUES & INDEX & DROP COLUMNS

In [6]:
def df_prep_all(df, key, cols_to_drop):
    """Collapse a per-location table into a single column of column-wise totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Province/State' column, the columns named in
        ``cols_to_drop`` and the columns to be totalled. It is not modified.
    key : str
        Name given to the single column of the result.
    cols_to_drop : list of str
        Columns removed before summing (everything left is summed).

    Returns
    -------
    pandas.DataFrame
        One row per remaining input column, one column named ``key`` holding
        the sum of that column over all rows.
    """
    # 'Province/State' only serves to get the text column out of the way: the
    # sum runs down each column, so the row labels never reach the result and
    # missing province names need no filling (and no other frame is consulted).
    totals = (
        df.set_index('Province/State')
        .drop(columns=cols_to_drop)
        .sum(axis=0)
    )
    return totals.to_frame(name=key)

In [7]:
cols_to_drop_world = ['Country/Region', 'Lat', 'Long']

# Built from the source frames (not from a previous value of `all_data`) so
# this cell can be re-run safely. Copies are passed so the preparation step
# never touches confirmed_df / death_df / recovered_df.
all_data = {
    label: df_prep_all(frame.copy(), label, cols_to_drop_world)
    for label, frame in (
        ('Active cases', confirmed_df),
        ('Deceased', death_df),
        ('Recovered cases', recovered_df),
    )
}

#### CONCATENATE & WRITE

In [8]:
# Put the single-column total frames side by side, one column per label.
df = pd.concat(list(all_data.values()), axis='columns')

In [9]:
# Write the world totals to total.csv inside the data directory.
df.to_csv(Path('data') / 'total.csv')

## CREATE DATA FOR COUNTRIES-REGIONS

In [10]:
def df_prep_region(df, cols_to_drop):
    """Aggregate a per-location table to one row per 'Country/Region'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Country/Region' column plus the columns to be summed.
    cols_to_drop : list of str
        Columns removed before grouping.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Country/Region'; each remaining numeric column holds the
        sum over the rows of that country/region.
    """
    return (
        df.reset_index(drop=True)
        .drop(columns=cols_to_drop)
        .groupby('Country/Region')
        # `pd.np` no longer exists in current pandas, so use the built-in sum.
        # numeric_only=True keeps text columns such as 'Province/State' out
        # of the result instead of trying to add strings together.
        .sum(numeric_only=True)
    )

def get_data_for_region(region, region_data):
    """Collect one region's row from every table into a single frame.

    ``region_data`` maps a label to a table indexed by region. The row for
    ``region`` is taken from each table, named after the table's label, and
    the rows are joined as the columns of the returned frame.
    """
    def _labelled_row(label, table):
        # Select the region's row and tag it with the label it came from.
        row = table.loc[region]
        row.name = label
        return row

    rows = [_labelled_row(label, table) for label, table in region_data.items()]
    return pd.concat(rows, axis=1)

In [11]:
cols_to_drop_region = ['Lat', 'Long']

# One per-country table per label, each built from a fresh copy of its source
# frame so the originals stay as loaded.
region_data = {
    label: df_prep_region(frame.copy(), cols_to_drop_region)
    for label, frame in (
        ('Active cases', confirmed_df),
        ('Deceased', death_df),
        ('Recovered cases', recovered_df),
    )
}

In [12]:
# Preview the first rows of the per-country table stored under 'Active cases'
# (built from the confirmed series).
region_data['Active cases'].head()

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20,4/4/20,4/5/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,110,110,120,170,174,237,273,281,299,349
Albania,0,0,0,0,0,0,0,0,0,0,...,186,197,212,223,243,259,277,304,333,361
Algeria,0,0,0,0,0,0,0,0,0,0,...,409,454,511,584,716,847,986,1171,1251,1320
Andorra,0,0,0,0,0,0,0,0,0,0,...,267,308,334,370,376,390,428,439,466,501
Angola,0,0,0,0,0,0,0,0,0,0,...,4,5,7,7,7,8,8,8,10,14


In [13]:
# (rows, columns) of the per-country table: one row per 'Country/Region' group.
region_data['Active cases'].shape

(183, 75)

#### PREPARE AND WRITE

In [14]:
# Write one CSV per label, e.g. 'Active cases' -> regional-active-cases.csv.
# The loop variables deliberately avoid the name `df`: at notebook top level
# they are globals, and reusing `df` would replace the world-totals frame that
# the total.csv cell writes out.
for label, region_frame in region_data.items():
    file_name = f'regional-{label.replace(" ", "-").lower()}.csv'
    region_frame.to_csv(os.path.join('data', file_name))

## CREATE DATA FOR STATES-PROVINCES

In [15]:
# Columns removed from the per-province tables. 'Country/Region' is listed
# here but is still read inside df_prep_state before being dropped.
cols_to_drop_state = ['Country/Region', 'Lat', 'Long']

def df_prep_state(df, cols_to_drop):
    """Build a table indexed by province/state from a per-location table.

    Rows without a 'Province/State' value are labelled with their
    'Country/Region' instead. Rows whose country is 'Canada' and whose
    province is 'Diamond Princess' are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'Province/State' and 'Country/Region' columns. It is not
        modified.
    cols_to_drop : list of str
        Columns removed from the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Province/State', without the columns in ``cols_to_drop``.
    """
    # Compute the filled labels separately instead of assigning into `df`, so
    # the caller's frame keeps its original (possibly missing) province names.
    state_labels = df['Province/State'].fillna(df['Country/Region'])
    is_canada_cruise_row = (df['Country/Region'] == 'Canada') & (state_labels == 'Diamond Princess')

    prepared = df.assign(**{'Province/State': state_labels})
    prepared = prepared[~is_canada_cruise_row]
    return prepared.set_index('Province/State').drop(columns=cols_to_drop)

In [16]:
# Fresh copies of the three source frames, keyed by output label, so the
# province-level preparation works on its own data rather than on
# confirmed_df / death_df / recovered_df themselves.
state_data = {'Active cases': confirmed_df.copy(), 'Deceased': death_df.copy(), 'Recovered cases': recovered_df.copy()}

In [17]:
# Built from the source frames (not from a previous value of `state_data`) so
# this cell can be re-run safely: an already-prepared table no longer has the
# 'Province/State' column that df_prep_state reads.
state_data = {
    label: df_prep_state(frame.copy(), cols_to_drop_state)
    for label, frame in (
        ('Active cases', confirmed_df),
        ('Deceased', death_df),
        ('Recovered cases', recovered_df),
    )
}

In [18]:
# (rows, columns) of the province-level table stored under 'Active cases'.
state_data['Active cases'].shape

(261, 75)

In [19]:
# Same shape check as the previous cell (repeated).
state_data['Active cases'].shape

(261, 75)

#### PREPARE AND WRITE

In [20]:
# Write one CSV per label, e.g. 'Active cases' -> state-active-cases.csv.
# The loop variables deliberately avoid the name `df`: at notebook top level
# they are globals, and reusing `df` would replace the world-totals frame that
# the total.csv cell writes out.
for label, state_frame in state_data.items():
    file_name = f'state-{label.replace(" ", "-").lower()}.csv'
    state_frame.to_csv(os.path.join('data', file_name))