In [30]:
# imports 
import numpy as np
import os 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter("ignore")

pd.options.display.max_rows = 10000

# maintain directories well defined
PROJECT_ROOT_DIR = "."
ALL_DATA_DIR = "dat"
DATA_DIR = "novel-covid-data"
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR)

COLS = ["Sno", "ObservationDate", "Province/State", "Country/Region", "Confirmed", "Deaths"]

# function for saving figures
def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
    
def initialize_data(dataset, data_path=DATA_PATH, cols=COLS):
    csv_path = os.path.join(data_path, dataset)
    data = pd.read_csv(csv_path)
    return data

data = initialize_data("covid_19_data.csv")
# data.describe()
# data.info() 
data[data["Country/Region"]=="Italy"]

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
480,481,01/31/2020,,Italy,1/31/2020 23:59,2.0,0.0,0.0
539,540,02/01/2020,,Italy,1/31/2020 8:15,2.0,0.0,0.0
608,609,02/02/2020,,Italy,2020-01-31T08:15:53,2.0,0.0,0.0
675,676,02/03/2020,,Italy,2020-01-31T08:15:53,2.0,0.0,0.0
743,744,02/04/2020,,Italy,2020-01-31T08:15:53,2.0,0.0,0.0
814,815,02/05/2020,,Italy,2020-01-31T08:15:53,2.0,0.0,0.0
885,886,02/06/2020,,Italy,2020-01-31T08:15:53,2.0,0.0,0.0
955,956,02/07/2020,,Italy,2020-02-07T17:53:02,3.0,0.0,0.0
1027,1028,02/08/2020,,Italy,2020-02-07T17:53:02,3.0,0.0,0.0
1099,1100,02/09/2020,,Italy,2020-02-07T17:53:02,3.0,0.0,0.0


In [19]:
indexes = data[data["Province/State"]=="Recovered"].index
data.drop(indexes, inplace = True)


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# unstratified split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
# train_set

In [21]:
# stratified split
#split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(data, data["Country/Region"]):
#     strat_train_set = data.loc[train_index]
#     strat_test_set = data.loc[test_index]
    
# strat_train_set

country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [i for i in labels if country_counts[i] == 1]
for i in singles:
    indexes = data[data["Country/Region"] == i].index
    data.drop(indexes, inplace = True)
    
country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [i for i in labels if country_counts[i] == 1]

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Country/Region"]):
   strat_train_set = data.loc[train_index]
   strat_test_set = data.loc[test_index]

In [22]:
# check sampling ratios for US as comparison
# stratified
print("Stratified Test: ", strat_test_set["Country/Region"].value_counts()["US"]/len(strat_test_set))
print("Stratified Train: ", strat_train_set["Country/Region"].value_counts()["US"]/len(strat_train_set))

# unstratified
print("UnStratified Test: ", test_set["Country/Region"].value_counts()["US"]/len(test_set))
print("UnStratified Train: ", train_set["Country/Region"].value_counts()["US"]/len(train_set))

# original data
print("Original Data: ", data["Country/Region"].value_counts()["US"]/len(data))

Stratified Test:  0.19586082783443312
Stratified Train:  0.19087977199429987
UnStratified Test:  0.1952023988005997
UnStratified Train:  0.19280359820089954
Original Data:  0.19337613247735044


In [23]:
covid_data = strat_train_set.copy()
# covid_data

In [24]:
grouped = covid_data.groupby("Country/Region")
missing_all = []
missing_none = []
missing_some = []
for k, df in grouped: 
    if df["Province/State"].isnull().all():
        missing_all.append(k)
    elif df["Province/State"].isnull().any():
        if (~df["Province/State"].isnull()).all():
            missing_none.append(k)
        else: 
            missing_some.append(k)

# print(missing_all, missing_none, missing_some)

In [25]:
def country_to_province(df, col_name="Region"):
    df.loc[df["Province/State"].isnull(), col_name] = df["Country/Region"] 
    df.loc[(~df["Province/State"].isnull()), col_name] = df["Province/State"] 
    df.drop("Country/Region", axis = 1, inplace = True)
    df.drop("Province/State", axis = 1, inplace = True)
    
country_to_province(covid_data)
covid_data[covid_data["Region"]=="Recovered"]


Unnamed: 0,SNo,ObservationDate,Confirmed,Deaths,Region


In [28]:
covid_data["ObsDate"] = pd.to_datetime(covid_data["ObservationDate"])

def days_since_first_obs(df, col_name="Day_Delta"):
    first_day_dict = {}
    covid_data.sort_values(by = "ObservationDate")
    for index, row in covid_data.iterrows():
        if row["Region"] not in first_day_dict.keys() and row["Confirmed"] != 0:
            first_day_dict[row["Region"]] = row["ObsDate"]
    date_deltas
    for index, row in covid_data.iterrows():
        try:     
            row[col_name] = (row["ObsDate"] - first_day_dict[row["Region"]]).days
        except KeyError:
            row[col_name] = 0
            print(row["Region"], " not in dict.")
    df[col_name] = date_deltas

days_since_first_obs(covid_data)
covid_data

American Samoa  not in dict.
Montgomery County, TX  not in dict.
Jervis Bay Territory  not in dict.
External territories  not in dict.
Montgomery County, TX  not in dict.
American Samoa  not in dict.
American Samoa  not in dict.
American Samoa  not in dict.
American Samoa  not in dict.


Unnamed: 0,SNo,ObservationDate,Confirmed,Deaths,Region,ObsDate,Day_Delta
1426,1427.0,02/14/2020,419.0,11.0,Heilongjiang,2020-02-14,
9390,9391.0,03/27/2020,203.0,0.0,Rhode Island,2020-03-27,
15488,15489.0,04/16/2020,862.0,27.0,Cuba,2020-04-16,
8244,8245.0,03/24/2020,382.0,1.0,Croatia,2020-03-24,
14448,14449.0,04/12/2020,897.0,44.0,Puerto Rico,2020-04-12,
14980,14981.0,04/14/2020,4933.0,28.0,United Arab Emirates,2020-04-14,
8240,8241.0,03/24/2020,378.0,3.0,Colombia,2020-03-24,
1754,1755.0,02/18/2020,16.0,0.0,Vietnam,2020-02-18,
2207,2208.0,02/24/2020,527.0,3.0,Sichuan,2020-02-24,
10556,10557.0,03/31/2020,169.0,0.0,Faroe Islands,2020-03-31,
