In [1]:
# imports 
import numpy as np
import os 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
import math

# suppress warnings (quite prevalent with pandas and numpy)
warnings.simplefilter("ignore")

pd.options.display.max_rows = 1000

# maintain directories well defined
PROJECT_ROOT_DIR = "."
ALL_DATA_DIR = "dat"
DATA_DIR = "novel-covid-data"
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR)

# global variables - desired columns from dataset
COLS = ["SNo", "ObservationDate", "Province/State", "Country/Region", "Confirmed", "Deaths"]

# function for initialization
def initialize_data(dataset, data_path=DATA_PATH, cols=COLS):
    csv_path = os.path.join(data_path, dataset)
    data = pd.read_csv(csv_path, usecols=cols)
    return data

# link to data - https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset#covid_19_data.csv
# initialize
data = initialize_data("covid_19_data.csv")

In [2]:
# miscellaneous drops that need to be applied to the entire data 
indexes = data[data["Province/State"]=="Recovered"].index
data.drop(indexes, inplace = True)
data.dropna(subset=["Deaths"], inplace = True)

country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [i for i in labels if country_counts[i] == 1]
for i in singles:
    indexes = data[data["Country/Region"] == i].index
    data.drop(indexes, inplace = True)
    
covid_data = data


# feature adding functions    
def country_to_province(df, col_name="Region"):
    df.loc[df["Province/State"].isnull(), col_name] = df["Country/Region"] 
    df.loc[(~df["Province/State"].isnull()), col_name] = df["Province/State"] 

def days_since_first_obs(df, col_name="Day_Delta"):
    df["ObsDate"]= pd.to_datetime(df["ObservationDate"])
    region_groups = df.groupby("Region")
    df[col_name] = np.nan
    for k, group in region_groups:
        group.sort_values(by="ObsDate", inplace = True)
        first_day = group.iloc[0]["ObsDate"]
        for i in range(len(group)):
            df.ix[(group.iloc[i]["SNo"]-1), col_name] = (group.iloc[i]["ObsDate"] - first_day).days
    df.drop("ObsDate", axis = 1, inplace = True)

country_to_province(covid_data)
days_since_first_obs(covid_data)

In [3]:
cat_attrbs = ["ObservationDate", "Province/State", "Country/Region", "Region"]
num_attrbs = ["SNo", "Confirmed", "Deaths", "Day_Delta"]
covid_data_num = covid_data.copy()
covid_data_cat = covid_data.copy()
covid_data_num.drop(columns = cat_attrbs, inplace=True)
covid_data_cat.drop(columns = num_attrbs, inplace=True)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

imputer = SimpleImputer(strategy = "most_frequent")
covid_data_cat = imputer.fit_transform(covid_data_cat)
one_hot = OneHotEncoder()
covid_data_cat = one_hot.fit_transform(covid_data_cat).toarray()
num_imputer = SimpleImputer(strategy = "median")
covid_data_num = imputer.fit_transform(covid_data_num)

covid_data = np.concatenate((covid_data_num, covid_data_cat), axis = 1)

In [4]:
SNo_ix, confirmed_ix, deaths_ix, day_delta_ix = 0,1,2,3

covid_datadf = pd.DataFrame(covid_data)

from sklearn.model_selection import train_test_split

# unstratified split
train_set, test_set = train_test_split(covid_datadf, test_size=0.2, random_state=42)
train_labels = train_set[deaths_ix].copy()
test_labels = test_set[deaths_ix].copy()
train_set.drop(deaths_ix, axis = 1, inplace = True)
test_set.drop(deaths_ix, axis = 1, inplace = True)

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_set, train_labels)

In [None]:
from sklearn.metrics import r2_score

lr_covid_predict = lin_reg.predict(train_set)
lin_score = r2_score(train_labels, lr_covid_predict)
lin_score

In [None]:
some_data = train_set.iloc[8000:8050]
some_labels = train_labels.iloc[8000:8050]

some_predictions = (lin_reg.predict(some_data))
some_labels = list(np.array(some_labels))
for i in range(len(some_labels)):
    print("predicted: {}, actual : {}\n".format(some_predictions[i], some_labels[i]))
    


In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_set, train_labels)

In [None]:
tree_covid_predict = tree_reg.predict(train_set)
tree_score = r2_score(train_labels, tree_covid_predict)
tree_score

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(train_set, train_labels)

In [None]:
forest_covid_predict = forest_reg.predict(train_set)
forest_score = r2_score(train_labels, forest_covid_predict)
forest_score

In [None]:
some_data = train_set.iloc[9000:10050]
some_labels = train_labels.iloc[9000:10050]

some_predictions = (forest_reg.predict(some_data))
some_labels = list(np.array(some_labels))
for i in range(len(some_labels)):
    print("predicted: {}, actual : {}\n".format(some_predictions[i], some_labels[i]))

In [None]:
from sklearn.model_selection import cross_val_score

# cross-validating - training n-1 folds of the training data and testing on the remaining one. repeat n times for 
# n accuracy scores. 

def display_scores(scores):
    print("Scores: ", scores)
    print("Means: ", scores.mean())
    print("STD: ", scores.std())

tree_scores = cross_val_score(tree_reg, covid_data_prepared, covid_labels, cv=7)
tree_rmse_scores = np.sqrt(tree_scores)    
lin_scores = cross_val_score(lin_reg, covid_data_prepared, covid_labels, cv=7)
lin_rmse_scores = np.sqrt(lin_scores)  
forest_scores = cross_val_score(forest_reg, covid_data_prepared, covid_labels, cv=7)
forest_rmse_scores = np.sqrt(forest_scores)  

In [None]:
display_scores(tree_scores)

In [None]:
display_scores(lin_scores)

In [None]:
display_scores(forest_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# sklearn will play with different combinations of hyperparameters to fine tune your model for you 

param_grid = [
{'n_estimators': [10, 30, 100], 'max_features': [2, 8, 16]},
{'bootstrap': [False], 'n_estimators': [30, 100], 'max_features': [8, 16]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=7)
grid_search.fit(covid_data_prepared, covid_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_search = RandomizedSearchCV(estimator = forest_reg, param_distributions = random_grid, n_iter = 50, cv = 7, 
                                   verbose=2, random_state=42, n_jobs = -1)
random_search.fit(train_set, train_labels)

In [None]:
random_search.best_params_

In [None]:
random_search.best_estimator_

In [None]:
cvres = random_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

In [5]:
from sklearn.ensemble import RandomForestRegressor

final_forest = RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [7]:
final_forest.fit(train_set, train_labels)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [10]:
from sklearn.metrics import r2_score

forest_final_predict = final_forest.predict(test_set)
forest_score = r2_score(test_labels, forest_final_predict)
forest_score

0.9933627095762859

In [11]:
some_data = test_set.iloc[1000:1050]
some_labels = test_labels.iloc[1000:1050]

some_predictions = (final_forest.predict(some_data))
some_labels = list(np.array(some_labels))
for i in range(len(some_labels)):
    print("predicted: {}, actual : {}\n".format(some_predictions[i], some_labels[i]))

predicted: 78.16357142857143, actual : 97.0

predicted: 2.0, actual : 2.0

predicted: 0.0, actual : 0.0

predicted: 251.28571428571428, actual : 256.0

predicted: 22.0, actual : 22.0

predicted: 0.38698630136986284, actual : 0.0

predicted: 0.0, actual : 0.0

predicted: 0.0, actual : 0.0

predicted: 5.742857142857233, actual : 6.0

predicted: 5.742857142857233, actual : 6.0

predicted: 3.0, actual : 3.0

predicted: 0.0, actual : 0.0

predicted: 0.09154929577464443, actual : 0.0

predicted: 0.0, actual : 0.0

predicted: 30.192857142857143, actual : 26.0

predicted: 0.25, actual : 0.0

predicted: 2.4385714285714286, actual : 4.0

predicted: 2055.8714285714286, actual : 2248.0

predicted: 7.0, actual : 7.0

predicted: 11.732142857142858, actual : 18.0

predicted: 8.0, actual : 8.0

predicted: 5.232142857142857, actual : 3.0

predicted: 0.013157894736842386, actual : 0.0

predicted: 0.015723270440251274, actual : 0.0

predicted: 3575.565, actual : 3600.0

predicted: 34.90785714285714, actu