In [36]:
# imports 
import numpy as np
import os 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
import math

# suppress warnings (quite prevalent with pandas and numpy)
warnings.simplefilter("ignore")

pd.options.display.max_rows = 1000

# maintain directories well defined
PROJECT_ROOT_DIR = "."
ALL_DATA_DIR = "dat"
DATA_DIR = "novel-covid-data"
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR)

# global variables - desired columns from dataset
COLS = ["SNo", "ObservationDate", "Province/State", "Country/Region", "Confirmed", "Deaths"]

# function for initialization
def initialize_data(dataset, data_path=DATA_PATH, cols=COLS):
    csv_path = os.path.join(data_path, dataset)
    data = pd.read_csv(csv_path, usecols=cols)
    return data

# link to data - https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset#covid_19_data.csv
# initialize
data = initialize_data("covid_19_data.csv")
data

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Confirmed,Deaths
0,1,01/22/2020,Anhui,Mainland China,1.0,0.0
1,2,01/22/2020,Beijing,Mainland China,14.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,6.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1.0,0.0
4,5,01/22/2020,Gansu,Mainland China,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,26.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,2.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1.0,0.0
8,9,01/22/2020,Hainan,Mainland China,4.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1.0,0.0


In [33]:
# miscellaneous drops that need to be applied to the entire data 

indexes = data[data["Province/State"]=="Recovered"].index
data.drop(indexes, inplace = True)
data.dropna(subset=["Deaths"], inplace = True)

country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [i for i in labels if country_counts[i] == 1]
for i in singles:
    indexes = data[data["Country/Region"] == i].index
    data.drop(indexes, inplace = True)

covid_data = data

In [34]:
num_attributes = ["Confirmed", "SNo"]
cat_attributes = ["ObservationDate", "Province/State", "Country/Region"]

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipeline = Pipeline([
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot_encoder", OneHotEncoder(categories="auto")),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attributes),
    ("cat", cat_pipeline, cat_attributes),
    ])

covid_data_prepared = full_pipeline.fit_transform(covid_data)

In [40]:
covid_data_prepared[:, 0].toarray()

array([[-0.19239774],
       [-0.19141337],
       [-0.19201913],
       ...,
       [-0.19179197],
       [-0.17854085],
       [-0.09645965]])

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# unstratified split
train_set, test_set = train_test_split(covid_data_prepared, test_size=0.2, random_state=42)
covid_data_prepared = train_set

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
region_ix, country_ix = 2, 3

class CombineLocations(BaseEstimator, TransformerMixin):
    def __init__(self, add_combined_region = True): 
        self.add_combined_region = add_combined_region
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        pass

In [30]:
obsdate_ix, region_ix = 1, 6
class DaysObserved(BaseEstimator, TransformerMixin):
    def __init__(self, add_days_observed = True): 
        self.add_days_observed = add_days_observed
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        pass


In [32]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(covid_data_prepared, covid_labels)

ValueError: Found input variables with inconsistent numbers of samples: [13333, 16667]

In [9]:
from sklearn.metrics import r2_score

lr_covid_predict = lin_reg.predict(covid_data_prepared)
lin_score = r2_score(covid_labels, lr_covid_predict)
lin_score

0.8789928459753779

In [10]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(covid_data_prepared, covid_labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [11]:
tree_covid_predict = tree_reg.predict(covid_data_prepared)
tree_score = r2_score(covid_labels, tree_covid_predict)
tree_score

1.0

In [12]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(covid_data_prepared, covid_labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [13]:
forest_covid_predict = forest_reg.predict(covid_data_prepared)
forest_score = r2_score(covid_labels, forest_covid_predict)
forest_score

0.9980590568685137

In [14]:
from sklearn.model_selection import cross_val_score

# cross-validating - training n-1 folds of the training data and testing on the remaining one. repeat n times for 
# n accuracy scores. 

def display_scores(scores):
    print("Scores: ", scores)
    print("Means: ", scores.mean())
    print("STD: ", scores.std())

tree_scores = cross_val_score(tree_reg, covid_data_prepared, covid_labels, cv=7)
tree_rmse_scores = np.sqrt(tree_scores)    
lin_scores = cross_val_score(lin_reg, covid_data_prepared, covid_labels, cv=7)
lin_rmse_scores = np.sqrt(lin_scores)  
forest_scores = cross_val_score(forest_reg, covid_data_prepared, covid_labels, cv=7)
forest_rmse_scores = np.sqrt(forest_scores)  

In [15]:
display_scores(tree_scores)

Scores:  [0.98725878 0.94982652 0.98177574 0.97690388 0.99211433 0.91048072
 0.98876222]
Means:  0.9695888839490489
STD:  0.027447592450817338


In [16]:
display_scores(lin_scores)

Scores:  [0.8738655  0.88033685 0.89178611 0.32288963 0.88090384 0.8958425
 0.813329  ]
Means:  0.794136205385359
STD:  0.19406821583002826


In [17]:
display_scores(forest_scores)

Scores:  [0.98411536 0.96014643 0.9904315  0.92510573 0.98663898 0.98171722
 0.97581209]
Means:  0.9719953301119226
STD:  0.021210974916185187


In [18]:
from sklearn.model_selection import GridSearchCV

# sklearn will play with different combinations of hyperparameters to fine tune your model for you 

param_grid = [
{'n_estimators': [10, 30, 100], 'max_features': [2, 8, 16]},
{'bootstrap': [False], 'n_estimators': [30, 100], 'max_features': [8, 16]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=7)
grid_search.fit(covid_data_prepared, covid_labels)

GridSearchCV(cv=7, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [19]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 16, 'n_estimators': 30}

In [None]:
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_search = RandomizedSearchCV(estimator = forest_reg, param_distributions = random_grid, n_iter = 50, cv = 7, 
                                   verbose=2, random_state=42, n_jobs = -1)
random_search.fit(covid_data_prepared, covid_labels)

In [None]:
random_search.best_params_

In [None]:
random_search.best_estimator_

In [None]:
cvres = random_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

In [20]:
final_forest = RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [21]:
final_forest.fit(covid_data_prepared, covid_labels)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [10]:
strat_test_set.dropna(subset=["Deaths"], inplace = True)
covid_data_test = strat_test_set.drop("Deaths", axis = 1)
covid_test_labels = strat_test_set["Deaths"].copy()
covid_test_labels

12687        6.0
1012         0.0
15353        6.0
14896        8.0
12042        0.0
2383         1.0
14601      126.0
2559         0.0
9135         4.0
14067        5.0
4522         1.0
4777         0.0
7317      1556.0
778          0.0
11964       18.0
12321      136.0
7978         0.0
9070        15.0
5740         1.0
8774         0.0
3533         0.0
8138         0.0
12979       19.0
3624         0.0
9885       300.0
5705         0.0
16483      559.0
12448       15.0
7497         2.0
4669         0.0
5295         0.0
12705        0.0
6187        12.0
7538         0.0
13688        3.0
6743         6.0
2051         1.0
12769        1.0
16532        7.0
12719        0.0
13458       50.0
4223         0.0
7668         1.0
4927         0.0
12841       67.0
11210       78.0
11293        3.0
14407        1.0
12511     4698.0
2527         0.0
8418         4.0
12438        0.0
15247        1.0
12913        3.0
2743         0.0
652          0.0
5182         1.0
8287        43.0
11186        0

In [15]:
test_data_prepared = full_pipeline.transform(covid_data_test)
test_data_prepared

ValueError: Found unknown categories ['Wuhan Evacuee', 'Hudson County, NJ', 'Jefferson Parish, LA', 'Okaloosa County, FL', 'UK', 'Carver County, MN', 'Johnson County, IA'] in column 1 during transform

In [25]:
forest_final_predict = final_forest.predict(test_data_prepared)
forest_score = r2_score(covid_test_labels, forest_final_predict)
forest_score

ValueError: Number of features of the model must match the input. Model n_features is 592 and input n_features is 535 