In [None]:
# imports 
import numpy as np
import os 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
import math

# Silence every warning for the rest of the session (pandas and numpy emit many of them).
warnings.simplefilter(action="ignore")

# Let pandas display up to 1000 rows before truncating a frame.
pd.set_option("display.max_rows", 1000)

# Directory layout: <project root>/<all data>/<this dataset>
PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR = ".", "dat", "novel-covid-data"
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, ALL_DATA_DIR, DATA_DIR)

# Columns that are read from the dataset CSV; any other column is skipped at load time.
COLS = [
    "SNo",
    "ObservationDate",
    "Province/State",
    "Country/Region",
    "Confirmed",
    "Deaths",
]

# function for initialization
def initialize_data(dataset, data_path=DATA_PATH, cols=COLS):
    """Read one CSV file of the dataset into a DataFrame.

    Parameters
    ----------
    dataset : str
        File name of the CSV; it is joined onto ``data_path``.
    data_path : str
        Directory that contains the file (defaults to ``DATA_PATH``).
    cols : list
        Columns to load, handed to ``pd.read_csv`` as ``usecols`` (defaults to ``COLS``).

    Returns
    -------
    The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(os.path.join(data_path, dataset), usecols=cols)

# Data source: https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset#covid_19_data.csv
# Load the observations file with the default directory and column selection.
data = initialize_data(dataset="covid_19_data.csv")
# Example of filtering a single country: data[data["Country/Region"] == "Mainland China"]

In [None]:
# Remove, in place, every row whose Province/State is the literal "Recovered".
recovered_mask = data["Province/State"] == "Recovered"
indexes = data.index[recovered_mask]
data.drop(index=indexes, inplace=True)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A stratified split needs at least two rows per class, so countries that occur
# exactly once are removed from `data` (in place) before splitting.
country_counts = data["Country/Region"].value_counts()
labels = data["Country/Region"].astype('category').cat.categories.tolist()
singles = [country for country in labels if country_counts[country] == 1]
indexes = data.index[data["Country/Region"].isin(singles)]
data.drop(index=indexes, inplace=True)

# split() yields *positional* row numbers (0 .. len(data) - 1). After rows have been
# dropped, the index labels of `data` have gaps, so the rows must be selected with
# .iloc. Using .loc would treat the positions as labels and either raise KeyError or
# pick the wrong rows.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(data, data["Country/Region"]))

# .copy() gives each subset its own data, so later in-place edits of a subset never
# act on a slice of `data`.
strat_train_set = data.iloc[train_index].copy()
strat_test_set = data.iloc[test_index].copy()

In [None]:
# Discard training rows that have no death count (this mutates strat_train_set),
# then separate the target column from the predictors.
strat_train_set.dropna(subset=["Deaths"], inplace=True)
covid_labels = strat_train_set["Deaths"].copy()
covid_data = strat_train_set.drop(columns="Deaths")
# One-column DataFrame built from the target series, kept under the name `labels`.
labels = pd.DataFrame(covid_labels)


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
region_ix, country_ix = 2, 3

class CombineLocations(BaseEstimator, TransformerMixin):
    """Append one text column that joins the province/state and country of each row.

    The two source columns are taken by position (2 and 3, i.e. "Province/State" and
    "Country/Region" in the column order of COLS). The new value is
    "<province>, <country>", or just "<country>" when the province is missing.

    Parameters
    ----------
    add_combined_region : bool, default True
        When False, no column is appended and the input is only converted to an
        object array.
    """

    # The positions are captured when the class is defined. A later cell rebinds the
    # module-level `region_ix`, and that must not change which columns are combined.
    _region_ix, _country_ix = region_ix, country_ix

    def __init__(self, add_combined_region = True):
        self.add_combined_region = add_combined_region

    def fit(self, X, y = None):
        """Learn nothing; return self so the transformer can be used in a Pipeline."""
        return self

    def transform(self, X, y = None):
        """Return X as a 2-D object array, with the combined column appended last.

        X must be 2-D (DataFrame or array) with at least four columns; y is ignored.
        """
        rows = np.asarray(X, dtype=object)
        if not self.add_combined_region:
            return rows
        provinces = rows[:, self._region_ix]
        countries = rows[:, self._country_ix]
        combined = np.array(
            [
                str(country) if pd.isna(province) else f"{province}, {country}"
                for province, country in zip(provinces, countries)
            ],
            dtype=object,
        )
        # np.c_ turns the 1-D `combined` into a column and stacks it to the right of `rows`.
        return np.c_[rows, combined]

In [5]:
# NOTE(review): this rebinds the module-level `region_ix` (set to 2 in the previous cell)
# to 6, so anything that reads the global after this cell runs sees 6 — confirm intended.
# Presumably both values are column positions (observation date, region) — TODO confirm.
obsdate_ix, region_ix = 1, 6
class DaysObserved(BaseEstimator, TransformerMixin):
    """Transformer skeleton: stores one flag, learns nothing, and does not transform yet."""
    def __init__(self, add_days_observed = True): 
        # Stored under the same name as the argument; nothing in this class reads it yet.
        self.add_days_observed = add_days_observed
    def fit(self, X, y = None):
        # Nothing is fitted; X and y are ignored.
        return self
    def transform(self, X, y = None):
        # NOTE(review): unimplemented — the body is only `pass`, so the call returns None
        # and neither `add_days_observed` nor the two indices above are used.
        # TODO: implement before putting this class into a Pipeline.
        pass


In [None]:
# Columns routed to the numeric and to the categorical branch of the preprocessing.
# NOTE(review): "SNo" looks like a row serial number — confirm it belongs among the features.
num_attributes = ["Confirmed", "SNo"]
cat_attributes = ["ObservationDate", "Province/State", "Country/Region"]

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: give missing values their own "Unknown" category, then one-hot
# encode. handle_unknown="ignore" keeps transform() from failing on categories that
# were not present when the encoder was fitted.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own columns and concatenate the results side by side.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attributes),
    ("cat", cat_pipeline, cat_attributes),
])

In [None]:
# Fit the preprocessing on the training predictors and transform them in one step.
covid_data_prepared = full_pipeline.fit_transform(covid_data)

In [None]:
# Show the prepared feature matrix as this cell's output.
covid_data_prepared

In [None]:
from sklearn.linear_model import LinearRegression




In [None]:
from sklearn.metrics import r2_score



In [None]:
from sklearn.tree import DecisionTreeRegressor



In [None]:
from sklearn.ensemble import RandomForestRegressor



In [None]:
from sklearn.model_selection import cross_val_score

# cross-validating - training n-1 folds of the training data and testing on the remaining one. repeat n times for 
# n accuracy scores. 

def display_scores(scores):
    """Print the scores, then their mean, then their standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (for example a NumPy array).
    Nothing is returned.
    """
    report = (
        ("Scores: ", lambda values: values),
        ("Means: ", lambda values: values.mean()),
        ("STD: ", lambda values: values.std()),
    )
    # Each statistic is computed right before it is printed, one line per entry.
    for caption, compute in report:
        print(caption, compute(scores))



In [None]:
# tree_scores was used here without ever being computed. Cross-validate a decision
# tree on the prepared training data (10 folds, the regressor's default R^2 score)
# so this cell also runs on a fresh kernel.
tree_scores = cross_val_score(DecisionTreeRegressor(random_state=42),
                              covid_data_prepared, covid_labels, cv=10)
display_scores(tree_scores)

In [None]:
# lin_scores was used here without ever being computed. Cross-validate a linear
# regression on the prepared training data (10 folds, the regressor's default R^2
# score) so this cell also runs on a fresh kernel.
lin_scores = cross_val_score(LinearRegression(),
                             covid_data_prepared, covid_labels, cv=10)
display_scores(lin_scores)

In [None]:
# forest_scores was used here without ever being computed. Cross-validate a random
# forest on the prepared training data (10 folds, the regressor's default R^2 score)
# so this cell also runs on a fresh kernel. n_jobs=-1 runs the folds in parallel.
forest_scores = cross_val_score(RandomForestRegressor(random_state=42),
                                covid_data_prepared, covid_labels, cv=10, n_jobs=-1)
display_scores(forest_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# GridSearchCV cross-validates every hyperparameter combination listed in the grid
# and keeps the best-scoring one.
param_grid = [
{'n_estimators': [10, 30, 100], 'max_features': [2, 8, 16]},
{'bootstrap': [False], 'n_estimators': [30, 100], 'max_features': [8, 16]},
]

# The following cells read grid_search, which was never created. Build and fit it
# here (5 folds, the regressor's default R^2 score, folds evaluated in parallel).
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                           cv=5, n_jobs=-1)
grid_search.fit(covid_data_prepared, covid_labels)



In [None]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Show the estimator the grid search reports as best.
grid_search.best_estimator_

In [None]:
# Print the mean cross-validated test score next to each parameter combination tried.
cvres = grid_search.cv_results_
for params, score in zip(cvres["params"], cvres["mean_test_score"]):
    print(score, params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features": that is what 'auto' stood for with a forest regressor, and
# unlike 'auto' it is still accepted by scikit-learn releases that dropped the alias.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# `forest_reg` is not defined anywhere in the notebook, so a fresh forest is passed
# instead. The search clones its estimator for every fit, so only the estimator's
# hyperparameters matter, not whether it was trained before.
random_search = RandomizedSearchCV(estimator = RandomForestRegressor(random_state=42),
                                   param_distributions = random_grid, n_iter = 50, cv = 7,
                                   verbose=2, random_state=42, n_jobs = -1)
random_search.fit(covid_data_prepared, covid_labels)

In [None]:
# Show the hyperparameter combination selected by the randomized search.
random_search.best_params_

In [None]:
# Show the estimator the randomized search reports as best.
random_search.best_estimator_

In [None]:
# Print the mean cross-validated test score next to each sampled parameter combination.
cvres = random_search.cv_results_
for params, score in zip(cvres["params"], cvres["mean_test_score"]):
    print(score, params)