# Load Libraries

In [162]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from datetime import datetime,timedelta
import math
from sklearn.linear_model import LinearRegression

# Load dataset

In [163]:
data_set = pd.read_csv('D:/ML/Project-COVID-19/train.csv')
data_set.tail()

Unnamed: 0,Territory X Date,target,cases,Territory,Date
13371,Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
13372,Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
13373,Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
13374,Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
13375,Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


In [164]:
# checking for shape and size
print(data_set.shape)
print(data_set.size)

(13376, 5)
66880


In [165]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13376 entries, 0 to 13375
Data columns (total 5 columns):
Territory X Date    13376 non-null object
target              13376 non-null int64
cases               13376 non-null int64
Territory           13376 non-null object
Date                13376 non-null object
dtypes: int64(2), object(3)
memory usage: 522.6+ KB


The dataset has three Categorical features and two Qualitative features

# Visualization

In [166]:
# # Categorical data
# categorical_feature_mask = data_set.dtypes == object
# categorical_cols = data_set.columns[categorical_feature_mask].tolist()
# categorical_cols

In [167]:
# Numerical/ continous data
numerical_feature_mask = data_set.dtypes != object
numerical_cols = data_set.columns[numerical_feature_mask].tolist()
numerical_cols

['target', 'cases']

In [168]:
# Numerical data plotting 

In [169]:
# sb.set(style='darkgrid')

In [170]:
# sb.relplot(x = 'cases',y = 'target',data = training_set)

The plot of the relationship between the number of cases and the number of deaths is linear, ie the more the cases the more the number of fatal cases

In [171]:
# Visualizing the distribution of the Fatalities

In [172]:
# sb.distplot(training_set['target'])

In [173]:
# sb.distplot(training_set['target'],kde = False,bins = 5)

In [174]:
# sb.distplot(training_set['target'],hist = False)

In [175]:
# sb.pairplot(training_set)

The number of deaths is uniformly distributed

In [176]:
# Linear relationship

In [177]:
# sb.regplot(x = 'cases',y = 'target',data = training_set)

# Data Wrangling/ Preprocessing

In [178]:
data_set.tail()

Unnamed: 0,Territory X Date,target,cases,Territory,Date
13371,Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
13372,Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
13373,Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
13374,Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
13375,Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


In [179]:
# checking for empty values
data_set.isnull().sum()

Territory X Date    0
target              0
cases               0
Territory           0
Date                0
dtype: int64

There are no missing values or observations in the dataset

In [180]:
# Checking for duplicates
data_set.duplicated().value_counts()

False    13376
dtype: int64

There are no duplicates in the dataset

In [181]:
# Setting the first feature as an index
data_set.set_index('Territory X Date',inplace = True)
data_set.tail()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


Setting the 'Territory X Date' column as the index because it references the observation

In [182]:
# Converting String data type to datetime using strptime and lambda function
data_set['Date'] = data_set['Date'].apply(lambda _: datetime.strptime(_,"%m/%d/%y"))
data_set.head()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,0,Afghanistan,2020-01-26


In [183]:
# Splitting the dataset into Training and Testing set
testing_dates = []
start_date = data_set['Date'][-8]
end_date = data_set['Date'][-2]
delta = timedelta(days = 1)
while start_date <= end_date:
    start_date += delta
    testing_dates.append(start_date)
new = data_set['Date'].isin(testing_dates)
testing_data = data_set[new]
testing_data.head()
# testing_data[(testing_data['target'] != 0)]

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 3/19/20,0,22,Afghanistan,2020-03-19
Afghanistan X 3/20/20,0,24,Afghanistan,2020-03-20
Afghanistan X 3/21/20,0,24,Afghanistan,2020-03-21
Afghanistan X 3/22/20,1,40,Afghanistan,2020-03-22
Afghanistan X 3/23/20,1,40,Afghanistan,2020-03-23


The testing data is derived from the last 7 days of the data set, so after a week the testing data will change

In [184]:
# Dropping the target variable from the testing set
features_test = testing_data.filter(['cases','Territory','Date'])

In [185]:
features_test.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 3/19/20,22,Afghanistan,2020-03-19
Afghanistan X 3/20/20,24,Afghanistan,2020-03-20
Afghanistan X 3/21/20,24,Afghanistan,2020-03-21
Afghanistan X 3/22/20,40,Afghanistan,2020-03-22
Afghanistan X 3/23/20,40,Afghanistan,2020-03-23


In [186]:
print(features_test.shape)

(1463, 3)


The filter in-built function filtered the features that will be used to test the model

In [187]:
# Generating the Training Set
training_data = data_set[new == False]
training_data.head()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,0,Afghanistan,2020-01-26


In [188]:
print(training_data.shape)

(11913, 4)


The training data is all data minus the data from the last week 

# Model Selection

In [189]:
# Splitting Training data into Features and the target

In [190]:
training_data = training_data[['cases','Territory','Date','target']]
training_data.head()

Unnamed: 0_level_0,cases,Territory,Date,target
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,Afghanistan,2020-01-22,0
Afghanistan X 1/23/20,0,Afghanistan,2020-01-23,0
Afghanistan X 1/24/20,0,Afghanistan,2020-01-24,0
Afghanistan X 1/25/20,0,Afghanistan,2020-01-25,0
Afghanistan X 1/26/20,0,Afghanistan,2020-01-26,0


Rearranging the columns so that the target can be at the far end

In [191]:
features_train = training_data.iloc[:,:-1]
target_train = training_data.iloc[:,-1].values
features_train.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 1/22/20,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,Afghanistan,2020-01-26


The Features that will be used to train the model

In [192]:
# target_train.head().to_frame()

In [193]:
# Convert Timestamp to String datatype
features_train['Date'] = features_train['Date'].apply(lambda _: datetime.strftime(_,"%m/%d/%y"))
features_test['Date'] = features_test['Date'].apply(lambda _: datetime.strftime(_,"%m/%d/%y"))

In [194]:
features_train.tail()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Zimbabwe X 3/14/20,0,Zimbabwe,03/14/20
Zimbabwe X 3/15/20,0,Zimbabwe,03/15/20
Zimbabwe X 3/16/20,0,Zimbabwe,03/16/20
Zimbabwe X 3/17/20,0,Zimbabwe,03/17/20
Zimbabwe X 3/18/20,0,Zimbabwe,03/18/20


In [195]:
features_test.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 3/19/20,22,Afghanistan,03/19/20
Afghanistan X 3/20/20,24,Afghanistan,03/20/20
Afghanistan X 3/21/20,24,Afghanistan,03/21/20
Afghanistan X 3/22/20,40,Afghanistan,03/22/20
Afghanistan X 3/23/20,40,Afghanistan,03/23/20


Converting Timestamp dtype to String dtype to enable scaling of the date feature

In [196]:
# Handling Categorical Features
# Categorical data
categorical_feature_mask = features_train.dtypes == object
categorical_cols = features_train.columns[categorical_feature_mask].tolist()
categorical_cols

['Territory', 'Date']

In [197]:
label_encoder = LabelEncoder()
features_train[categorical_cols] = features_train[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))
features_test[categorical_cols] = features_test[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))
# features_train = label_encoder.fit_transform(features_train['Date'] )
# features_test = label_encoder.fit_transform(features_train['Date'] )
# one_hot_encoder = LabelBinarizer()
# features_train = one_hot_encoder.fit_transform(features_train['Territory'])
# features_test = one_hot_encoder.fit_transform(features_test['Territory'])

features_train.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 1/22/20,0,0,0
Afghanistan X 1/23/20,0,0,1
Afghanistan X 1/24/20,0,0,2
Afghanistan X 1/25/20,0,0,3
Afghanistan X 1/26/20,0,0,4


In [198]:
features_test.shape

(1463, 3)

The target/ label / number of deaths to be used to train the model

In [199]:
target_train = target_train.reshape(-1,1)
print(target_train.shape)

(11913, 1)


In [200]:
# Feature Scaling using StandardScaler
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
target_train = scaler.fit_transform(target_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [201]:
features_train

array([[-0.07636999, -1.72378321, -1.70192589],
       [-0.07636999, -1.72378321, -1.64114282],
       [-0.07636999, -1.72378321, -1.58035975],
       ...,
       [-0.07636999,  1.72378321,  1.58035975],
       [-0.07636999,  1.72378321,  1.64114282],
       [-0.07636999,  1.72378321,  1.70192589]])

In [202]:
features_test

array([[-0.07138832, -1.72378321, -1.70192589],
       [-0.07093544, -1.72378321, -1.64114282],
       [-0.07093544, -1.72378321, -1.58035975],
       ...,
       [-0.07569067,  1.72378321, -1.45879362],
       [-0.07569067,  1.72378321, -1.39801055],
       [-0.07569067,  1.72378321, -1.33722748]])

In [203]:
target_train

array([[-0.0702634],
       [-0.0702634],
       [-0.0702634],
       ...,
       [-0.0702634],
       [-0.0702634],
       [-0.0702634]])

StandardScaler scales the numerical variables to a mean of 0 and a standard daviation of 1

In [204]:
# create Random Forest Regressor
model = RandomForestRegressor()
model.fit(features_train,target_train)
# # create Linear Regressor
# model = LinearRegression()
# model.fit(features_train,target_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [205]:
# Create grid search

In [206]:
# gridsearch = GridSearchCV(estimator = model, param_grid = {'max_depth':range(3,7),'n_estimators':(10,50)},cv = 10,scoring = 'neg_mean_squared_error',verbose = 0, n_jobs = -1)

# # Fit grid search
# best_model = gridsearch.fit(features_train, target_train)
# best_model.best_params_

# Model Evaluation

In [207]:
# Create cross-validation
# model = LinearRegression()
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# Conduct k-fold cross-validation
cv_results = cross_val_score(model,# model
features_train, # Feature matrix
target_train, # Target vector # 
scoring="neg_mean_absolute_error",
cv = kf,# Loss function                           
n_jobs=-1) # Use all CPU scores

In [208]:
cv_results

array([-0.00554474, -0.00243841, -0.00346046, -0.00458286, -0.00382068,
       -0.01013423, -0.00661789, -0.00918875, -0.00719675, -0.00540362])

In [209]:
y_pred = model.predict(features_test)
y_pred

array([-0.06963551, -0.06586816, -0.06586816, ..., -0.0702634 ,
       -0.0702634 , -0.0702634 ])

In [210]:
metrics.mean_absolute_error(target_train[:1463],y_pred)

0.2626185850183685

In [211]:
metrics.r2_score(target_train[:1463],y_pred)

-90738.41701489603

In [212]:
rmse = math.sqrt(metrics.mean_squared_error(target_train[:1463],y_pred))
rmse

1.3270906295414728

In [213]:
# features_test.reset_index(inplace = True)

In [214]:
# Territory_X_Date = features_test['Territory X Date']
# submission = pd.DataFrame({'Territory X Date': Territory_X_Date,'Target': y_pred})
# submission.head()

In [215]:
# submission[(submission['Target'] != 0)]

In [216]:
model.score(features_test,y_pred)

1.0

In [217]:
target_train

array([[-0.0702634],
       [-0.0702634],
       [-0.0702634],
       ...,
       [-0.0702634],
       [-0.0702634],
       [-0.0702634]])