# Load Libraries

In [1035]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from datetime import datetime,timedelta
import math
from sklearn.linear_model import LinearRegression
import keras
from keras.models import Sequential
from keras.layers import Dense
import os

In [1036]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Load dataset

In [1037]:
data_set = pd.read_csv('D:/ML/Project-COVID-19/train.csv')
data_set.tail()

Unnamed: 0,Territory X Date,target,cases,Territory,Date
13371,Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
13372,Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
13373,Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
13374,Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
13375,Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


In [1038]:
# checking for shape and size
print(data_set.shape)
print(data_set.size)

(13376, 5)
66880


In [1039]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13376 entries, 0 to 13375
Data columns (total 5 columns):
Territory X Date    13376 non-null object
target              13376 non-null int64
cases               13376 non-null int64
Territory           13376 non-null object
Date                13376 non-null object
dtypes: int64(2), object(3)
memory usage: 522.6+ KB


The dataset has three Categorical features and two Qualitative features

# Visualization

In [1040]:
# # Categorical data
# categorical_feature_mask = data_set.dtypes == object
# categorical_cols = data_set.columns[categorical_feature_mask].tolist()
# categorical_cols

In [1041]:
# Numerical/ continous data
numerical_feature_mask = data_set.dtypes != object
numerical_cols = data_set.columns[numerical_feature_mask].tolist()
numerical_cols

['target', 'cases']

In [1042]:
# Numerical data plotting 

In [1043]:
# sb.set(style='darkgrid')

In [1044]:
# sb.relplot(x = 'cases',y = 'target',data = training_set)

The plot of the relationship between the number of cases and the number of deaths is linear, ie the more the cases the more the number of fatal cases

In [1045]:
# Visualizing the distribution of the Fatalities

In [1046]:
# sb.distplot(training_set['target'])

In [1047]:
# sb.distplot(training_set['target'],kde = False,bins = 5)

In [1048]:
# sb.distplot(training_set['target'],hist = False)

In [1049]:
# sb.pairplot(training_set)

The number of deaths is uniformly distributed

In [1050]:
# Linear relationship

In [1051]:
# sb.regplot(x = 'cases',y = 'target',data = training_set)

# Data Wrangling/ Preprocessing

In [1052]:
data_set.tail()

Unnamed: 0,Territory X Date,target,cases,Territory,Date
13371,Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
13372,Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
13373,Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
13374,Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
13375,Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


In [1053]:
# checking for empty values
data_set.isnull().sum()

Territory X Date    0
target              0
cases               0
Territory           0
Date                0
dtype: int64

There are no missing values or observations in the dataset

In [1054]:
# Checking for duplicates
data_set.duplicated().value_counts()

False    13376
dtype: int64

There are no duplicates in the dataset

In [1055]:
# Setting the first feature as an index
data_set.set_index('Territory X Date',inplace = True)
data_set.tail()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zimbabwe X 3/21/20,0,3,Zimbabwe,3/21/20
Zimbabwe X 3/22/20,0,3,Zimbabwe,3/22/20
Zimbabwe X 3/23/20,1,3,Zimbabwe,3/23/20
Zimbabwe X 3/24/20,1,3,Zimbabwe,3/24/20
Zimbabwe X 3/25/20,1,3,Zimbabwe,3/25/20


Setting the 'Territory X Date' column as the index because it references the observation

In [1056]:
# Converting String data type to datetime using strptime and lambda function
data_set['Date'] = data_set['Date'].apply(lambda _: datetime.strptime(_,"%m/%d/%y"))
data_set.head()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,0,Afghanistan,2020-01-26


In [1057]:
# Splitting the dataset into Training and Testing set
testing_dates = []
start_date = data_set['Date'][-8]
end_date = data_set['Date'][-2]
delta = timedelta(days = 1)
while start_date <= end_date:
    start_date += delta
    testing_dates.append(start_date)
new = data_set['Date'].isin(testing_dates)
testing_data = data_set[new]
testing_data.head()
# testing_data[(testing_data['target'] != 0)]

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 3/19/20,0,22,Afghanistan,2020-03-19
Afghanistan X 3/20/20,0,24,Afghanistan,2020-03-20
Afghanistan X 3/21/20,0,24,Afghanistan,2020-03-21
Afghanistan X 3/22/20,1,40,Afghanistan,2020-03-22
Afghanistan X 3/23/20,1,40,Afghanistan,2020-03-23


The testing data is derived from the last 7 days of the data set, so after a week the testing data will change

In [1058]:
# Dropping the target variable from the testing set
features_test = testing_data.filter(['cases','Territory','Date'])

In [1059]:
features_test.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 3/19/20,22,Afghanistan,2020-03-19
Afghanistan X 3/20/20,24,Afghanistan,2020-03-20
Afghanistan X 3/21/20,24,Afghanistan,2020-03-21
Afghanistan X 3/22/20,40,Afghanistan,2020-03-22
Afghanistan X 3/23/20,40,Afghanistan,2020-03-23


In [1060]:
print(features_test.shape)

(1463, 3)


The filter in-built function filtered the features that will be used to test the model

In [1061]:
# Generating the Training Set
training_data = data_set[new == False]
training_data.head()

Unnamed: 0_level_0,target,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,0,Afghanistan,2020-01-26


In [1062]:
print(training_data.shape)

(11913, 4)


The training data is all data minus the data from the last week 

# Model Selection

In [1063]:
# Splitting Training data into Features and the target

In [1064]:
training_data = training_data[['cases','Territory','Date','target']]
training_data.head()

Unnamed: 0_level_0,cases,Territory,Date,target
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan X 1/22/20,0,Afghanistan,2020-01-22,0
Afghanistan X 1/23/20,0,Afghanistan,2020-01-23,0
Afghanistan X 1/24/20,0,Afghanistan,2020-01-24,0
Afghanistan X 1/25/20,0,Afghanistan,2020-01-25,0
Afghanistan X 1/26/20,0,Afghanistan,2020-01-26,0


Rearranging the columns so that the target can be at the far end

In [1065]:
features_train = training_data.iloc[:,:-1]
target_train = training_data.iloc[:,-1].values
features_train.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 1/22/20,0,Afghanistan,2020-01-22
Afghanistan X 1/23/20,0,Afghanistan,2020-01-23
Afghanistan X 1/24/20,0,Afghanistan,2020-01-24
Afghanistan X 1/25/20,0,Afghanistan,2020-01-25
Afghanistan X 1/26/20,0,Afghanistan,2020-01-26


The Features that will be used to train the model

In [1066]:
# target_train.head().to_frame()

In [1067]:
# Convert Timestamp to String datatype
features_train['Date'] = features_train['Date'].apply(lambda _: datetime.strftime(_,"%m/%d/%y"))
features_test['Date'] = features_test['Date'].apply(lambda _: datetime.strftime(_,"%m/%d/%y"))

In [1068]:
features_train.tail()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Zimbabwe X 3/14/20,0,Zimbabwe,03/14/20
Zimbabwe X 3/15/20,0,Zimbabwe,03/15/20
Zimbabwe X 3/16/20,0,Zimbabwe,03/16/20
Zimbabwe X 3/17/20,0,Zimbabwe,03/17/20
Zimbabwe X 3/18/20,0,Zimbabwe,03/18/20


In [1069]:
features_test.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 3/19/20,22,Afghanistan,03/19/20
Afghanistan X 3/20/20,24,Afghanistan,03/20/20
Afghanistan X 3/21/20,24,Afghanistan,03/21/20
Afghanistan X 3/22/20,40,Afghanistan,03/22/20
Afghanistan X 3/23/20,40,Afghanistan,03/23/20


Converting Timestamp dtype to String dtype to enable scaling of the date feature

In [1070]:
# Handling Categorical Features
# Categorical data
categorical_feature_mask = features_train.dtypes == object
categorical_cols = features_train.columns[categorical_feature_mask].tolist()
categorical_cols

['Territory', 'Date']

In [1071]:
label_encoder = LabelEncoder()
features_train[categorical_cols] = features_train[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))
features_test[categorical_cols] = features_test[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))
# features_train = label_encoder.fit_transform(features_train['Date'] )
# features_test = label_encoder.fit_transform(features_train['Date'] )
# one_hot_encoder = LabelBinarizer()
# features_train = one_hot_encoder.fit_transform(features_train['Territory'])
# features_test = one_hot_encoder.fit_transform(features_test['Territory'])

features_train.head()

Unnamed: 0_level_0,cases,Territory,Date
Territory X Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan X 1/22/20,0,0,0
Afghanistan X 1/23/20,0,0,1
Afghanistan X 1/24/20,0,0,2
Afghanistan X 1/25/20,0,0,3
Afghanistan X 1/26/20,0,0,4


In [1072]:
features_test.shape

(1463, 3)

The target/ label / number of deaths to be used to train the model

In [1073]:
target_train = target_train.reshape(-1,1)
print(target_train.shape)

(11913, 1)


In [1074]:
# # Feature Scaling using StandardScaler
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
target_train = scaler.fit_transform(target_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [1075]:
# features_train

In [1076]:
# features_test

In [1077]:
target_train

array([[-0.0702634],
       [-0.0702634],
       [-0.0702634],
       ...,
       [-0.0702634],
       [-0.0702634],
       [-0.0702634]])

StandardScaler scales the numerical variables to a mean of 0 and a standard daviation of 1

In [1078]:
# Create grid search

In [1079]:
# gridsearch = GridSearchCV(estimator = model, param_grid = {'max_depth':range(3,7),'n_estimators':(10,50)},cv = 10,scoring = 'neg_mean_squared_error',verbose = 0, n_jobs = -1)

# # Fit grid search
# best_model = gridsearch.fit(features_train, target_train)
# best_model.best_params_

# Model Evaluation

In [1080]:
# # Create cross-validation
# # model = LinearRegression()
# kf = KFold(n_splits=10, shuffle=True, random_state=1)
# # Conduct k-fold cross-validation
# cv_results = cross_val_score(model,# model
# features_train, # Feature matrix
# target_train, # Target vector # 
# scoring="neg_mean_absolute_error",
# cv = kf,# Loss function                           
# n_jobs=-1) # Use all CPU scores

In [1081]:
# cv_results

In [1082]:
# y_pred = model.predict(features_test).astype(int)
# y_pred

In [1083]:
# metrics.mean_absolute_error(target_train[:1463],y_pred)

In [1084]:
# metrics.r2_score(target_train[:1463],y_pred)

In [1085]:
# rmse = math.sqrt(metrics.mean_squared_error(target_train[:1463],y_pred))
# rmse

In [1086]:
# features_test.reset_index(inplace = True)

In [1087]:
# Territory_X_Date = features_test['Territory X Date']
# submission = pd.DataFrame({'Territory X Date': Territory_X_Date,'Target': y_pred})
# submission.head()

In [1088]:
# submission[(submission['Target'] != 0)]

# Artificial Neural Network

In [1089]:
# intitializing the ANN(Artificial Neural Netwotk)
classifier = Sequential()

In [1090]:
# adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 3, init = 'uniform', activation = 'relu', input_dim = 3))

  


In [1091]:
# adding the  second hidden layer
classifier.add(Dense(output_dim = 3, init = 'uniform', activation = 'relu'))

  


In [1092]:
# adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'relu'))

  


In [1093]:
#  Compiling the ANN
classifier.compile(optimizer= 'RMSprop', loss = 'mse', metrics= ['mse'])

In [1094]:
# Fitting ANN to the Training set
classifier.fit(features_train,target_train, batch_size= 10, epochs = 10,verbose=0)

<keras.callbacks.History at 0x17175837828>

In [1095]:
y_pred = classifier.predict(features_test)
y_pred

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [1096]:
metrics.mean_absolute_error(target_train[:1463],y_pred)

0.06955790790545581

In [1097]:
metrics.r2_score(target_train[:1463],y_pred)

-249.10740515622635

In [1098]:
rmse = math.sqrt(metrics.mean_squared_error(target_train[:1463],y_pred))
rmse

0.06967321732554804

In [1099]:
target_train

array([[-0.0702634],
       [-0.0702634],
       [-0.0702634],
       ...,
       [-0.0702634],
       [-0.0702634],
       [-0.0702634]])