In [1]:
# Essentials
import numpy as np
import pandas as pd

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Misc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc 
import missingno as msno

# Warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('/kaggle/input/cais-x-t1-2021/train.csv')
test_data = pd.read_csv('/kaggle/input/cais-x-t1-2021/test.csv')

In [3]:
train_data.shape

(2544, 11)

In [4]:
train_data.head()

Unnamed: 0,Id,# Confirmed_Cases,# Deaths,# Recovered,# Tested,Date,Lat,Long,Population,Province,Testing_Info
0,0,0,,0.0,,2020-02-12,53.9333,-116.5765,4428247.0,Alberta,
1,1,0,,0.0,,2020-02-13,53.9333,-116.5765,4428247.0,Alberta,
2,2,0,,0.0,,2020-02-14,53.9333,-116.5765,4428247.0,Alberta,
3,3,0,,0.0,,2020-02-15,53.9333,-116.5765,4428247.0,Alberta,
4,4,0,,0.0,,2020-02-16,53.9333,-116.5765,4428247.0,Alberta,


In [5]:
train_data.describe()

Unnamed: 0,Id,# Confirmed_Cases,# Deaths,# Recovered,# Tested,Lat,Long,Population
count,2544.0,2544.0,2037.0,2544.0,1946.0,2362.0,2362.0,2362.0
mean,1271.5,48.044418,4.432499,41.961478,2474.023638,54.548859,-92.804513,2923965.0
std,734.533866,147.650063,16.56979,479.256527,5494.004372,7.333226,26.188782,4215885.0
min,0.0,0.0,0.0,0.0,-31531.0,44.682,-135.0,39486.0
25%,635.75,0.0,0.0,0.0,17.0,51.2538,-116.5765,158717.0
50%,1271.5,0.0,0.0,0.0,319.0,53.1355,-86.798981,978274.0
75%,1907.25,12.0,0.0,6.0,1853.0,53.9333,-66.4619,4428247.0
max,2543.0,2203.0,202.0,23686.0,50378.0,70.453262,-57.6604,14745040.0


In [6]:
train_data.isnull().sum()

Id                      0
# Confirmed_Cases       0
# Deaths              507
# Recovered             0
# Tested              598
Date                    0
Lat                   182
Long                  182
Population            182
Province                0
Testing_Info         2536
dtype: int64

In [7]:
train_data.dtypes

Id                     int64
# Confirmed_Cases      int64
# Deaths             float64
# Recovered          float64
# Tested             float64
Date                  object
Lat                  float64
Long                 float64
Population           float64
Province              object
Testing_Info          object
dtype: object

The Date data type is of object and will be read as a string taking away from numerical order so must turn regular object to data object

In [8]:
# change Date format
train_data['Date'] = pd.to_datetime(train_data['Date'], format = '%Y-%m-%d')
test_data['Date'] = pd.to_datetime(train_data['Date'], format = '%Y-%m-%d')

In [9]:
train_data.dtypes

Id                            int64
# Confirmed_Cases             int64
# Deaths                    float64
# Recovered                 float64
# Tested                    float64
Date                 datetime64[ns]
Lat                         float64
Long                        float64
Population                  float64
Province                     object
Testing_Info                 object
dtype: object

Date is now transformed from datetime

In [10]:
train_data['Province'].value_counts()

Province
Alberta          182
BC               182
Manitoba         182
New Brunswick    182
NL               182
Nunavut          182
NWT              182
Ontario          182
PEI              182
Quebec           182
Repatriated      182
Yukon            182
Nova Scotia      180
Saskatchewan     180
Name: count, dtype: int64

# Preprocessing

- Standardization
- One Hot Encodng
- identify outliers(categorical data to numerical)
-->
- What must be learned on outside of this attempt

In [11]:
# work on null values
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)
train_data.isnull().sum()

Id                   0
# Confirmed_Cases    0
# Deaths             0
# Recovered          0
# Tested             0
Date                 0
Lat                  0
Long                 0
Population           0
Province             0
Testing_Info         0
dtype: int64

In [12]:
# Use day of the year for setting up Date column
train_data['Day'] = train_data['Date'].apply(lambda x: x.dayofyear)
test_data['Day'] = test_data['Date'].apply(lambda x: x.dayofyear)
train_data['Day']

0        43
1        44
2        45
3        46
4        47
       ... 
2539    220
2540    221
2541    222
2542    223
2543    224
Name: Day, Length: 2544, dtype: int64

Split data

In [13]:
import datetime
split_date = datetime.datetime(year = 2020, month = 3, day = 19)
print(split_date)

2020-03-19 00:00:00


In [14]:
split = train_data['Date'] < split_date
validation_set = train_data[split]
train_set = train_data[~split]

In [15]:
validation_set.tail()

Unnamed: 0,Id,# Confirmed_Cases,# Deaths,# Recovered,# Tested,Date,Lat,Long,Population,Province,Testing_Info,Day
2393,2393,0,0.0,0.0,0.0,2020-03-14,64.2823,-135.0,41293.0,Yukon,0,74
2394,2394,0,0.0,0.0,0.0,2020-03-15,64.2823,-135.0,41293.0,Yukon,0,75
2395,2395,0,0.0,0.0,0.0,2020-03-16,64.2823,-135.0,41293.0,Yukon,0,76
2396,2396,0,0.0,0.0,0.0,2020-03-17,64.2823,-135.0,41293.0,Yukon,0,77
2397,2397,0,0.0,0.0,101.0,2020-03-18,64.2823,-135.0,41293.0,Yukon,0,78


In [16]:
train_set.tail()

Unnamed: 0,Id,# Confirmed_Cases,# Deaths,# Recovered,# Tested,Date,Lat,Long,Population,Province,Testing_Info,Day
2539,2539,1,0.0,0.0,44.0,2020-08-07,64.2823,-135.0,41293.0,Yukon,0,220
2540,2540,0,0.0,0.0,0.0,2020-08-08,64.2823,-135.0,41293.0,Yukon,0,221
2541,2541,0,0.0,0.0,0.0,2020-08-09,64.2823,-135.0,41293.0,Yukon,0,222
2542,2542,0,0.0,0.0,125.0,2020-08-10,64.2823,-135.0,41293.0,Yukon,0,223
2543,2543,0,0.0,0.0,56.0,2020-08-11,64.2823,-135.0,41293.0,Yukon,0,224


# Using a Decision Tree Regressor Model

In [17]:
# important columns
training_col = ['Day', 'Population', '# Tested', 'Long', 'Lat']
target_col = ['# Deaths', '# Confirmed_Cases', '# Recovered']

In [18]:
from sklearn.tree import DecisionTreeRegressor,plot_tree
model = DecisionTreeRegressor()
model = model.fit(train_set[training_col], train_set[target_col])

In [19]:

validation_prediction = model.predict(validation_set[training_col])
train_prediction = model.predict(train_set[training_col])

test_prediction = model.predict(test_data[training_col])

# creating prediction for submission

In [20]:
column_names = ['ForcastId', '#Deaths', '# Confirmed_cases', '# Recovered']

In [21]:
sub_df = test_data
sub_df[['#Deaths', '# Confirmed_cases', '# Recovered']] = test_prediction
sub_df = sub_df[['#Deaths', '# Confirmed_cases', '# Recovered']]
sub_df['ForcastId'] = sub_df.index
sub_df = sub_df[column_names]

In [22]:
sub_df

Unnamed: 0,ForcastId,#Deaths,# Confirmed_cases,# Recovered
0,0,0.0,0.0,0.0
1,1,0.0,0.0,0.0
2,2,0.0,0.0,0.0
3,3,1.0,77.0,1.0
4,4,1.0,77.0,1.0
...,...,...,...,...
787,787,0.0,1.0,0.0
788,788,0.0,0.0,0.0
789,789,0.0,0.0,1.0
790,790,0.0,0.0,0.0


In [23]:
sub_df.to_csv('prediction.csv', index = False)