# Covid 19 - predictions for Active cases

### Loading libraries

In [1]:
import pandas as pd
import pandas as pd 
import random

In [2]:
import math
import time
import datetime
import operator 
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

In [3]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
plt.style.use('seaborn-dark')
#print(plt.style.available)
%matplotlib inline

from sklearn import metrics

In [5]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV

In [6]:
import warnings
warnings.filterwarnings("ignore")

### Preparation for predictions

#### Data Loading

In [39]:
#dataset
covid_df= pd.read_csv("corona_pred_china.csv")
covid_df.head()

Unnamed: 0,ObservationDate,Country,Continent,Confirmed,Deaths,Recovered,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,GDP ($ per capita)
0,2020-01-22,Hong Kong,Asia,0,0,0,0,6940432,1092,63557,524,28800
1,2020-01-22,Japan,Asia,2,0,0,2,127463611,377835,3374,0,28200
2,2020-01-22,Macao,Asia,1,0,0,1,453125,28,161830,486,19400
3,2020-01-22,South Korea,Asia,1,0,0,1,48846823,98480,4960,0,17800
4,2020-01-22,Taiwan,Asia,1,0,0,1,23036087,35980,6403,0,23400


#### Data transformations

In [40]:
#Converting "Observation Date" into Datetime format
covid_df["ObservationDate"]=pd.to_datetime(covid_df["ObservationDate"])
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3811 entries, 0 to 3810
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   ObservationDate             3811 non-null   datetime64[ns]
 1   Country                     3811 non-null   object        
 2   Continent                   3811 non-null   object        
 3   Confirmed                   3811 non-null   float64       
 4   Deaths                      3811 non-null   float64       
 5   Recovered                   3811 non-null   float64       
 6   Active Cases                3811 non-null   float64       
 7   Population                  3811 non-null   float64       
 8   Area (sq. mi.)              3811 non-null   float64       
 9   Pop. Density (per sq. mi.)  3811 non-null   object        
 10  Net migration               3811 non-null   object        
 11  GDP ($ per capita)          3811 non-null   float64     

In [41]:
# Changing datatype of Population density and Net Migration columns as float
cols=["Pop. Density (per sq. mi.)","Net migration"]                             
for i in cols:
    covid_df[i]=covid_df[i].str.replace(",","").astype(float)
covid_df.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3811 entries, 0 to 3810
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   ObservationDate             3811 non-null   datetime64[ns]
 1   Country                     3811 non-null   object        
 2   Continent                   3811 non-null   object        
 3   Confirmed                   3811 non-null   float64       
 4   Deaths                      3811 non-null   float64       
 5   Recovered                   3811 non-null   float64       
 6   Active Cases                3811 non-null   float64       
 7   Population                  3811 non-null   float64       
 8   Area (sq. mi.)              3811 non-null   float64       
 9   Pop. Density (per sq. mi.)  3811 non-null   float64       
 10  Net migration               3811 non-null   float64       
 11  GDP ($ per capita)          3811 non-null   float64     

In [42]:
covid_df.head()

Unnamed: 0,ObservationDate,Country,Continent,Confirmed,Deaths,Recovered,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,GDP ($ per capita)
0,2020-01-22,Hong Kong,Asia,0,0,0,0,6940432,1092,63557,524,28800
1,2020-01-22,Japan,Asia,2,0,0,2,127463611,377835,3374,0,28200
2,2020-01-22,Macao,Asia,1,0,0,1,453125,28,161830,486,19400
3,2020-01-22,South Korea,Asia,1,0,0,1,48846823,98480,4960,0,17800
4,2020-01-22,Taiwan,Asia,1,0,0,1,23036087,35980,6403,0,23400


In [43]:
# Sorting based on the country names
covid_df = covid_df.sort_values(["Country", "ObservationDate"]).reset_index(drop=True)

In [44]:
covid_df.shape

(3811, 12)

In [45]:
# Calculating days since the first occurance for each country
covid_df['days_since_first']=0
covid_df['previous_Active'] = 0
covid_df['active_rate'] = 0
previous_Active=0
confirm_difference=0

for i in covid_df.index[1:]:
    if covid_df['Country'][i]==covid_df['Country'][i-1]:
        covid_df.loc[i,'days_since_first']=(covid_df.at[i-1,'days_since_first']+1)
        previous_Active=covid_df.at[i-1,'Active Cases']
        confirm_difference= covid_df.at[i,'Active Cases']-previous_Active
        covid_df.loc[i,'previous_Active']=previous_Active        
        if ((previous_Active!=0) & (confirm_difference>0) ):
            covid_df.loc[i,'active_rate']=((confirm_difference/previous_Active))       
        else:
            covid_df.loc[i,'active_rate']=0
    else:
        covid_df.loc[i,'days_since_first'] = 0
        covid_df.loc[i,'previous_Active'] = 0
        covid_df.loc[i,'active_rate'] = 0  

In [46]:
# Ensuring that previous Active Cases column type is int
covid_df['previous_Active'].astype(int)
covid_df=covid_df[covid_df['active_rate']>=0]

In [47]:
covid_df.set_index('Country',inplace=True)

#### Check for highest number of Active Cases cases in Europe

In [48]:
covid_df_Europe = covid_df[covid_df['Continent'] == "Europe"].groupby('Country').agg(
                {
                     'Active Cases':sum,    # Sum duration per group
                     'days_since_first': max,  # get the max of days
                }).sort_values(["Active Cases", "days_since_first"]).tail()
covid_df_Europe.sort_values(["Active Cases"], ascending=False)

Unnamed: 0_level_0,Active Cases,days_since_first
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Italy,360932,51
Spain,138294,50
Germany,134143,54
France,96474,58
Switzerland,38533,26


In [49]:
covid_df.columns

Index(['ObservationDate', 'Continent', 'Confirmed', 'Deaths', 'Recovered',
       'Active Cases', 'Population', 'Area (sq. mi.)',
       'Pop. Density (per sq. mi.)', 'Net migration', 'GDP ($ per capita)',
       'days_since_first', 'previous_Active', 'active_rate'],
      dtype='object')

### Random Forest Regressor

In [50]:
X=covid_df.copy()
del X['Continent']
del X['ObservationDate']
del X['Confirmed']
del X['Deaths']
del X['Recovered']
del X['GDP ($ per capita)']
#del X['previous_Activecases']
#del X['previous_active']
#del X['previous_Active']



In [51]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3811 entries, Afghanistan to Zimbabwe
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Active Cases                3811 non-null   float64
 1   Population                  3811 non-null   float64
 2   Area (sq. mi.)              3811 non-null   float64
 3   Pop. Density (per sq. mi.)  3811 non-null   float64
 4   Net migration               3811 non-null   float64
 5   days_since_first            3811 non-null   int64  
 6   previous_Active             3811 non-null   float64
 7   active_rate                 3811 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 268.0+ KB


In [52]:
X["Net migration"].fillna(0, inplace=True)
X["Pop. Density (per sq. mi.)"].fillna(0, inplace=True)

In [53]:
X.isnull().any()

Active Cases                  False
Population                    False
Area (sq. mi.)                False
Pop. Density (per sq. mi.)    False
Net migration                 False
days_since_first              False
previous_Active               False
active_rate                   False
dtype: bool

In [54]:
corr = X.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,days_since_first,previous_Active,active_rate
Active Cases,1.0,0.03,0.02,-0.03,0.02,0.29,1.0,-0.02
Population,0.03,1.0,0.3,-0.07,-0.06,0.15,0.03,-0.01
Area (sq. mi.),0.02,0.3,1.0,-0.1,0.08,0.16,0.01,-0.02
Pop. Density (per sq. mi.),-0.03,-0.07,-0.1,1.0,0.29,0.14,-0.03,-0.04
Net migration,0.02,-0.06,0.08,0.29,1.0,0.13,0.01,-0.01
days_since_first,0.29,0.15,0.16,0.14,0.13,1.0,0.29,-0.04
previous_Active,1.0,0.03,0.01,-0.03,0.01,0.29,1.0,-0.02
active_rate,-0.02,-0.01,-0.02,-0.04,-0.01,-0.04,-0.02,1.0


In [55]:
# Pick only features that are correlated more than 0.2
y = X['Active Cases']     

In [56]:
print(X.shape)
X.head()

(3811, 8)


Unnamed: 0_level_0,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,days_since_first,previous_Active,active_rate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,1,31056997,647500,480,2306,0,0,0
Afghanistan,1,31056997,647500,480,2306,1,1,0
Afghanistan,1,31056997,647500,480,2306,2,1,0
Afghanistan,1,31056997,647500,480,2306,3,1,0
Afghanistan,1,31056997,647500,480,2306,4,1,0


In [57]:
# To get rid of +e format
pd.options.display.float_format = '{:20,.0f}'.format

#### Train Test Split

In [58]:
X.drop(['Active Cases'], axis=1, inplace=True)


# Break off validation set from training data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, test_size=0.15, random_state=1)

In [59]:
print(f"X_train shape is {X_train.shape}")
print(f"X_test shape is {X_test.shape}")
# print(f"X_validate shape is {X_validate.shape}")
print(f"y_train is {y_train.shape}")
# print(f"y_validate shape is {y_validate.shape}")
print(f"y_test shape is {y_test.shape}")

X_train shape is (3239, 7)
X_test shape is (572, 7)
y_train is (3239,)
y_test shape is (572,)


## Random Forest

#### Defining the model and checking the best score

In [60]:
#DEFINE YOUR REGRESSOR and THE PARAMETERS GRID

regressor = RandomForestRegressor()
parameters = {"criterion": ['mse'], 
              "n_estimators":[10, 100, 400], 
              "min_samples_leaf": [0.1, 1, 5, 10, 20, 50], 
              "random_state" : [1]
            #  "max_features": "auto"
            # "oob_score ": TRUE
             }

In [61]:
#DEFINE YOUR GRIDSEARCH 
gs = GridSearchCV(regressor, parameters, cv=3) #with no params it reduces to a CV

gs = gs.fit(X_train.to_numpy(),y_train.to_numpy())

In [62]:
#summarize the results of your GRIDSEARCH
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']


***GRIDSEARCH RESULTS***
Best score: 0.985168 using {'criterion': 'mse', 'min_samples_leaf': 1, 'n_estimators': 400, 'random_state': 1}


In [63]:
columns = np.asarray(X.columns)
#columns = np.asarray([columns])
from sklearn import tree
r = tree.export_text(gs.best_estimator_.estimators_[0],feature_names=columns.tolist(), max_depth=3)
print(r)

|--- previous_Active <= 7533.00
|   |--- previous_Active <= 2507.00
|   |   |--- previous_Active <= 571.00
|   |   |   |--- previous_Active <= 177.50
|   |   |   |   |--- truncated branch of depth 20
|   |   |   |--- previous_Active >  177.50
|   |   |   |   |--- truncated branch of depth 13
|   |   |--- previous_Active >  571.00
|   |   |   |--- previous_Active <= 1128.50
|   |   |   |   |--- truncated branch of depth 11
|   |   |   |--- previous_Active >  1128.50
|   |   |   |   |--- truncated branch of depth 10
|   |--- previous_Active >  2507.00
|   |   |--- previous_Active <= 4983.50
|   |   |   |--- previous_Active <= 3746.00
|   |   |   |   |--- truncated branch of depth 5
|   |   |   |--- previous_Active >  3746.00
|   |   |   |   |--- truncated branch of depth 4
|   |   |--- previous_Active >  4983.50
|   |   |   |--- previous_Active <= 5641.50
|   |   |   |   |--- truncated branch of depth 3
|   |   |   |--- previous_Active >  5641.50
|   |   |   |   |--- truncated branch of 

####  Training the model

In [64]:
#we establish the model with the best estimator according to Gridsearch results
gs = gs.best_estimator_ 

gs = gs.fit(X_train.to_numpy(),y_train.to_numpy())

In [65]:
#Returns the coefficient of determination R^2 of the prediction.
#Explained variance score: 1 is perfect prediction
gs.score(X_test.to_numpy(), y_test.to_numpy())

0.9954757370199203

#### Testing the model

In [66]:
y_pred = gs.predict(X_test.to_numpy())

In [67]:
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred}).reset_index(drop=True)
df

Unnamed: 0,Actual,Predicted
0,127,131
1,1,1
2,108,94
3,0,0
4,9,9
...,...,...
567,1,1
568,53,56
569,4,4
570,0,0


In [68]:
# It is not giving correct results yet. Need to check
df=df[df["Actual"] != df["Predicted"]]
df

Unnamed: 0,Actual,Predicted
0,127,131
2,108,94
4,9,9
5,186,155
6,19,18
...,...,...
566,531,540
567,1,1
568,53,56
570,0,0


#### Evaluation results

In [69]:
print("MAE train: ", metrics.mean_absolute_error(y_train.to_numpy(), gs.predict(X_train.to_numpy()))) 
print("MSE train: ",metrics.mean_squared_error(y_train.to_numpy(), gs.predict(X_train.to_numpy())))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train.to_numpy(), gs.predict(X_train.to_numpy()))))
print("r2: ",np.sqrt(metrics.r2_score(y_train.to_numpy(), gs.predict(X_train.to_numpy()))))

print("MAE test: ", metrics.mean_absolute_error(y_test.to_numpy(), gs.predict(X_test.to_numpy()))) 
print("MSE test: ",metrics.mean_squared_error(y_test.to_numpy(), gs.predict(X_test.to_numpy())))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test.to_numpy(), gs.predict(X_test.to_numpy()))))
print("r2: ",np.sqrt(metrics.r2_score(y_test.to_numpy(), gs.predict(X_test.to_numpy()))))

MAE train:  11.839063715554454
MSE train:  11072.336858862325
RMSE train:  105.225172173118
r2:  0.999056209294852
MAE test:  11.898483671867899
MSE test:  4714.946679871738
RMSE test:  68.66546934137811
r2:  0.9977353040861691


## Random Forest Regression -  For different countries

### Using Kaggle dataset to cross check the results of future prediction 

In [70]:
covid_df_kaggle= pd.read_csv("covid_19_data_Kaggle.csv")

In [71]:
covid_df_kaggle["ObservationDate"]=pd.to_datetime(covid_df_kaggle["ObservationDate"])

In [72]:
covid_df_kaggle.shape

(13850, 8)

### Random Forest Regression - Italy

In [73]:
covid_df_kaggle_Italy = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Italy"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [74]:
covid_df_kaggle_Italy["Active Cases"] = covid_df_kaggle_Italy['Confirmed']-(covid_df_kaggle_Italy['Recovered'] + covid_df_kaggle_Italy['Deaths'])
covid_df_kaggle_Italy

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths,Active Cases
0,2020-01-31,2,0,0,2
1,2020-02-01,2,0,0,2
2,2020-02-02,2,0,0,2
3,2020-02-03,2,0,0,2
4,2020-02-04,2,0,0,2
...,...,...,...,...,...
66,2020-04-06,132547,22837,16523,93187
67,2020-04-07,135586,24392,17127,94067
68,2020-04-08,139422,26491,17669,95262
69,2020-04-09,143626,28470,18279,96877


In [75]:
filter_condn = (covid_df_kaggle_Italy['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Italy['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Italy = covid_df_kaggle_Italy[filter_condn].reset_index()
covid_df_kaggle_Italy

X_Italy_df = X[X.index == "Italy"].copy()
X_Italy_test = X_Italy_df.head().copy()
X_Italy_test=X_Italy_test.reset_index(drop=True)
X_Italy_test['days_since_first'] = X_Italy_test['previous_Active'] = X_Italy_test['active_rate'] = 0
X_Italy_test.loc[0,'days_since_first'] =  X_Italy_df["days_since_first"].iloc[-1]+1
X_Italy_test.loc[0,'active_rate']=X_Italy_df["active_rate"].iloc[-1]+1
X_Italy_test.loc[0,'previous_Active'] = y[y.index == "Italy"].max()

for i in range(0,5):
    X_Italy_test.loc[i,'days_since_first'] = X_Italy_test.loc[0,'days_since_first'] + i

X_Italy_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Italy_test.iloc[i].to_numpy()]))
    X_Italy_test.loc[i+1,'previous_Active']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Italy_test['predicted_infections']=pred.iloc[:, 0] 
X_Italy_test=X_Italy_test[{'predicted_infections'}]
# Creating new column to have actual predictions and date from Kaggle dataset
X_Italy_test["Actual infections"] = covid_df_kaggle_Italy["Active Cases"]
X_Italy_test["Date"] = covid_df_kaggle_Italy["ObservationDate"]
X_Italy_test = X_Italy_test.iloc[0:5]
X_Italy_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,41479,46638,2020-03-22
1,44216,50418,2020-03-23
2,44216,54030,2020-03-24
3,44215,57521,2020-03-25
4,44120,62013,2020-03-26


### Random Forest Regression - France

In [76]:
covid_df_kaggle_France = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "France"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()
covid_df_kaggle_France["Active Cases"] = covid_df_kaggle_France['Confirmed']-(covid_df_kaggle_France['Recovered'] + covid_df_kaggle_France['Deaths'])
covid_df_kaggle_France

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths,Active Cases
0,2020-01-24,2,0,0,2
1,2020-01-25,3,0,0,3
2,2020-01-26,3,0,0,3
3,2020-01-27,3,0,0,3
4,2020-01-28,4,0,0,4
...,...,...,...,...,...
73,2020-04-06,98963,17428,8926,72609
74,2020-04-07,110065,19523,10343,80199
75,2020-04-08,113959,21452,10887,81620
76,2020-04-09,118781,23413,12228,83140


In [77]:
filter_condn = (covid_df_kaggle_France['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_France['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_France = covid_df_kaggle_France[filter_condn].reset_index()
covid_df_kaggle_France

X_France_df = X[X.index == "France"].copy()
X_France_test = X_France_df.head().copy()
X_France_test=X_France_test.reset_index(drop=True)
X_France_test['days_since_first'] = X_France_test['previous_Active'] = X_France_test['active_rate'] = 0
X_France_test.loc[0,'days_since_first'] =  X_France_df["days_since_first"].iloc[-1]+1
X_France_test.loc[0,'active_rate']=X_France_df["active_rate"].iloc[-1]+1
X_France_test.loc[0,'previous_Active'] = y[y.index == "France"].max()

for i in range(0,5):
    X_France_test.loc[i,'days_since_first'] = X_France_test.loc[0,'days_since_first'] + i

X_France_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_France_test.iloc[i].to_numpy()]))
    X_France_test.loc[i+1,'previous_Active']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_France_test['predicted_infections']=pred.iloc[:, 0] 
X_France_test=X_France_test[{'predicted_infections'}]
X_France_test["Actual infections"] = covid_df_kaggle_France["Active Cases"]
X_France_test["Date"] = covid_df_kaggle_France["ObservationDate"]
X_France_test = X_France_test.iloc[0:5]
X_France_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,15983,13337,2020-03-22
1,17240,17054,2020-03-23
2,21144,18232,2020-03-24
3,24899,20360,2020-03-25
4,31650,22898,2020-03-26


### Random Forest Regression - Spain

In [78]:
covid_df_kaggle_Spain = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Spain"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()
covid_df_kaggle_Spain["Active Cases"] = covid_df_kaggle_Spain['Confirmed']-(covid_df_kaggle_Spain['Recovered'] + covid_df_kaggle_Spain['Deaths'])
covid_df_kaggle_Spain

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths,Active Cases
0,2020-02-01,1,0,0,1
1,2020-02-02,1,0,0,1
2,2020-02-03,1,0,0,1
3,2020-02-04,1,0,0,1
4,2020-02-05,1,0,0,1
...,...,...,...,...,...
65,2020-04-06,136675,40437,13341,82897
66,2020-04-07,141942,43208,14045,84689
67,2020-04-08,148220,48021,14792,85407
68,2020-04-09,153222,52165,15447,85610


In [79]:
filter_condn = (covid_df_kaggle_Spain['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Spain['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Spain = covid_df_kaggle_Spain[filter_condn].reset_index()
covid_df_kaggle_Spain

X_Spain_df = X[X.index == "Spain"].copy()
X_Spain_test = X_Spain_df.head().copy()
X_Spain_test=X_Spain_test.reset_index(drop=True)
X_Spain_test['days_since_first'] = X_Spain_test['previous_Active'] = X_Spain_test['active_rate'] = 0
X_Spain_test.loc[0,'days_since_first'] =  X_Spain_df["days_since_first"].iloc[-1]+1
X_Spain_test.loc[0,'active_rate']=X_Spain_df["active_rate"].iloc[-1]+1
X_Spain_test.loc[0,'previous_Active'] = y[y.index == "Spain"].max()

for i in range(0,5):
    X_Spain_test.loc[i,'days_since_first'] = X_Spain_test.loc[0,'days_since_first'] + i

X_Spain_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Spain_test.iloc[i].to_numpy()]))
    X_Spain_test.loc[i+1,'previous_Active']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Spain_test['predicted_infections']=pred.iloc[:, 0] 
X_Spain_test=X_Spain_test[{'predicted_infections'}]
X_Spain_test["Actual infections"] = covid_df_kaggle_Spain["Active Cases"]
X_Spain_test["Date"] = covid_df_kaggle_Spain["ObservationDate"]
X_Spain_test = X_Spain_test.iloc[0:5]
X_Spain_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,30231,24722,2020-03-22
1,37186,29470,2020-03-23
2,43248,33283,2020-03-24
3,44176,40501,2020-03-25
4,44175,46406,2020-03-26


### Random Forest Regression - Germany

In [80]:
covid_df_kaggle_Germany = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Germany"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()
covid_df_kaggle_Germany["Active Cases"] = covid_df_kaggle_Germany['Confirmed']-(covid_df_kaggle_Germany['Recovered'] + covid_df_kaggle_Germany['Deaths'])
covid_df_kaggle_Germany

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths,Active Cases
0,2020-01-28,4,0,0,4
1,2020-01-29,4,0,0,4
2,2020-01-30,4,0,0,4
3,2020-01-31,5,0,0,5
4,2020-02-01,8,0,0,8
...,...,...,...,...,...
69,2020-04-06,103374,28700,1810,72864
70,2020-04-07,107663,36081,2016,69566
71,2020-04-08,113296,46300,2349,64647
72,2020-04-09,118181,52407,2607,63167


In [81]:
filter_condn = (covid_df_kaggle_Germany['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Germany['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Germany = covid_df_kaggle_Germany[filter_condn].reset_index()
covid_df_kaggle_Germany

X_Germany_df = X[X.index == "Germany"].copy()
X_Germany_test = X_Germany_df.head().copy()
X_Germany_test=X_Germany_test.reset_index(drop=True)
X_Germany_test['days_since_first'] = X_Germany_test['previous_Active'] = X_Germany_test['active_rate'] = 0
X_Germany_test.loc[0,'days_since_first'] =  X_Germany_df["days_since_first"].iloc[-1]+1
X_Germany_test.loc[0,'active_rate']=X_Germany_df["active_rate"].iloc[-1]+1
X_Germany_test.loc[0,'previous_Active'] = y[y.index == "Germany"].max()

for i in range(0,5):
    X_Germany_test.loc[i,'days_since_first'] = X_Germany_test.loc[0,'days_since_first'] + i

X_Germany_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Germany_test.iloc[i].to_numpy()]))
    X_Germany_test.loc[i+1,'previous_Active']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Germany_test['predicted_infections']=pred.iloc[:, 0] 
X_Germany_test=X_Germany_test[{'predicted_infections'}]
X_Germany_test["Actual infections"] = covid_df_kaggle_Germany["Active Cases"]
X_Germany_test["Date"] = covid_df_kaggle_Germany["ObservationDate"]
X_Germany_test = X_Germany_test.iloc[0:5]
X_Germany_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,30320,24513,2020-03-22
1,37115,28480,2020-03-23
2,43192,29586,2020-03-24
3,44120,33570,2020-03-25
4,44120,37998,2020-03-26


### Random Forest Regression - Switzerland

In [83]:
covid_df_kaggle_Switzerland = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Switzerland"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()
covid_df_kaggle_Switzerland["Active Cases"] = covid_df_kaggle_Switzerland['Confirmed']-(covid_df_kaggle_Switzerland['Recovered'] + covid_df_kaggle_Switzerland['Deaths'])
covid_df_kaggle_Switzerland

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths,Active Cases
0,2020-02-25,1,0,0,1
1,2020-02-26,1,0,0,1
2,2020-02-27,8,0,0,8
3,2020-02-28,8,0,0,8
4,2020-02-29,18,0,0,18
5,2020-03-01,27,0,0,27
6,2020-03-02,42,0,0,42
7,2020-03-03,56,2,0,54
8,2020-03-04,90,3,0,87
9,2020-03-05,114,3,1,110


In [84]:
filter_condn = (covid_df_kaggle_Switzerland['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Switzerland['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Switzerland = covid_df_kaggle_Switzerland[filter_condn].reset_index()
covid_df_kaggle_Switzerland

X_Switzerland_df = X[X.index == "Switzerland"].copy()
X_Switzerland_test = X_Switzerland_df.head().copy()
X_Switzerland_test=X_Switzerland_test.reset_index(drop=True)
X_Switzerland_test['days_since_first'] = X_Switzerland_test['previous_Active'] = X_Switzerland_test['active_rate'] = 0
X_Switzerland_test.loc[0,'days_since_first'] =  X_Switzerland_df["days_since_first"].iloc[-1]+1
X_Switzerland_test.loc[0,'active_rate']=X_Switzerland_df["active_rate"].iloc[-1]+1
X_Switzerland_test.loc[0,'previous_Active'] = y[y.index == "Switzerland"].max()

for i in range(0,5):
    X_Switzerland_test.loc[i,'days_since_first'] = X_Switzerland_test.loc[0,'days_since_first'] + i

X_Switzerland_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Switzerland_test.iloc[i].to_numpy()]))
    X_Switzerland_test.loc[i+1,'previous_Active']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Switzerland_test['predicted_infections']=pred.iloc[:, 0] 
X_Switzerland_test=X_Switzerland_test[{'predicted_infections'}]
X_Switzerland_test["Actual infections"] = covid_df_kaggle_Switzerland["Active Cases"]
X_Switzerland_test["Date"] = covid_df_kaggle_Switzerland["ObservationDate"]
X_Switzerland_test = X_Switzerland_test.iloc[0:5]
X_Switzerland_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,8573,7245,2020-03-22
1,9634,8544,2020-03-23
2,10732,9624,2020-03-24
3,11486,10613,2020-03-25
4,11629,11489,2020-03-26
