# Covid 19 - predictions for recovered cases

### Loading libraries

In [2]:
import pandas as pd
import pandas as pd 
import random

In [3]:
import math
import time
import datetime
import operator 
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

In [4]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
plt.style.use('seaborn-dark')
#print(plt.style.available)
%matplotlib inline

from sklearn import metrics

In [6]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV

In [7]:
import warnings
warnings.filterwarnings("ignore")

### Preparation for predictions

#### Data Loading

In [8]:
#dataset
covid_df= pd.read_csv("corona_pred_china.csv")
covid_df.head()

Unnamed: 0,ObservationDate,Country,Continent,Confirmed,Deaths,Recovered,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,GDP ($ per capita)
0,2020-01-22,Hong Kong,Asia,0.0,0.0,0.0,0.0,6940432.0,1092.0,63557,524,28800.0
1,2020-01-22,Japan,Asia,2.0,0.0,0.0,2.0,127463611.0,377835.0,3374,0,28200.0
2,2020-01-22,Macao,Asia,1.0,0.0,0.0,1.0,453125.0,28.0,161830,486,19400.0
3,2020-01-22,South Korea,Asia,1.0,0.0,0.0,1.0,48846823.0,98480.0,4960,0,17800.0
4,2020-01-22,Taiwan,Asia,1.0,0.0,0.0,1.0,23036087.0,35980.0,6403,0,23400.0


#### Data transformations

In [9]:
#Converting "Observation Date" into Datetime format
covid_df["ObservationDate"]=pd.to_datetime(covid_df["ObservationDate"])
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3811 entries, 0 to 3810
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   ObservationDate             3811 non-null   datetime64[ns]
 1   Country                     3811 non-null   object        
 2   Continent                   3811 non-null   object        
 3   Confirmed                   3811 non-null   float64       
 4   Deaths                      3811 non-null   float64       
 5   Recovered                   3811 non-null   float64       
 6   Active Cases                3811 non-null   float64       
 7   Population                  3811 non-null   float64       
 8   Area (sq. mi.)              3811 non-null   float64       
 9   Pop. Density (per sq. mi.)  3811 non-null   object        
 10  Net migration               3811 non-null   object        
 11  GDP ($ per capita)          3811 non-null   float64     

In [10]:
# Changing datatype of Population density and Net Migration columns as float
cols=["Pop. Density (per sq. mi.)","Net migration"]                             
for i in cols:
    covid_df[i]=covid_df[i].str.replace(",","").astype(float)
covid_df.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3811 entries, 0 to 3810
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   ObservationDate             3811 non-null   datetime64[ns]
 1   Country                     3811 non-null   object        
 2   Continent                   3811 non-null   object        
 3   Confirmed                   3811 non-null   float64       
 4   Deaths                      3811 non-null   float64       
 5   Recovered                   3811 non-null   float64       
 6   Active Cases                3811 non-null   float64       
 7   Population                  3811 non-null   float64       
 8   Area (sq. mi.)              3811 non-null   float64       
 9   Pop. Density (per sq. mi.)  3811 non-null   float64       
 10  Net migration               3811 non-null   float64       
 11  GDP ($ per capita)          3811 non-null   float64     

In [11]:
covid_df.head()

Unnamed: 0,ObservationDate,Country,Continent,Confirmed,Deaths,Recovered,Active Cases,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,GDP ($ per capita)
0,2020-01-22,Hong Kong,Asia,0.0,0.0,0.0,0.0,6940432.0,1092.0,63557.0,524.0,28800.0
1,2020-01-22,Japan,Asia,2.0,0.0,0.0,2.0,127463611.0,377835.0,3374.0,0.0,28200.0
2,2020-01-22,Macao,Asia,1.0,0.0,0.0,1.0,453125.0,28.0,161830.0,486.0,19400.0
3,2020-01-22,South Korea,Asia,1.0,0.0,0.0,1.0,48846823.0,98480.0,4960.0,0.0,17800.0
4,2020-01-22,Taiwan,Asia,1.0,0.0,0.0,1.0,23036087.0,35980.0,6403.0,0.0,23400.0


In [12]:
# Sorting based on the country names
covid_df = covid_df.sort_values(["Country", "ObservationDate"]).reset_index(drop=True)

In [13]:
covid_df.shape

(3811, 12)

In [14]:
# Calculating days since the first occurance for each country
covid_df['days_since_first']=0
covid_df['previous_Recovered'] = 0
covid_df['recovery_rate'] = 0
previous_Recovered=0
confirm_difference=0

for i in covid_df.index[1:]:
    if covid_df['Country'][i]==covid_df['Country'][i-1]:
        covid_df.loc[i,'days_since_first']=(covid_df.at[i-1,'days_since_first']+1)
        previous_Recovered=covid_df.at[i-1,'Recovered']
        confirm_difference= covid_df.at[i,'Recovered']-previous_Recovered
        covid_df.loc[i,'previous_Recovered']=previous_Recovered        
        if ((previous_Recovered!=0) & (confirm_difference>0) ):
            covid_df.loc[i,'recovery_rate']=((confirm_difference/previous_Recovered))       
        else:
            covid_df.loc[i,'recovery_rate']=0
    else:
        covid_df.loc[i,'days_since_first'] = 0
        covid_df.loc[i,'previous_Recovered'] = 0
        covid_df.loc[i,'recovery_rate'] = 0  

In [15]:
# Ensuring that previous Recovered column type is int
covid_df['previous_Recovered'].astype(int)
covid_df=covid_df[covid_df['recovery_rate']>=0]

In [16]:
covid_df.set_index('Country',inplace=True)

#### Check for highest number of Recovered cases in Europe

In [17]:
covid_df_Europe = covid_df[covid_df['Continent'] == "Europe"].groupby('Country').agg(
                {
                     'Recovered':sum,    # Sum duration per group
                     'days_since_first': max,  # get the max of days
                }).sort_values(["Recovered", "days_since_first"]).tail()
covid_df_Europe.sort_values(["Recovered"], ascending=False)

Unnamed: 0_level_0,Recovered,days_since_first
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Italy,43930.0,51
Spain,11793.0,50
France,2554.0,58
Germany,1562.0,54
United Kingdom,740.0,51


In [18]:
covid_df.columns

Index(['ObservationDate', 'Continent', 'Confirmed', 'Deaths', 'Recovered',
       'Active Cases', 'Population', 'Area (sq. mi.)',
       'Pop. Density (per sq. mi.)', 'Net migration', 'GDP ($ per capita)',
       'days_since_first', 'previous_Recovered', 'recovery_rate'],
      dtype='object')

### Random Forest Regressor

In [20]:
X=covid_df.copy()
del X['Continent']
del X['ObservationDate']
del X['Confirmed']
del X['Deaths']
del X['Active Cases']
del X['GDP ($ per capita)']
#del X['previous_Activecases']
#del X['previous_recovery']
#del X['previous_Recovered']



In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3811 entries, Afghanistan to Zimbabwe
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Recovered                   3811 non-null   float64
 1   Population                  3811 non-null   float64
 2   Area (sq. mi.)              3811 non-null   float64
 3   Pop. Density (per sq. mi.)  3811 non-null   float64
 4   Net migration               3811 non-null   float64
 5   days_since_first            3811 non-null   int64  
 6   previous_Recovered          3811 non-null   float64
 7   recovery_rate               3811 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 268.0+ KB


In [22]:
X["Net migration"].fillna(0, inplace=True)
X["Pop. Density (per sq. mi.)"].fillna(0, inplace=True)

In [23]:
X.isnull().any()

Recovered                     False
Population                    False
Area (sq. mi.)                False
Pop. Density (per sq. mi.)    False
Net migration                 False
days_since_first              False
previous_Recovered            False
recovery_rate                 False
dtype: bool

In [24]:
corr = X.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,Recovered,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,days_since_first,previous_Recovered,recovery_rate
Recovered,1.0,0.01,-0.0,-0.02,-0.01,0.16,0.99,0.06
Population,0.01,1.0,0.3,-0.07,-0.06,0.15,0.01,-0.0
Area (sq. mi.),-0.0,0.3,1.0,-0.1,0.08,0.16,-0.0,-0.01
Pop. Density (per sq. mi.),-0.02,-0.07,-0.1,1.0,0.29,0.14,-0.02,-0.01
Net migration,-0.01,-0.06,0.08,0.29,1.0,0.13,-0.01,0.0
days_since_first,0.16,0.15,0.16,0.14,0.13,1.0,0.15,0.07
previous_Recovered,0.99,0.01,-0.0,-0.02,-0.01,0.15,1.0,-0.0
recovery_rate,0.06,-0.0,-0.01,-0.01,0.0,0.07,-0.0,1.0


In [25]:
# Pick only features that are correlated more than 0.2
y = X['Recovered']     

In [26]:
print(X.shape)
X.head()

(3811, 8)


Unnamed: 0_level_0,Recovered,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Net migration,days_since_first,previous_Recovered,recovery_rate
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,0.0,31056997.0,647500.0,480.0,2306.0,0,0.0,0.0
Afghanistan,0.0,31056997.0,647500.0,480.0,2306.0,1,0.0,0.0
Afghanistan,0.0,31056997.0,647500.0,480.0,2306.0,2,0.0,0.0
Afghanistan,0.0,31056997.0,647500.0,480.0,2306.0,3,0.0,0.0
Afghanistan,0.0,31056997.0,647500.0,480.0,2306.0,4,0.0,0.0


In [27]:
# To get rid of +e format
pd.options.display.float_format = '{:20,.0f}'.format

#### Train Test Split

In [28]:
X.drop(['Recovered'], axis=1, inplace=True)


# Break off validation set from training data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, test_size=0.15, random_state=1)

In [29]:
print(f"X_train shape is {X_train.shape}")
print(f"X_test shape is {X_test.shape}")
# print(f"X_validate shape is {X_validate.shape}")
print(f"y_train is {y_train.shape}")
# print(f"y_validate shape is {y_validate.shape}")
print(f"y_test shape is {y_test.shape}")

X_train shape is (3239, 7)
X_test shape is (572, 7)
y_train is (3239,)
y_test shape is (572,)


## Random Forest

#### Defining the model and checking the best score

In [30]:
#DEFINE YOUR REGRESSOR and THE PARAMETERS GRID

regressor = RandomForestRegressor()
parameters = {"criterion": ['mse'], 
              "n_estimators":[10, 100, 400], 
              "min_samples_leaf": [0.1, 1, 5, 10, 20, 50], 
              "random_state" : [1]
            #  "max_features": "auto"
            # "oob_score ": TRUE
             }

In [31]:
#DEFINE YOUR GRIDSEARCH 
gs = GridSearchCV(regressor, parameters, cv=3) #with no params it reduces to a CV

gs = gs.fit(X_train.to_numpy(),y_train.to_numpy())

In [32]:
#summarize the results of your GRIDSEARCH
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']


***GRIDSEARCH RESULTS***
Best score: 0.963660 using {'criterion': 'mse', 'min_samples_leaf': 1, 'n_estimators': 100, 'random_state': 1}


In [33]:
columns = np.asarray(X.columns)
#columns = np.asarray([columns])
from sklearn import tree
r = tree.export_text(gs.best_estimator_.estimators_[0],feature_names=columns.tolist(), max_depth=3)
print(r)

|--- previous_Recovered <= 2045.50
|   |--- previous_Recovered <= 526.50
|   |   |--- previous_Recovered <= 192.00
|   |   |   |--- recovery_rate <= 96.67
|   |   |   |   |--- truncated branch of depth 20
|   |   |   |--- recovery_rate >  96.67
|   |   |   |   |--- truncated branch of depth 2
|   |   |--- previous_Recovered >  192.00
|   |   |   |--- previous_Recovered <= 312.00
|   |   |   |   |--- truncated branch of depth 5
|   |   |   |--- previous_Recovered >  312.00
|   |   |   |   |--- truncated branch of depth 4
|   |--- previous_Recovered >  526.50
|   |   |--- previous_Recovered <= 1094.00
|   |   |   |--- days_since_first <= 39.50
|   |   |   |   |--- truncated branch of depth 3
|   |   |   |--- days_since_first >  39.50
|   |   |   |   |--- truncated branch of depth 3
|   |   |--- previous_Recovered >  1094.00
|   |   |   |--- recovery_rate <= 0.14
|   |   |   |   |--- value: [1540.00]
|   |   |   |--- recovery_rate >  0.14
|   |   |   |   |--- truncated branch of depth 6
|

####  Training the model

In [34]:
#we establish the model with the best estimator according to Gridsearch results
gs = gs.best_estimator_ 

gs = gs.fit(X_train.to_numpy(),y_train.to_numpy())

In [35]:
#Returns the coefficient of determination R^2 of the prediction.
#Explained variance score: 1 is perfect prediction
gs.score(X_test.to_numpy(), y_test.to_numpy())

0.9932844956164258

#### Testing the model

In [36]:
y_pred = gs.predict(X_test.to_numpy())

In [37]:
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred}).reset_index(drop=True)
df

Unnamed: 0,Actual,Predicted
0,3,3
1,0,0
2,1,1
3,1,1
4,0,0
...,...,...
567,0,0
568,77,73
569,0,0
570,1,1


In [38]:
# It is not giving correct results yet. Need to check
df=df[df["Actual"] != df["Predicted"]]
df

Unnamed: 0,Actual,Predicted
4,0,0
5,16,16
8,6,6
9,96,96
11,13,0
...,...,...
561,23,23
563,0,0
564,24,24
566,6,6


#### Evaluation results

In [39]:
print("MAE train: ", metrics.mean_absolute_error(y_train.to_numpy(), gs.predict(X_train.to_numpy()))) 
print("MSE train: ",metrics.mean_squared_error(y_train.to_numpy(), gs.predict(X_train.to_numpy())))
print("RMSE train: ",np.sqrt(metrics.mean_squared_error(y_train.to_numpy(), gs.predict(X_train.to_numpy()))))
print("r2: ",np.sqrt(metrics.r2_score(y_train.to_numpy(), gs.predict(X_train.to_numpy()))))

print("MAE test: ", metrics.mean_absolute_error(y_test.to_numpy(), gs.predict(X_test.to_numpy()))) 
print("MSE test: ",metrics.mean_squared_error(y_test.to_numpy(), gs.predict(X_test.to_numpy())))
print("RMSE test: ",np.sqrt(metrics.mean_squared_error(y_test.to_numpy(), gs.predict(X_test.to_numpy()))))
print("r2: ",np.sqrt(metrics.r2_score(y_test.to_numpy(), gs.predict(X_test.to_numpy()))))

MAE train:  2.5076597715344238
MSE train:  797.375067150355
RMSE train:  28.23783042569586
r2:  0.9975158389126167
MAE test:  2.648618881118882
MSE test:  585.175713811189
RMSE test:  24.190405408161084
r2:  0.9966365915500122


## Random Forest Regression -  For different countries

### Using Kaggle dataset to cross check the results of future prediction 

In [40]:
covid_df_kaggle= pd.read_csv("covid_19_data_Kaggle.csv")

In [41]:
covid_df_kaggle["ObservationDate"]=pd.to_datetime(covid_df_kaggle["ObservationDate"])

In [42]:
covid_df_kaggle.shape

(13850, 8)

### Random Forest Regression - Italy

In [43]:
covid_df_kaggle_Italy = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Italy"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [44]:
covid_df_kaggle_Italy

Unnamed: 0,ObservationDate,Confirmed,Recovered,Deaths
0,2020-01-31,2,0,0
1,2020-02-01,2,0,0
2,2020-02-02,2,0,0
3,2020-02-03,2,0,0
4,2020-02-04,2,0,0
...,...,...,...,...
66,2020-04-06,132547,22837,16523
67,2020-04-07,135586,24392,17127
68,2020-04-08,139422,26491,17669
69,2020-04-09,143626,28470,18279


In [45]:
filter_condn = (covid_df_kaggle_Italy['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Italy['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Italy = covid_df_kaggle_Italy[filter_condn].reset_index()
covid_df_kaggle_Italy

X_Italy_df = X[X.index == "Italy"].copy()
X_Italy_test = X_Italy_df.head().copy()
X_Italy_test=X_Italy_test.reset_index(drop=True)
X_Italy_test['days_since_first'] = X_Italy_test['previous_Recovered'] = X_Italy_test['recovery_rate'] = 0
X_Italy_test.loc[0,'days_since_first'] =  X_Italy_df["days_since_first"].iloc[-1]+1
X_Italy_test.loc[0,'recovery_rate']=X_Italy_df["recovery_rate"].iloc[-1]+1
X_Italy_test.loc[0,'previous_Recovered'] = y[y.index == "Italy"].max()

for i in range(0,5):
    X_Italy_test.loc[i,'days_since_first'] = X_Italy_test.loc[0,'days_since_first'] + i

X_Italy_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Italy_test.iloc[i].to_numpy()]))
    X_Italy_test.loc[i+1,'previous_Recovered']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Italy_test['predicted_infections']=pred.iloc[:, 0] 
X_Italy_test=X_Italy_test[{'predicted_infections'}]
# Creating new column to have actual predictions and date from Kaggle dataset
X_Italy_test["Actual infections"] = covid_df_kaggle_Italy["Recovered"]
X_Italy_test["Date"] = covid_df_kaggle_Italy["ObservationDate"]
X_Italy_test = X_Italy_test.iloc[0:5]
X_Italy_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,7297,7024,2020-03-22
1,7247,7432,2020-03-23
2,7247,8326,2020-03-24
3,7247,9362,2020-03-25
4,7247,10361,2020-03-26


### Random Forest Regression - France

In [46]:
covid_df_kaggle_France = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "France"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [47]:
filter_condn = (covid_df_kaggle_France['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_France['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_France = covid_df_kaggle_France[filter_condn].reset_index()
covid_df_kaggle_France

X_France_df = X[X.index == "France"].copy()
X_France_test = X_France_df.head().copy()
X_France_test=X_France_test.reset_index(drop=True)
X_France_test['days_since_first'] = X_France_test['previous_Recovered'] = X_France_test['recovery_rate'] = 0
X_France_test.loc[0,'days_since_first'] =  X_France_df["days_since_first"].iloc[-1]+1
X_France_test.loc[0,'recovery_rate']=X_France_df["recovery_rate"].iloc[-1]+1
X_France_test.loc[0,'previous_Recovered'] = y[y.index == "France"].max()

for i in range(0,5):
    X_France_test.loc[i,'days_since_first'] = X_France_test.loc[0,'days_since_first'] + i

X_France_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_France_test.iloc[i].to_numpy()]))
    X_France_test.loc[i+1,'previous_Recovered']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_France_test['predicted_infections']=pred.iloc[:, 0] 
X_France_test=X_France_test[{'predicted_infections'}]
X_France_test["Actual infections"] = covid_df_kaggle_France["Recovered"]
X_France_test["Date"] = covid_df_kaggle_France["ObservationDate"]
X_France_test = X_France_test.iloc[0:5]
X_France_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,2994,2201,2020-03-22
1,3546,2207,2020-03-23
2,4136,3288,2020-03-24
3,4539,3907,2020-03-25
4,5010,4955,2020-03-26


### Random Forest Regression - Spain

In [48]:
covid_df_kaggle_Spain = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Spain"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [49]:
filter_condn = (covid_df_kaggle_Spain['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Spain['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Spain = covid_df_kaggle_Spain[filter_condn].reset_index()
covid_df_kaggle_Spain

X_Spain_df = X[X.index == "Spain"].copy()
X_Spain_test = X_Spain_df.head().copy()
X_Spain_test=X_Spain_test.reset_index(drop=True)
X_Spain_test['days_since_first'] = X_Spain_test['previous_Recovered'] = X_Spain_test['recovery_rate'] = 0
X_Spain_test.loc[0,'days_since_first'] =  X_Spain_df["days_since_first"].iloc[-1]+1
X_Spain_test.loc[0,'recovery_rate']=X_Spain_df["recovery_rate"].iloc[-1]+1
X_Spain_test.loc[0,'previous_Recovered'] = y[y.index == "Spain"].max()

for i in range(0,5):
    X_Spain_test.loc[i,'days_since_first'] = X_Spain_test.loc[0,'days_since_first'] + i

X_Spain_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Spain_test.iloc[i].to_numpy()]))
    X_Spain_test.loc[i+1,'previous_Recovered']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Spain_test['predicted_infections']=pred.iloc[:, 0] 
X_Spain_test=X_Spain_test[{'predicted_infections'}]
X_Spain_test["Actual infections"] = covid_df_kaggle_Spain["Recovered"]
X_Spain_test["Date"] = covid_df_kaggle_Spain["ObservationDate"]
X_Spain_test = X_Spain_test.iloc[0:5]
X_Spain_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,3704,2125,2020-03-22
1,4527,3355,2020-03-23
2,5010,3794,2020-03-24
3,5619,5367,2020-03-25
4,6568,7015,2020-03-26


### Random Forest Regression - Germany

In [50]:
covid_df_kaggle_Germany = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Germany"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [51]:
filter_condn = (covid_df_kaggle_Germany['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Germany['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Germany = covid_df_kaggle_Germany[filter_condn].reset_index()
covid_df_kaggle_Germany

X_Germany_df = X[X.index == "Germany"].copy()
X_Germany_test = X_Germany_df.head().copy()
X_Germany_test=X_Germany_test.reset_index(drop=True)
X_Germany_test['days_since_first'] = X_Germany_test['previous_Recovered'] = X_Germany_test['recovery_rate'] = 0
X_Germany_test.loc[0,'days_since_first'] =  X_Germany_df["days_since_first"].iloc[-1]+1
X_Germany_test.loc[0,'recovery_rate']=X_Germany_df["recovery_rate"].iloc[-1]+1
X_Germany_test.loc[0,'previous_Recovered'] = y[y.index == "Germany"].max()

for i in range(0,5):
    X_Germany_test.loc[i,'days_since_first'] = X_Germany_test.loc[0,'days_since_first'] + i

X_Germany_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Germany_test.iloc[i].to_numpy()]))
    X_Germany_test.loc[i+1,'previous_Recovered']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Germany_test['predicted_infections']=pred.iloc[:, 0] 
X_Germany_test=X_Germany_test[{'predicted_infections'}]
X_Germany_test["Actual infections"] = covid_df_kaggle_Germany["Recovered"]
X_Germany_test["Date"] = covid_df_kaggle_Germany["ObservationDate"]
X_Germany_test = X_Germany_test.iloc[0:5]
X_Germany_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,435,266,2020-03-22
1,635,453,2020-03-23
2,799,3243,2020-03-24
3,900,3547,2020-03-25
4,1176,5673,2020-03-26


### Random Forest Regression - Switzerland

In [52]:
covid_df_kaggle_Switzerland = covid_df_kaggle[covid_df_kaggle["Country/Region"] == "Switzerland"].groupby("ObservationDate").agg(
                {
                     'Confirmed':sum,    # Sum duration per group
                     'Recovered':sum,    # Sum duration per group
                     'Deaths':sum    # Sum duration per group
                }).sort_values(["ObservationDate"]).reset_index()

In [53]:
filter_condn = (covid_df_kaggle_Switzerland['ObservationDate'] > '2020-03-21') & (covid_df_kaggle_Switzerland['ObservationDate'] <= '2020-03-26')
covid_df_kaggle_Switzerland = covid_df_kaggle_Switzerland[filter_condn].reset_index()
covid_df_kaggle_Switzerland

X_Switzerland_df = X[X.index == "Switzerland"].copy()
X_Switzerland_test = X_Switzerland_df.head().copy()
X_Switzerland_test=X_Switzerland_test.reset_index(drop=True)
X_Switzerland_test['days_since_first'] = X_Switzerland_test['previous_Recovered'] = X_Switzerland_test['recovery_rate'] = 0
X_Switzerland_test.loc[0,'days_since_first'] =  X_Switzerland_df["days_since_first"].iloc[-1]+1
X_Switzerland_test.loc[0,'recovery_rate']=X_Switzerland_df["recovery_rate"].iloc[-1]+1
X_Switzerland_test.loc[0,'previous_Recovered'] = y[y.index == "Switzerland"].max()

for i in range(0,5):
    X_Switzerland_test.loc[i,'days_since_first'] = X_Switzerland_test.loc[0,'days_since_first'] + i

X_Switzerland_test

predictions=[]

for i in range(0,5):
    pred=int(gs.predict([X_Switzerland_test.iloc[i].to_numpy()]))
    X_Switzerland_test.loc[i+1,'previous_Recovered']=pred    
    predictions.append(pred)

pred = pd.DataFrame(predictions)

#### Prediction results
X_Switzerland_test['predicted_infections']=pred.iloc[:, 0] 
X_Switzerland_test=X_Switzerland_test[{'predicted_infections'}]
X_Switzerland_test["Actual infections"] = covid_df_kaggle_Switzerland["Recovered"]
X_Switzerland_test["Date"] = covid_df_kaggle_Switzerland["ObservationDate"]
X_Switzerland_test = X_Switzerland_test.iloc[0:5]
X_Switzerland_test 

Unnamed: 0,predicted_infections,Actual infections,Date
0,221,131,2020-03-22
1,272,131,2020-03-23
2,318,131,2020-03-24
3,429,131,2020-03-25
4,518,131,2020-03-26
