In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [165]:
# Read the raw case file; the 'Last Update' column is parsed to datetime on load.
df = pd.read_csv(
    'covid_19_data.csv',
    parse_dates=['Last Update'],
)
# Peek at the final rows to confirm the load worked.
df.tail()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
156287,156288,11/15/2020,Zaporizhia Oblast,Ukraine,2020-11-16 05:25:57,18484.0,164.0,3021.0
156288,156289,11/15/2020,Zeeland,Netherlands,2020-11-16 05:25:57,5041.0,86.0,0.0
156289,156290,11/15/2020,Zhejiang,Mainland China,2020-11-16 05:25:57,1291.0,1.0,1279.0
156290,156291,11/15/2020,Zhytomyr Oblast,Ukraine,2020-11-16 05:25:57,22225.0,368.0,12266.0
156291,156292,11/15/2020,Zuid-Holland,Netherlands,2020-11-16 05:25:57,129188.0,2031.0,0.0


In [166]:

# Give the two frequently used columns shorter names. The result is reassigned
# instead of using inplace=True, which offers no benefit and blocks chaining.
df = df.rename(columns={'Last Update': 'Date', 'Country/Region': 'Country'})

# Keep only rows whose 'Date' lies in the half-open window
# [2020-04-01, 2020-11-11): first the lower bound, then the upper bound.
# NOTE(review): 'Date' is the former 'Last Update' timestamp, not
# 'ObservationDate' -- confirm this is the intended date to filter on.
df2 = df[df['Date'] >= '2020-04-01']
df3 = df2[df2['Date'] < '2020-11-11']

# Summary statistics of the numeric columns in the filtered frame.
df3.describe()

Unnamed: 0,SNo,Confirmed,Deaths,Recovered
count,140354.0,140354.0,140354.0,140354.0
mean,80846.5,28969.33,1008.251635,18243.5
std,40516.854178,86277.97,3321.954212,96422.34
min,10670.0,-302844.0,-178.0,-854405.0
25%,45758.25,599.0,9.0,25.0
50%,80846.5,3998.5,81.0,991.0
75%,115934.75,16961.5,542.0,6731.0
max,151023.0,1790817.0,45240.0,3881491.0


In [167]:
# Show column dtypes and non-null counts for the date-filtered frame.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140354 entries, 10669 to 151022
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   SNo              140354 non-null  int64         
 1   ObservationDate  140354 non-null  object        
 2   Province/State   102189 non-null  object        
 3   Country          140354 non-null  object        
 4   Date             140354 non-null  datetime64[ns]
 5   Confirmed        140354 non-null  float64       
 6   Deaths           140354 non-null  float64       
 7   Recovered        140354 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(3)
memory usage: 9.6+ MB


In [168]:
# Transforming data for forecasting.
# Collapse the per-region rows into one total per 'Date' timestamp. Each result
# below is a two-column frame: 'Date' plus the summed metric.
# (The original note here described Prophet's input format: a `ds` datestamp
# column, ideally YYYY-MM-DD, and a numeric `y` column.)
#
# The numeric columns are selected *before* summing so the object columns
# (ObservationDate, Province/State, Country) are never aggregated. Summing the
# whole frame depends on pandas silently dropping them, which newer releases
# no longer do, and it repeats the same full aggregation three times.
totals_by_date = df3.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = totals_by_date['Confirmed'].reset_index()
deaths = totals_by_date['Deaths'].reset_index()
recovered = totals_by_date['Recovered'].reset_index()
deaths

Unnamed: 0,Date,Deaths
0,2020-04-01 22:04:58,46809.0
1,2020-04-02 08:53:00,52983.0
2,2020-04-03 22:52:45,58787.0
3,2020-04-04 09:38:00,64606.0
4,2020-04-05 23:13:44,69374.0
...,...,...
217,2020-11-05 06:05:03,1225889.0
218,2020-11-06 05:24:55,1233965.0
219,2020-11-07 05:24:52,1243528.0
220,2020-11-08 05:24:56,1251104.0


In [169]:
# Predictors (X): the Confirmed and Recovered columns of the filtered frame.
# Target (y): the Deaths column, i.e. the quantity the regression estimates.
feature_columns = ['Confirmed', 'Recovered']

X = df3[feature_columns]
y = df3['Deaths']



## Training and Predicting


In [170]:
from sklearn.model_selection import train_test_split

In [171]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)


In [172]:
from sklearn.linear_model import LinearRegression


In [173]:
# Linear regression estimator created with scikit-learn's default settings.
lm = LinearRegression()

In [174]:
# Fit the model on the training portion only; the test rows stay unseen.
lm.fit(X_train,y_train)

LinearRegression()

In [175]:
# Fitted slope for each predictor, in the same order as the columns of X.
fitted_slopes = lm.coef_
print('Coefficients: \n', fitted_slopes)

Coefficients: 
 [ 0.03104805 -0.00227139]


In [176]:
# Label each fitted slope with its feature name in a one-column table.
# Index and column label are supplied directly to the constructor.
coeffecients = pd.DataFrame(
    lm.coef_,
    index=X.columns,
    columns=['Coeffecient'],
)
coeffecients

Unnamed: 0,Coeffecient
Confirmed,0.031048
Recovered,-0.002271


## Conclusion - Interpreting the coefficients:

A 1 unit increase in Confirmed cases is associated with an increase of 0.031 total Deaths.


A 1 unit increase in Recovered is associated with a decrease of 0.002 total Deaths.

In [252]:
# Transforming data for forecasting.
# Collapse the per-region rows into one total per 'Date' timestamp.
# (The original note here described Prophet's input format: a `ds` datestamp
# column, ideally YYYY-MM-DD, and a numeric `y` column.)
#
# The numeric columns are selected *before* summing so the object columns are
# never aggregated (newer pandas no longer drops them silently), and the
# grouping is computed once instead of three times.
totals_by_date = df3.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = totals_by_date['Confirmed'].reset_index()
deaths = totals_by_date['Deaths'].reset_index()
recovered = totals_by_date['Recovered'].reset_index()

# Chronological split of the deaths series: everything before the cut-off is
# used for fitting, everything from the cut-off onward is held out.
# '>=' (rather than '>') ensures a row stamped exactly at the cut-off is not
# dropped from both halves.
split_date = '2020-11-01'
deaths1 = deaths[deaths['Date'] < split_date]
deaths2 = deaths[deaths['Date'] >= split_date]

# Index each frame by its timestamp. set_index's second positional parameter
# is `drop`, not a frequency, so the former 'D' argument only acted as a truthy
# drop flag -- and pandas >= 2.0 rejects it because everything after `keys` is
# keyword-only. The default drop=True gives the same result.
deaths_df1 = deaths1.set_index('Date')
deaths_df2 = deaths2.set_index('Date')
recovered_df = recovered.set_index('Date')
confirmed_df = confirmed.set_index('Date')
deaths_df1

Unnamed: 0_level_0,Deaths
Date,Unnamed: 1_level_1
2020-04-01 22:04:58,46809.0
2020-04-02 08:53:00,52983.0
2020-04-03 22:52:45,58787.0
2020-04-04 09:38:00,64606.0
2020-04-05 23:13:44,69374.0
...,...
2020-10-27 04:24:45,1159720.0
2020-10-28 04:24:39,1167129.0
2020-10-29 04:24:49,1174251.0
2020-10-30 04:24:49,1181281.0


In [253]:
# Name the two halves of the chronological split by their modelling role:
# the earlier frame is fitted on, the later frame is held out.
train_data, test_data = deaths_df1, deaths_df2
train_data


Unnamed: 0_level_0,Deaths
Date,Unnamed: 1_level_1
2020-04-01 22:04:58,46809.0
2020-04-02 08:53:00,52983.0
2020-04-03 22:52:45,58787.0
2020-04-04 09:38:00,64606.0
2020-04-05 23:13:44,69374.0
...,...
2020-10-27 04:24:45,1159720.0
2020-10-28 04:24:39,1167129.0
2020-10-29 04:24:49,1174251.0
2020-10-30 04:24:49,1181281.0


In [254]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [255]:
# Holt-Winters exponential smoothing on the training deaths series, with an
# additive trend and an additive seasonal component.
# NOTE(review): seasonal_periods=6 -- if the series is daily with a weekly
# cycle, 7 would be the usual choice; confirm this value is intended.
holt_winters = ExponentialSmoothing(
    train_data['Deaths'],
    trend='add',
    seasonal='add',
    seasonal_periods=6,
)
Fitted_model = holt_winters.fit()



In [256]:
# Forecast 12 steps past the end of the training series.
# NOTE(review): the recorded outputs suggest the hold-out covers only positions
# 213-220 (8 rows) -- confirm the 12-step horizon is intended.
test_predictions = Fitted_model.forecast(12)



In [260]:
# Display the forecast; in the recorded output it is indexed by integer
# position (213-224), not by date.
test_predictions

213    1.192035e+06
214    1.198704e+06
215    1.205718e+06
216    1.211542e+06
217    1.217203e+06
218    1.220664e+06
219    1.225752e+06
220    1.232421e+06
221    1.239435e+06
222    1.245259e+06
223    1.250920e+06
224    1.254381e+06
dtype: float64