# MACHINELEARNING

In [1]:
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Terms to Know
- **R-squared**: The R² value (the coefficient of determination) represents how much of the variance in the target can be explained by the model. The higher the value, the better the model fits.
- **Adjusted R-squared**: The adjusted R² value corrects some of the problems with the raw R² score. Adjusted R² accounts for the number of variables in the model, increasing only if a new variable aids prediction to a degree greater than what would occur simply by chance. The adjusted R² value can therefore be used as a more accurate way to determine the fit of a model.
- **RMSE**: RMSE stands for root-mean-square error (also called root-mean-square deviation). In statistical terms, RMSE is the standard deviation of the residuals, where the residuals measure the distance between the data points and the regression line. RMSE tells you how spread out the data points are around the regression line, i.e. how well the data fits the model. Lower values of RMSE mean a better fit because the data is *less* far from the model.

## Contents
1. [DUMMY_VARIABLES](#1.-DUMMY_VARIABLES)
2. [COVID](#2.-COVID)
3. [H1N1](#3.-H1N1)
4. [Both](#4.-Both)

## 1. DUMMY_VARIABLES

In [2]:
# Load the modelling dataset; "for_ml.csv" is resolved relative to the current working directory.
df = pd.read_csv("for_ml.csv")

In [3]:
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,date,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl
0,2000-01-31,280730000.0,0.665408,1.511417,0.631568,0.0,0.0,0.923307,0.0,0.0,3.562918,0.001203
1,2000-02-29,280940000.0,0.58411,1.762654,0.590162,0.0,0.0,0.832206,0.0,0.0,3.560255,0.001202
2,2000-03-31,281160000.0,0.578318,2.136862,0.609973,0.0,0.0,0.896287,0.0,0.0,3.557469,0.001201
3,2000-04-30,281420000.0,0.55291,1.902139,0.627176,0.0,0.0,0.882311,0.0,0.0,3.641433,0.0012
4,2000-05-31,281640000.0,0.56384,2.284832,0.64657,0.0,0.0,0.930976,0.0,0.0,3.638588,0.001199


In [4]:
# Drop every row that contains at least one missing value; df is rebound to the cleaned frame.
df = df.dropna()

In [5]:
# Check the (rows, columns) count after dropping missing values, then preview the frame.
print(df.shape)
df.head()

(252, 12)


Unnamed: 0,date,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl
0,2000-01-31,280730000.0,0.665408,1.511417,0.631568,0.0,0.0,0.923307,0.0,0.0,3.562918,0.001203
1,2000-02-29,280940000.0,0.58411,1.762654,0.590162,0.0,0.0,0.832206,0.0,0.0,3.560255,0.001202
2,2000-03-31,281160000.0,0.578318,2.136862,0.609973,0.0,0.0,0.896287,0.0,0.0,3.557469,0.001201
3,2000-04-30,281420000.0,0.55291,1.902139,0.627176,0.0,0.0,0.882311,0.0,0.0,3.641433,0.0012
4,2000-05-31,281640000.0,0.56384,2.284832,0.64657,0.0,0.0,0.930976,0.0,0.0,3.638588,0.001199


In [6]:
# Use the observation date as the row index so that later selections and the
# train/test splits carry the date label of each row.
# The guard keeps the cell safe to re-run: once 'date' is the index it is no
# longer a column, and a second set_index('date') would raise KeyError.
if 'date' in df.columns:
    df = df.set_index('date')
df.head()

Unnamed: 0_level_0,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-01-31,280730000.0,0.665408,1.511417,0.631568,0.0,0.0,0.923307,0.0,0.0,3.562918,0.001203
2000-02-29,280940000.0,0.58411,1.762654,0.590162,0.0,0.0,0.832206,0.0,0.0,3.560255,0.001202
2000-03-31,281160000.0,0.578318,2.136862,0.609973,0.0,0.0,0.896287,0.0,0.0,3.557469,0.001201
2000-04-30,281420000.0,0.55291,1.902139,0.627176,0.0,0.0,0.882311,0.0,0.0,3.641433,0.0012
2000-05-31,281640000.0,0.56384,2.284832,0.64657,0.0,0.0,0.930976,0.0,0.0,3.638588,0.001199


In [7]:
# Build 0/1 indicator (dummy) columns: a row is flagged when the corresponding
# deaths column is non-zero.
df['h1n1_dum'] = df['h1n1_deaths'].ne(0).astype(int)
df['covid_dum'] = df['covid_deaths'].ne(0).astype(int)
# A row counts as "pandemic" when either indicator is set (bitwise OR of two 0/1 columns).
df['pandemic'] = df['h1n1_dum'] | df['covid_dum']

In [8]:
# Confirm the three indicator columns were added to the frame, then preview it.
display(df.shape)
df.head()

(252, 14)

Unnamed: 0_level_0,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl,h1n1_dum,covid_dum,pandemic
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2000-01-31,280730000.0,0.665408,1.511417,0.631568,0.0,0.0,0.923307,0.0,0.0,3.562918,0.001203,0,0,0
2000-02-29,280940000.0,0.58411,1.762654,0.590162,0.0,0.0,0.832206,0.0,0.0,3.560255,0.001202,0,0,0
2000-03-31,281160000.0,0.578318,2.136862,0.609973,0.0,0.0,0.896287,0.0,0.0,3.557469,0.001201,0,0,0
2000-04-30,281420000.0,0.55291,1.902139,0.627176,0.0,0.0,0.882311,0.0,0.0,3.641433,0.0012,0,0,0
2000-05-31,281640000.0,0.56384,2.284832,0.64657,0.0,0.0,0.930976,0.0,0.0,3.638588,0.001199,0,0,0


In [9]:
# Inspect the rows flagged by the COVID indicator as a sanity check on the dummy.
df[df['covid_dum'].eq(1)]

Unnamed: 0_level_0,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl,h1n1_dum,covid_dum,pandemic
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-02-29,329240000.0,0.983477,3.422124,2.032256,0.443749,3.5,1.109221,0.0,0.000304,6.524531,0.000375,0,1,1
2020-03-31,329340000.0,1.080646,4.062974,2.294286,0.511933,4.4,1.199976,0.0,1.306856,6.52255,0.000375,0,1,1
2020-04-30,331450000.0,1.156132,3.690451,2.493287,0.525569,14.7,1.04782,0.0,19.04963,5.876435,0.000372,0,1,1
2020-05-31,331420000.0,1.221713,4.223342,2.94068,0.62187,13.2,1.133305,0.0,31.494478,5.876967,0.000372,0,1,1
2020-06-30,331450000.0,1.27259,4.861065,2.562377,0.677327,11.0,1.19475,0.0,38.455876,5.876435,0.000372,0,1,1
2020-07-31,331500000.0,1.360181,4.726998,2.662142,0.726395,10.2,1.257315,0.0,46.413876,6.376644,0.000372,0,1,1
2020-08-31,331560000.0,1.315599,4.608819,2.591688,0.701532,8.4,1.218482,0.0,55.335987,6.37549,0.000372,0,1,1
2020-09-30,331630000.0,1.274312,4.675693,2.347797,0.651931,7.9,1.178422,0.0,62.374333,6.374144,0.000372,0,1,1
2020-10-31,331700000.0,1.314139,4.880314,2.325294,0.707869,6.9,1.139885,0.0,69.494121,6.475007,0.000372,0,1,1
2020-11-30,331750000.0,1.334439,4.5052,2.305652,0.669781,6.7,1.116503,0.0,80.793067,6.474031,0.000372,0,1,1


# Linear Regression
<hr>

In [10]:
# Summary statistics for every numeric column, including the new 0/1 indicator columns.
df.describe()

Unnamed: 0,us_pop,alcohol_deaths,alcohol_sales,drug_deaths,homicides,perc_unemp,suicide_deaths,h1n1_deaths,covid_deaths,gdp_period,yearly_infl,h1n1_dum,covid_dum,pandemic
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,307897700.0,0.748492,3.089993,1.239032,0.341695,5.480556,1.038086,0.10487,2.019956,5.008284,0.0007,0.051587,0.043651,0.095238
std,14953640.0,0.174769,0.72188,0.46697,0.22351,2.715086,0.130518,0.602262,11.49253,0.864568,0.000358,0.221633,0.204724,0.294128
min,280730000.0,0.532032,1.511417,0.505372,0.0,0.0,0.784293,0.0,0.0,3.557469,-0.000116,0.0,0.0,0.0
25%,294725000.0,0.613701,2.485569,0.923325,0.0,4.4,0.930433,0.0,0.0,4.367333,0.000509,0.0,0.0,0.0
50%,309220000.0,0.698647,3.062284,1.115141,0.44368,5.4,1.018404,0.0,0.0,4.895671,0.00066,0.0,0.0,0.0
75%,321137500.0,0.865582,3.59222,1.480669,0.497302,7.2,1.132105,0.0,0.0,5.702889,0.000994,0.0,0.0,0.0
max,331750000.0,1.399548,4.923888,2.94068,0.726395,14.7,1.340765,4.038412,104.310475,6.599275,0.001268,1.0,1.0,1.0


## 2. COVID

In [11]:
# Predictor matrix; X is reused unchanged by the H1N1 and combined models further down.
X = df.loc[:, [
    'alcohol_deaths',
    'alcohol_sales',
    'drug_deaths',
    'homicides',
    'suicide_deaths',
    'perc_unemp',
    'gdp_period',
    'yearly_infl',
]]
# Target for this section: the COVID indicator, kept as a one-column frame
# because later cells index coef_[0] and intercept_[0].
y = df.loc[:, ['covid_dum']]

In [12]:
# Split X and y into training and test sets, holding out 20% of the rows for testing.
# random_state is fixed so the same rows land in the test set on every run.
# NOTE(review): the split is a plain random shuffle (no stratify=), so the
# number of positive rows in the test set is left to chance — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
# Fit ordinary least squares on the training rows.
# NOTE(review): the target is a 0/1 indicator, so this is a linear probability
# model and its predictions are not confined to [0, 1]. LogisticRegression is
# imported at the top of the notebook but never used — consider it here.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression()

### Model

In [14]:
# Report the fitted slope for each predictor. y has a single column, so row 0
# of coef_ holds the slopes in the same order as the columns of X_train.
for col_name, slope in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, slope))

The coefficient for alcohol_deaths is 1.0618377532912666
The coefficient for alcohol_sales is 0.041508025976854
The coefficient for drug_deaths is 0.3667744706201286
The coefficient for homicides is 0.1262982903757128
The coefficient for suicide_deaths is -0.3285949267729343
The coefficient for perc_unemp is 0.013896378846105046
The coefficient for gdp_period is -0.3264889945438731
The coefficient for yearly_infl is 44.160573744870156


In [15]:
# intercept_ holds one value per target column; y has a single column, so take element 0.
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.48688057463737655


### R squared

In [16]:
# R-squared of the fitted model evaluated on the held-out test rows.
regression_model.score(X_test, y_test)

0.6896762618840653

In [17]:
# Refit the same specification with statsmodels to obtain the full OLS summary
# (adjusted R-squared, standard errors, p-values).
# The right-hand side is built from X.columns so this model always uses exactly
# the predictors given to the scikit-learn model.
# Note: this fit uses every row of df (data=df), so its R-squared is in-sample
# and is not directly comparable with the held-out regression_model.score value.
lm = smf.ols(formula='covid_dum ~ ' + ' + '.join(X.columns), data=df)
fit = lm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,covid_dum,R-squared:,0.712
Model:,OLS,Adj. R-squared:,0.702
Method:,Least Squares,F-statistic:,74.95
Date:,"Thu, 14 Jul 2022",Prob (F-statistic):,2.7899999999999998e-61
Time:,15:15:19,Log-Likelihood:,199.3
No. Observations:,252,AIC:,-380.6
Df Residuals:,243,BIC:,-348.8
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5084,0.117,4.362,0.000,0.279,0.738
alcohol_deaths,1.1202,0.133,8.411,0.000,0.858,1.382
alcohol_sales,0.0394,0.020,2.002,0.046,0.001,0.078
drug_deaths,0.3797,0.059,6.411,0.000,0.263,0.496
homicides,0.1119,0.059,1.909,0.057,-0.004,0.227
suicide_deaths,-0.4850,0.108,-4.487,0.000,-0.698,-0.272
perc_unemp,0.0119,0.003,3.681,0.000,0.006,0.018
gdp_period,-0.3037,0.029,-10.525,0.000,-0.361,-0.247
yearly_infl,36.6160,24.761,1.479,0.140,-12.157,85.389

0,1,2,3
Omnibus:,207.516,Durbin-Watson:,0.871
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5849.948
Skew:,2.963,Prob(JB):,0.0
Kurtosis:,25.848,Cond. No.,30200.0


In [18]:
# Preview the held-out predictor rows that the error metrics below are computed on.
X_test.head()

Unnamed: 0_level_0,alcohol_deaths,alcohol_sales,drug_deaths,homicides,suicide_deaths,perc_unemp,gdp_period,yearly_infl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-08-31,0.584914,2.872502,0.908476,0.0,0.930453,4.9,4.443535,0.001147
2020-12-31,1.399548,4.923888,2.387641,0.667069,1.072796,6.7,6.474031,0.000372
2019-04-30,0.951684,3.948267,1.788982,0.440459,1.228953,3.6,6.493798,0.000553
2013-06-30,0.726703,3.5914,1.23302,0.471169,1.101612,7.5,5.287847,0.000464
2007-08-31,0.633499,3.165174,1.093864,0.566833,0.994362,4.6,4.830553,0.000946


In [19]:
# Predict the held-out rows; y_predict is reused by the MSE cell that follows.
y_predict = regression_model.predict(X_test)

# List the held-out dates on which the COVID indicator is 1, i.e. the positive
# cases the model is actually evaluated on.
dates = y_test[y_test["covid_dum"]==1]
print(dates)

            covid_dum
date                 
2020-12-31          1
2020-11-30          1
2020-05-31          1
2020-09-30          1
2020-02-29          1


In [20]:
# Mean squared error on the held-out rows. Arguments follow the documented
# (y_true, y_pred) order; MSE is symmetric, so the value is unchanged.
regression_model_mse = mean_squared_error(y_test, y_predict)

regression_model_mse

0.02744116100217801

### RMSE

In [21]:
# RMSE is the square root of the held-out MSE, expressed in the units of the target.
math.sqrt(regression_model_mse)

0.16565373826804516

In [22]:
# Same held-out predictor rows as previewed earlier in this section (X_test has not been reassigned).
X_test.head()

Unnamed: 0_level_0,alcohol_deaths,alcohol_sales,drug_deaths,homicides,suicide_deaths,perc_unemp,gdp_period,yearly_infl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-08-31,0.584914,2.872502,0.908476,0.0,0.930453,4.9,4.443535,0.001147
2020-12-31,1.399548,4.923888,2.387641,0.667069,1.072796,6.7,6.474031,0.000372
2019-04-30,0.951684,3.948267,1.788982,0.440459,1.228953,3.6,6.493798,0.000553
2013-06-30,0.726703,3.5914,1.23302,0.471169,1.101612,7.5,5.287847,0.000464
2007-08-31,0.633499,3.165174,1.093864,0.566833,0.994362,4.6,4.830553,0.000946


## 3. H1N1

In [23]:
# Switch the target to the H1N1 indicator; the predictor matrix X is reused unchanged.
y=df[['h1n1_dum']]

In [24]:
# Re-split with the new target. X, test_size and random_state match the COVID
# section, and the previewed X_test rows further down are identical to that section's.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [25]:
# Fit a fresh linear model for the H1N1 target; this rebinds regression_model,
# so the COVID model is no longer reachable under that name.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression()

### MODEL

In [26]:
# Report the fitted slope for each predictor. y has a single column, so row 0
# of coef_ holds the slopes in the same order as the columns of X_train.
for col_name, slope in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, slope))

The coefficient for alcohol_deaths is -0.6947081842394899
The coefficient for alcohol_sales is 5.7144118834525554e-05
The coefficient for drug_deaths is 0.21042598944910892
The coefficient for homicides is 0.28341533525277746
The coefficient for suicide_deaths is -0.043137045160172686
The coefficient for perc_unemp is 0.011505596810656327
The coefficient for gdp_period is -0.13919331832987053
The coefficient for yearly_infl is -352.9220033697017


In [27]:
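# NOTE(review): the held-out R-squared check is disabled for the H1N1 model, although it is
# reported for the COVID and combined models; re-enable it or document why it is skipped.
#regression_model.score(X_test, y_test)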

### R-Squared

In [28]:
# Refit the same specification with statsmodels to obtain the full OLS summary
# (adjusted R-squared, standard errors, p-values).
# The right-hand side is built from X.columns so this model always uses exactly
# the predictors given to the scikit-learn model.
# Note: this fit uses every row of df (data=df), so its R-squared is in-sample.
lm = smf.ols(formula='h1n1_dum ~ ' + ' + '.join(X.columns), data=df)
fit = lm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,h1n1_dum,R-squared:,0.346
Model:,OLS,Adj. R-squared:,0.324
Method:,Least Squares,F-statistic:,16.05
Date:,"Thu, 14 Jul 2022",Prob (F-statistic):,5.66e-19
Time:,15:15:20,Log-Likelihood:,76.067
No. Observations:,252,AIC:,-134.1
Df Residuals:,243,BIC:,-102.4
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9260,0.190,4.872,0.000,0.552,1.300
alcohol_deaths,-0.4092,0.217,-1.884,0.061,-0.837,0.019
alcohol_sales,-0.0142,0.032,-0.442,0.659,-0.077,0.049
drug_deaths,0.0765,0.097,0.792,0.429,-0.114,0.267
homicides,0.2837,0.096,2.969,0.003,0.095,0.472
suicide_deaths,-0.1524,0.176,-0.865,0.388,-0.500,0.195
perc_unemp,0.0133,0.005,2.523,0.012,0.003,0.024
gdp_period,-0.0819,0.047,-1.741,0.083,-0.175,0.011
yearly_infl,-315.2278,40.377,-7.807,0.000,-394.761,-235.694

0,1,2,3
Omnibus:,125.511,Durbin-Watson:,0.312
Prob(Omnibus):,0.0,Jarque-Bera (JB):,603.811
Skew:,2.033,Prob(JB):,7.660000000000001e-132
Kurtosis:,9.401,Cond. No.,30200.0


In [29]:
# Preview the held-out predictor rows that the error metrics below are computed on.
X_test.head()

Unnamed: 0_level_0,alcohol_deaths,alcohol_sales,drug_deaths,homicides,suicide_deaths,perc_unemp,gdp_period,yearly_infl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-08-31,0.584914,2.872502,0.908476,0.0,0.930453,4.9,4.443535,0.001147
2020-12-31,1.399548,4.923888,2.387641,0.667069,1.072796,6.7,6.474031,0.000372
2019-04-30,0.951684,3.948267,1.788982,0.440459,1.228953,3.6,6.493798,0.000553
2013-06-30,0.726703,3.5914,1.23302,0.471169,1.101612,7.5,5.287847,0.000464
2007-08-31,0.633499,3.165174,1.093864,0.566833,0.994362,4.6,4.830553,0.000946


In [30]:
# Predict the held-out rows and compute their mean squared error. Arguments
# follow the documented (y_true, y_pred) order; MSE is symmetric, so the value is unchanged.
y_predict = regression_model.predict(X_test)
regression_model_mse = mean_squared_error(y_test, y_predict)
regression_model_mse

0.04622869877143103

In [31]:
# List the held-out dates on which the H1N1 indicator is 1.
is_h1n1 = y_test["h1n1_dum"].eq(1)
dates = y_test.loc[is_h1n1]
print(dates)

            h1n1_dum
date                
2009-09-30         1
2010-01-31         1
2009-12-31         1


In [32]:
# RMSE for the H1N1 model: square root of the held-out MSE computed above.
math.sqrt(regression_model_mse)

0.21500860162196075

## 4. Both 

In [33]:
# Switch the target to the combined indicator (H1N1 or COVID); X is reused unchanged.
y=df[['pandemic']]

In [34]:
# Re-split with the new target, using the same test_size and random_state as the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [35]:
# Fit a fresh linear model for the combined pandemic target; this rebinds regression_model.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression()

In [36]:
# Report the fitted slope for each predictor. y has a single column, so row 0
# of coef_ holds the slopes in the same order as the columns of X_train.
for col_name, slope in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, slope))

The coefficient for alcohol_deaths is 0.3671295690517767
The coefficient for alcohol_sales is 0.04156517009568852
The coefficient for drug_deaths is 0.5772004600692374
The coefficient for homicides is 0.4097136256284899
The coefficient for suicide_deaths is -0.3717319719331063
The coefficient for perc_unemp is 0.025401975656761274
The coefficient for gdp_period is -0.46568231287374356
The coefficient for yearly_infl is -308.76142962483164


In [37]:
# R-squared of the combined-target model evaluated on the held-out test rows.
regression_model.score(X_test, y_test)

0.44556265696742825

In [38]:
# Refit the same specification with statsmodels to obtain the full OLS summary
# (adjusted R-squared, standard errors, p-values).
# The right-hand side is built from X.columns so this model always uses exactly
# the predictors given to the scikit-learn model.
# Note: this fit uses every row of df (data=df), so its R-squared is in-sample
# and is not directly comparable with the held-out regression_model.score value.
lm = smf.ols(formula='pandemic ~ ' + ' + '.join(X.columns), data=df)
fit = lm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,pandemic,R-squared:,0.513
Model:,OLS,Adj. R-squared:,0.497
Method:,Least Squares,F-statistic:,31.95
Date:,"Thu, 14 Jul 2022",Prob (F-statistic):,5.11e-34
Time:,15:15:20,Log-Likelihood:,41.879
No. Observations:,252,AIC:,-65.76
Df Residuals:,243,BIC:,-33.99
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4344,0.218,6.590,0.000,1.006,1.863
alcohol_deaths,0.7109,0.249,2.858,0.005,0.221,1.201
alcohol_sales,0.0252,0.037,0.686,0.493,-0.047,0.098
drug_deaths,0.4562,0.111,4.124,0.000,0.238,0.674
homicides,0.3956,0.109,3.615,0.000,0.180,0.611
suicide_deaths,-0.6374,0.202,-3.158,0.002,-1.035,-0.240
perc_unemp,0.0252,0.006,4.174,0.000,0.013,0.037
gdp_period,-0.3856,0.054,-7.155,0.000,-0.492,-0.279
yearly_infl,-278.6118,46.244,-6.025,0.000,-369.701,-187.522

0,1,2,3
Omnibus:,97.817,Durbin-Watson:,0.464
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315.724
Skew:,1.684,Prob(JB):,2.76e-69
Kurtosis:,7.328,Cond. No.,30200.0


In [39]:
# Preview the held-out predictor rows that the error metrics below are computed on.
X_test.head()

Unnamed: 0_level_0,alcohol_deaths,alcohol_sales,drug_deaths,homicides,suicide_deaths,perc_unemp,gdp_period,yearly_infl
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-08-31,0.584914,2.872502,0.908476,0.0,0.930453,4.9,4.443535,0.001147
2020-12-31,1.399548,4.923888,2.387641,0.667069,1.072796,6.7,6.474031,0.000372
2019-04-30,0.951684,3.948267,1.788982,0.440459,1.228953,3.6,6.493798,0.000553
2013-06-30,0.726703,3.5914,1.23302,0.471169,1.101612,7.5,5.287847,0.000464
2007-08-31,0.633499,3.165174,1.093864,0.566833,0.994362,4.6,4.830553,0.000946


In [40]:
# Predict the held-out rows and compute their mean squared error. Arguments
# follow the documented (y_true, y_pred) order; MSE is symmetric, so the value is unchanged.
y_predict = regression_model.predict(X_test)
regression_model_mse = mean_squared_error(y_test, y_predict)
regression_model_mse

0.07332812226190107

In [41]:
# List the held-out dates on which the combined pandemic indicator is 1.
is_pandemic = y_test["pandemic"].eq(1)
dates = y_test.loc[is_pandemic]
print(dates)

            pandemic
date                
2020-12-31         1
2020-11-30         1
2020-05-31         1
2009-09-30         1
2010-01-31         1
2009-12-31         1
2020-09-30         1
2020-02-29         1


In [42]:
# RMSE for the combined-target model: square root of the held-out MSE computed above.
math.sqrt(regression_model_mse)

0.2707916584053155