In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as MSE
import numpy as np
import statsmodels.api as sm

In [2]:
# Load the Worldometer export; pandas labels the header-less first CSV column
# 'Unnamed: 0', so give it a meaningful name before previewing the frame.
df = pd.read_csv('Vietnam_Corona_Worldometer.csv').rename(
    columns={'Unnamed: 0': 'Day'}
)
df.head()

Unnamed: 0,Day,Date,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",Tot Cases/1M pop,Deaths/1M pop,Total Tests,Tests/1M pop,1 Case every X ppl,1 Death every X ppl,1 Test every X ppl,Tot Cases/1M ppl
0,0,Date0809,841,29,11,1,395,0,435,0,9,1,482456,4952,115850,8857296,202,0
1,1,Date0810,841,0,13,2,399,4,429,0,9,0,482456,4952,115853,7494819,202,0
2,2,Date0811,866,19,16,1,399,0,451,0,9,2,482456,4951,112517,6089988,202,0
3,3,Date0812,883,17,17,1,409,10,457,0,9,2,621823,6382,110351,5731754,157,0
4,4,Date0813,911,28,21,4,425,16,465,0,9,2,621823,6381,106962,4640105,157,0


In [3]:
# Cumulative case counts: the single series every later cell models.
data = df.loc[:, 'Total Cases']

In [4]:
d = 7  # window length: number of consecutive observations used as features

# Sliding windows over the first 30 start positions: each feature row holds
# d consecutive values of `data` and its target is the value right after them.
X = [[data[start + offset] for offset in range(d)] for start in range(30)]
y = [data[start + d] for start in range(30)]

In [5]:
# Two hold-out windows: the last d values present in `data`, and one
# hand-entered window of seven values.
X_test = [
    np.array(data.iloc[-d:]),
    [1060, 1063, 1063, 1063, 1063, 1066, 1068],
]
# Presumably the observed totals that followed each window - TODO confirm source.
y_test = [1068, 1068]

# Functions

In [6]:
def predictions(model, X_test):
    """Predict with ``model``, round, and floor each value at its window's last entry.

    Parameters
    ----------
    model : object exposing ``predict(X)`` that returns a NumPy array.
    X_test : indexable collection of windows; ``X_test[k][-1]`` is the final
        value of window ``k``.

    Returns
    -------
    The rounded prediction array, where any prediction smaller than the last
    value of its own input window has been replaced by that last value.
    """
    rounded = model.predict(X_test).round()
    # Last entry of each window, aligned one-to-one with the predictions.
    floor_values = np.array([X_test[k][-1] for k in range(len(rounded))])
    too_low = rounded < floor_values
    # In-place masked assignment keeps the prediction array's own dtype.
    rounded[too_low] = floor_values[too_low]
    return rounded
def printFormula(coef_, intercept_, col_name=None):
    """Return the fitted linear model as a human-readable formula string.

    Parameters
    ----------
    coef_ : sequence of numbers, one coefficient per feature, in feature order.
    intercept_ : number used as the constant term (pass 0 for a model
        without an intercept).
    col_name : optional sequence of labels, one per coefficient. When omitted,
        labels count down from ``Day_<n>`` to ``Day_1`` where ``n`` is
        ``len(coef_)``; for seven coefficients this is ``Day_7`` ... ``Day_1``.

    Returns
    -------
    str such as ``"217.78 + 0.02*Day_7 + -0.2*Day_6 + ..."``; every number is
    rounded to two decimals.

    Raises
    ------
    ValueError if ``col_name`` is given and its length differs from ``coef_``.
    """
    n_terms = len(coef_)
    if col_name is None:
        # Derive labels from the number of coefficients so the helper keeps
        # working when the window length is not 7.
        col_name = ["Day_" + str(n_terms - k) for k in range(n_terms)]
    elif len(col_name) != n_terms:
        raise ValueError(
            "col_name has %d labels but coef_ has %d coefficients"
            % (len(col_name), n_terms)
        )

    formula = str(round(intercept_, 2))
    for label, weight in zip(col_name, coef_):
        formula = formula + " + " + str(round(weight, 2)) + "*" + label
    return formula

# Sklearn

In [7]:
# scikit-learn ordinary least squares on the sliding windows.
model1 = LR()
model1.fit(X, y)

In [8]:
# Show the fitted sklearn model (intercept plus one weight per window position).
print("Formula of model", printFormula(model1.coef_, model1.intercept_), sep="\n")

Formula of model
217.78 + 0.02*Day_7 + -0.2*Day_6 + 0.36*Day_5 + 0.11*Day_4 + -0.15*Day_3 + 0.06*Day_2 + 0.6*Day_1


In [9]:
# In-sample (training-set) error of the rounded, floored sklearn predictions.
y_pred1 = predictions(model1, X)
# mean_squared_error is documented as (y_true, y_pred); the metric is symmetric,
# so putting the arguments in the documented order leaves the value unchanged.
mse1 = MSE(y, y_pred1)
mse1

7.6

In [10]:
# sklearn forecasts for the two hold-out windows.
predictions1 = predictions(model=model1, X_test=X_test)
predictions1

array([1066., 1068.])

# Statsmodels.api

In [11]:
# statsmodels OLS on the same windows.
# NOTE(review): no constant column is added to X, so this model is fitted
# without an intercept, unlike model1; the two fits are not like-for-like.
model2 = sm.OLS(endog=y, exog=X).fit()

In [12]:
# Show the fitted statsmodels model; it has no intercept term, hence the 0.
print("Formula of model", printFormula(model2.params, 0), sep="\n")

Formula of model
0 + -0.09*Day_7 + -0.36*Day_6 + 0.43*Day_5 + 0.09*Day_4 + -0.12*Day_3 + 0.17*Day_2 + 0.87*Day_1


In [13]:
# In-sample (training-set) error of the rounded, floored statsmodels predictions.
y_pred2 = predictions(model2, X)
# mean_squared_error is documented as (y_true, y_pred); the metric is symmetric,
# so putting the arguments in the documented order leaves the value unchanged.
mse2 = MSE(y, y_pred2)
mse2

10.666666666666666

In [14]:
# statsmodels forecasts for the two hold-out windows.
predictions2 = predictions(model=model2, X_test=X_test)
predictions2

array([1068., 1069.])

# Hypothesis test

In [15]:
# Full OLS report for model2: fit statistics plus, per coefficient, the
# t-statistic, p-value and 95% confidence interval.
model2.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,345600.0
Date:,"Sun, 20 Sep 2020",Prob (F-statistic):,3.39e-56
Time:,00:16:06,Log-Likelihood:,-77.315
No. Observations:,30,AIC:,168.6
Df Residuals:,23,BIC:,178.4
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0857,0.112,-0.768,0.450,-0.317,0.145
x2,-0.3595,0.185,-1.942,0.065,-0.742,0.023
x3,0.4302,0.216,1.995,0.058,-0.016,0.876
x4,0.0929,0.231,0.402,0.691,-0.385,0.571
x5,-0.1191,0.235,-0.507,0.617,-0.605,0.367
x6,0.1702,0.235,0.724,0.476,-0.316,0.656
x7,0.8718,0.165,5.298,0.000,0.531,1.212

0,1,2,3
Omnibus:,3.441,Durbin-Watson:,2.545
Prob(Omnibus):,0.179,Jarque-Bera (JB):,1.993
Skew:,-0.427,Prob(JB):,0.369
Kurtosis:,3.93,Cond. No.,1250.0
