In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error  
from sklearn.preprocessing import LabelEncoder


In [4]:
data = pd.read_csv("/Users/mehrac/Desktop/DSA-Python/covid_19_data.csv")
df = data.copy()
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6722 entries, 0 to 6721
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SNo              6722 non-null   int64  
 1   ObservationDate  6722 non-null   object 
 2   Province/State   3953 non-null   object 
 3   Country/Region   6722 non-null   object 
 4   Last Update      6722 non-null   object 
 5   Confirmed        6722 non-null   float64
 6   Deaths           6722 non-null   float64
 7   Recovered        6722 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 420.2+ KB


In [8]:
df.describe()

Unnamed: 0,SNo,Confirmed,Deaths,Recovered
count,6722.0,6722.0,6722.0,6722.0
mean,3361.5,601.195924,19.855846,226.341267
std,1940.618587,4896.33214,204.486922,2556.035202
min,1.0,0.0,0.0,0.0
25%,1681.25,2.0,0.0,0.0
50%,3361.5,13.0,0.0,0.0
75%,5041.75,108.0,1.0,11.0
max,6722.0,67800.0,3122.0,56927.0


In [9]:
df.isnull().sum()

SNo                   0
ObservationDate       0
Province/State     2769
Country/Region        0
Last Update           0
Confirmed             0
Deaths                0
Recovered             0
dtype: int64

In [10]:
df.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

In [20]:
cols = df.columns
df.columns = [col.lower() for col in cols]
df.columns
df.rename(columns={"observationdate" : "observation_date",
                   "country/region"  : "country",
                   "province/state" : "province",
                   "last update" : "last_update",
}, inplace=True)
df.head()

Unnamed: 0,sno,observation_date,province,country,last_update,confirmed,deaths,recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [23]:
df_pure = df.drop(["sno", "province", "country", "last_update"], axis=1)

In [24]:
df_pure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6722 entries, 0 to 6721
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   observation_date  6722 non-null   object 
 1   confirmed         6722 non-null   float64
 2   deaths            6722 non-null   float64
 3   recovered         6722 non-null   float64
dtypes: float64(3), object(1)
memory usage: 210.2+ KB


In [25]:
df_pure = df_pure.groupby(["observation_date"]).sum().reset_index() 

In [26]:
df_pure["observation_date"] = pd.to_datetime(df_pure["observation_date"])

In [27]:
df_pure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   observation_date  57 non-null     datetime64[ns]
 1   confirmed         57 non-null     float64       
 2   deaths            57 non-null     float64       
 3   recovered         57 non-null     float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 1.9 KB


In [29]:
df_pure_date_max = df_pure["observation_date"].max()
df_pure_date_min = df_pure["observation_date"].min()

def day_substraction(observation_date):
    return (observation_date - df_pure_date_min).days + 1

 

In [30]:
df_pure["day_counter"] = df_pure["observation_date"].apply(lambda x : day_substraction(x))
df_pure.drop("observation_date", axis=1, inplace=True)
df_pure.head(50)

Unnamed: 0,confirmed,deaths,recovered,day_counter
0,555.0,17.0,28.0,1
1,653.0,18.0,30.0,2
2,941.0,26.0,36.0,3
3,1438.0,42.0,39.0,4
4,2118.0,56.0,52.0,5
5,2927.0,82.0,61.0,6
6,5578.0,131.0,107.0,7
7,6165.0,133.0,126.0,8
8,8235.0,171.0,143.0,9
9,9925.0,213.0,222.0,10


In [31]:
df_pure["infected"] = df_pure["confirmed"] - df_pure["recovered"] - df_pure["deaths"]
df_pure

Unnamed: 0,confirmed,deaths,recovered,day_counter,infected
0,555.0,17.0,28.0,1,510.0
1,653.0,18.0,30.0,2,605.0
2,941.0,26.0,36.0,3,879.0
3,1438.0,42.0,39.0,4,1357.0
4,2118.0,56.0,52.0,5,2010.0
5,2927.0,82.0,61.0,6,2784.0
6,5578.0,131.0,107.0,7,5340.0
7,6165.0,133.0,126.0,8,5906.0
8,8235.0,171.0,143.0,9,7921.0
9,9925.0,213.0,222.0,10,9490.0


Lets Predict "Confirmed"

In [35]:
y =df_pure["confirmed"]
X = df_pure[["day_counter"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=101)


Model

In [36]:
model = LinearRegression()
lin_model = model.fit(X_train, y_train)
lin_model

Prediction and Metrics

In [37]:
y_pred = lin_model.predict(X_test)

In [39]:
R2 = r2_score(y_test, y_pred)
n = X_test.shape[0] #sample size
p = X_test.shape[1] #number of predictions
adj_R2 = 1 - (1 - R2) * (n - 1)/(n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print("R^2 =", R2)
print("adjusted R^2 =", adj_R2)
print("MSE: ", MSE)
print("RMSE:", RMSE)
print("MAE :", MAE)

R^2 = 0.9081493592042216
adjusted R^2 = 0.8989642951246437
MSE:  181528979.18849102
RMSE: 13473.269060940296
MAE : 9892.347140961212


Predict next 7 days

In [41]:
next_7_days = np.array([51, 52, 53, 54, 55, 56, 57])
lin_model.predict(next_7_days.reshape(-1, 1))



array([138135.84478579, 141198.32745711, 144260.81012843, 147323.29279974,
       150385.77547106, 153448.25814238, 156510.7408137 ])

Predicting Deaths


In [44]:
y =df_pure["deaths"]
X = df_pure[["day_counter"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=101)

lin_reg = LinearRegression()
lin_model = lin_reg.fit(X_train, y_train)
lin_model

In [46]:
y_pred = lin_model.predict(X_test)

R2 = r2_score(y_test, y_pred)
n = X_test.shape[0] #sample size
p = X_test.shape[1] #number of predictions
adj_R2 = 1 - (1 - R2) * (n - 1)/(n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print("R^2 =", R2)
print("adjusted R^2 =", adj_R2)
print("MSE: ", MSE)
print("RMSE:", RMSE)
print("MAE :", MAE)

R^2 = 0.862565303753617
adjusted R^2 = 0.8488218341289786
MSE:  468237.21560593665
RMSE: 684.2786096364088
MAE : 551.8960403393712


Predicting Recovered

In [47]:
y =df_pure["recovered"]
X = df_pure[["day_counter"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=101)

lin_reg = LinearRegression()
lin_model = lin_reg.fit(X_train, y_train)
lin_model 

In [48]:
y_pred = lin_model.predict(X_test)

R2 = r2_score(y_test, y_pred)
p = X_test.shape[1] #number of predictions
adj_R2 = 1 - (1 - R2) * (n - 1)/(n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print("R^2 =", R2)
print("adjusted R^2 =", adj_R2)
print("MSE: ", MSE)
print("RMSE:", RMSE)
print("MAE :", MAE)

R^2 = 0.8707546732091536
adjusted R^2 = 0.8578301405300689
MSE:  90873569.84641373
RMSE: 9532.762970220845
MAE : 8788.989756344141


Predicting Infected

In [49]:
y =df_pure["infected"]
X = df_pure[["day_counter"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=101)

lin_reg = LinearRegression()
lin_model = lin_reg.fit(X_train, y_train)
lin_model 

In [50]:
y_pred = lin_model.predict(X_test)

R2 = r2_score(y_test, y_pred)
p = X_test.shape[1] #number of predictions
adj_R2 = 1 - (1 - R2) * (n - 1)/(n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

print("R^2 =", R2)
print("adjusted R^2 =", adj_R2)
print("MSE: ", MSE)
print("RMSE:", RMSE)
print("MAE :", MAE)

R^2 = 0.4772961052709054
adjusted R^2 = 0.425025715797996
MSE:  237295853.22212473
RMSE: 15404.410187414665
MAE : 13951.213674016859
