# Corona Virus Prediction

In [1]:
import pandas as pd
import numpy as np
import seaborn as sea
import matplotlib.pyplot as plt
import pycountry_convert as pc
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import folium
import branca

from datetime import datetime, timedelta,date
from scipy.interpolate import make_interp_spline, BSpline
from sklearn.preprocessing import MinMaxScaler
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# Make "plotly_white" the default template for plotly figures.
pio.templates.default = "plotly_white"
# Apply seaborn's "darkgrid" style.
sea.set(style="darkgrid")

In [2]:
# The data is gathered from https://github.com/CSSEGISandData/COVID-19
# List the contents of the local data directory (IPython shell escape).
!ls ../Data/

README.md                      [34mcsse_covid_19_data[m[m
[34marchived_data[m[m                  [34mwho_covid_19_situation_reports[m[m


In [3]:
# Load the global time-series tables for confirmed cases and deaths
TIME_SERIES_DIR = "../Data/csse_covid_19_data/csse_covid_19_time_series"

time_series_confirmed = pd.read_csv(f"{TIME_SERIES_DIR}/time_series_covid19_confirmed_global.csv")
time_series_deaths = pd.read_csv(f"{TIME_SERIES_DIR}/time_series_covid19_deaths_global.csv")

## Utils

In [4]:
# Number of most recent training observations (per country) passed to the
# forecaster when fitting each country's trend
N = 5

In [5]:
# Unique countries/regions: keep the first row seen for each name
is_first_occurrence = ~time_series_confirmed['Country/Region'].duplicated()
countries = time_series_confirmed.loc[is_first_occurrence, ['Country/Region']]
countries.head()

Unnamed: 0,Country/Region
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [6]:
# Helpers that split the time-series data into training and test sets
def get_test_train_data(cases, split_date=None):
    """Aggregate a wide time-series table per country and split it by date.

    Parameters
    ----------
    cases : pandas.DataFrame
        Wide table with a "Country/Region" column, optional "Province/State",
        "Lat" and "Long" columns, and one column per date.
    split_date : datetime-like, optional
        First date that belongs to the test set. Defaults to 2020-05-10.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Long-format frames with columns "Country/Region", "Date" and "Cases".
        ``train`` holds rows strictly before ``split_date``; ``test`` holds the
        rows on or after it. The input frame is not modified.
    """
    if split_date is None:
        split_date = pd.Timestamp(2020, 5, 10)
    else:
        split_date = pd.Timestamp(split_date)

    # Drop the non-date columns *before* summing: adding up coordinates is
    # meaningless, and the text column "Province/State" must not reach sum().
    non_date_cols = [col for col in ("Province/State", "Lat", "Long")
                     if col in cases.columns]
    per_country = (cases.drop(columns=non_date_cols)
                        .groupby("Country/Region")
                        .sum()
                        .reset_index())

    # Wide (one column per date) -> long (one row per country and date)
    df = per_country.melt(id_vars=["Country/Region"],
                          var_name="Date",
                          value_name="Cases")
    df["Date"] = pd.to_datetime(df["Date"])

    # Dividing into test and train set
    train = df[df["Date"] < split_date]
    test = df[df["Date"] >= split_date]
    return train, test

def get_test_train_country_data(cases, country, split_date=None):
    """Build the date-indexed series for one country and split it by date.

    Parameters
    ----------
    cases : pandas.DataFrame
        Wide table with a "Country/Region" column, optional "Province/State",
        "Lat" and "Long" columns, and one column per date.
    country : str
        Value of "Country/Region" to extract. A name that is not present
        raises ``KeyError`` from the ``.loc`` lookup.
    split_date : datetime-like, optional
        First date that belongs to the test set. Defaults to 2020-05-10.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Frames with columns "Date" and "Cases". ``train`` holds rows strictly
        before ``split_date``; ``test`` holds the rows on or after it.
    """
    if split_date is None:
        split_date = pd.Timestamp(2020, 5, 10)
    else:
        split_date = pd.Timestamp(split_date)

    # Remove the non-date columns by name instead of relying on their position
    # in the aggregated row.
    non_date_cols = [col for col in ("Province/State", "Lat", "Long")
                     if col in cases.columns]
    country_totals = (cases.drop(columns=non_date_cols)
                           .groupby("Country/Region")
                           .sum()
                           .loc[country])

    # The selected row is a Series named after the country; name the value
    # column "Cases" for every country, not just one hard-coded name.
    df = country_totals.rename("Cases").rename_axis("Date").reset_index()
    df["Date"] = pd.to_datetime(df["Date"])

    # Dividing into test and train set
    train = df[df["Date"] < split_date]
    test = df[df["Date"] >= split_date]
    return train, test

In [7]:
# Fit a straight line to the values of the given series (indexed 0, 1, 2, ...)
# and extrapolate it to get the predictions for the next H days

def get_predictions(series, pred_min, H):
    """Extrapolate a least-squares line through ``series`` for ``H`` more steps.

    Parameters
    ----------
    series : sequence of numbers
        Observed values, treated as equally spaced and indexed 0..len-1.
    pred_min : number
        Lower bound; any prediction below it is raised to ``pred_min``.
    H : int
        Number of future steps to predict.

    Returns
    -------
    numpy.ndarray
        ``H`` predictions converted with ``astype(int)``.
    """
    n_obs = len(series)

    # Regress the observed values on their position in the series.
    X_train = np.arange(n_obs).reshape(-1, 1)
    y_train = np.asarray(series).reshape(-1, 1)

    linearReg = LinearRegression(fit_intercept=True)
    linearReg.fit(X_train, y_train)

    # The positions immediately after the last observation.
    future_steps = np.arange(n_obs, n_obs + H).reshape(-1, 1)
    pred = linearReg.predict(future_steps).reshape(H,)

    # Clamp from below so a falling trend cannot go under pred_min.
    pred = np.maximum(pred, pred_min)
    return pred.astype(int)

## Predicting confirmed cases

In [52]:
# Split the confirmed-cases table into train and test sets, then preview
# the first rows of the training part.
train,test = get_test_train_data(time_series_confirmed)
train.head()

Unnamed: 0,Country/Region,Date,Cases
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


In [53]:
# Preview the first rows of the held-out test set
test.head()

Unnamed: 0,Country/Region,Date,Cases
20383,Afghanistan,2020-05-10,4402
20384,Albania,2020-05-10,868
20385,Algeria,2020-05-10,5723
20386,Andorra,2020-05-10,755
20387,Angola,2020-05-10,45


In [54]:
# Forecast horizon: the number of calendar days spanned by the test set,
# counting both the first and the last date
start = test["Date"].min()
end = test["Date"].max()
no_days_prediction = (end - start).days + 1

In [55]:
# Prediction of confirmed cases of COVID-19
# For every country: fit on its last N training observations and forecast
# each day from `start` to `end`.
forecast_dates = pd.date_range(start, end)

cr_list = []
date_list = []
confirmedcasespred_list = []

for country in countries['Country/Region']:
    country_history = train.loc[train['Country/Region'] == country, 'Cases']
    # .iloc makes the "last N rows" selection explicitly positional
    preds = get_predictions(country_history.iloc[-N:], 0, no_days_prediction)
    # extend() grows the lists in place; `lst = lst + ...` would copy the
    # whole list again on every iteration
    cr_list.extend([country] * no_days_prediction)
    date_list.extend(forecast_dates)
    confirmedcasespred_list.extend(preds)

results = pd.DataFrame({'Country/Region': cr_list,
                        'Date': date_list,
                        'Confirmed_Cases': confirmedcasespred_list})
# The dates are already timestamps; this only pins the column dtype
results['Date'] = pd.to_datetime(results['Date'])

In [56]:
# Root mean squared logarithmic error for the model
# Pair each prediction with its actual value by (country, date) key instead of
# trusting that two independently sorted frames line up row by row.
key_cols = ["Country/Region", "Date"]
scored = (test.merge(results, on=key_cols, how="inner", validate="one_to_one")
              .sort_values(key_cols))
if not (len(scored) == len(test) == len(results)):
    raise ValueError("Predictions and test data do not cover the same (country, date) pairs")

y_pred = scored["Confirmed_Cases"].values
y_test = scored["Cases"].values
error = np.sqrt(mean_squared_log_error(y_test,y_pred))
print(error)

0.06871055226664113


In [57]:
fig = go.Figure()

# Total predicted confirmed cases per day across all countries.
# Only the numeric forecast column is summed; the country-name column is text.
result_cases = results.groupby("Date")[["Confirmed_Cases"]].sum()
fig.add_trace(go.Scatter(x=result_cases.index, y=result_cases["Confirmed_Cases"].values,
                    mode='lines+markers',
                    name='Confirmed'))
fig.update_layout(title='COVID-19 Confirmed Cases Prediction',
                   xaxis_title='Days',
                   yaxis_title='Confirmed Cases')
fig.show()

In [63]:
fig = go.Figure()

# Predicted confirmed cases for the US only
result_cases_us = results[results["Country/Region"] == "US"]

# The legend label must name the plotted country (it previously said 'India')
fig.add_trace(go.Scatter(x=result_cases_us["Date"], y=result_cases_us["Confirmed_Cases"].values,
                    mode='lines+markers',
                    name='US'))
fig.update_layout(title='COVID-19 Confirmed Cases Prediction US',
                   xaxis_title='Days',
                   yaxis_title='Confirmed Cases')
fig.show()

In [64]:
# Predicted confirmed cases for India and China on one figure
result_cases_in = results[results["Country/Region"] == "India"]
result_cases_ch = results[results["Country/Region"] == "China"]

fig = go.Figure()
for label, country_forecast in (("India", result_cases_in), ("China", result_cases_ch)):
    fig.add_trace(go.Scatter(x=country_forecast["Date"],
                             y=country_forecast["Confirmed_Cases"].values,
                             mode='lines+markers',
                             name=label))
fig.update_layout(title='COVID-19 Confirmed Cases Prediction India and China',
                  xaxis_title='Days',
                  yaxis_title='Confirmed Cases')
fig.show()

## Predicting deaths 

In [66]:
# Split the deaths table into train and test sets, then preview the first
# rows of the training part.
# NOTE: this rebinds `train` and `test`, replacing the confirmed-cases split.
train,test = get_test_train_data(time_series_deaths)
train.head()

Unnamed: 0,Country/Region,Date,Cases
0,Afghanistan,2020-01-22,0
1,Albania,2020-01-22,0
2,Algeria,2020-01-22,0
3,Andorra,2020-01-22,0
4,Angola,2020-01-22,0


In [67]:
# Preview the first rows of the held-out deaths test set
test.head()

Unnamed: 0,Country/Region,Date,Cases
20383,Afghanistan,2020-05-10,120
20384,Albania,2020-05-10,31
20385,Algeria,2020-05-10,502
20386,Andorra,2020-05-10,48
20387,Angola,2020-05-10,2


In [68]:
# Forecast horizon: the number of calendar days spanned by the test set,
# counting both the first and the last date
start = test["Date"].min()
end = test["Date"].max()
no_days_prediction = (end - start).days + 1

In [69]:
# Prediction of deaths of COVID-19
# For every country: fit on its last N training observations and forecast
# each day from `start` to `end`.
forecast_dates = pd.date_range(start, end)

cr_list = []
date_list = []
deathspred_list = []

for country in countries['Country/Region']:
    country_history = train.loc[train['Country/Region'] == country, 'Cases']
    # .iloc makes the "last N rows" selection explicitly positional
    preds = get_predictions(country_history.iloc[-N:], 0, no_days_prediction)
    # extend() grows the lists in place; `lst = lst + ...` would copy the
    # whole list again on every iteration
    cr_list.extend([country] * no_days_prediction)
    date_list.extend(forecast_dates)
    deathspred_list.extend(preds)

results = pd.DataFrame({'Country/Region': cr_list,
                        'Date': date_list,
                        'Deaths': deathspred_list})
# The dates are already timestamps; this only pins the column dtype
results['Date'] = pd.to_datetime(results['Date'])

In [70]:
# Root mean squared logarithmic error for the model
# Pair each prediction with its actual value by (country, date) key instead of
# trusting that two independently sorted frames line up row by row.
key_cols = ["Country/Region", "Date"]
scored = (test.merge(results, on=key_cols, how="inner", validate="one_to_one")
              .sort_values(key_cols))
if not (len(scored) == len(test) == len(results)):
    raise ValueError("Predictions and test data do not cover the same (country, date) pairs")

y_pred = scored["Deaths"].values
y_test = scored["Cases"].values
error = np.sqrt(mean_squared_log_error(y_test,y_pred))
print(error)

0.054708482469718184


In [71]:
fig = go.Figure()

# Total predicted deaths per day across all countries.
# Only the numeric forecast column is summed; the country-name column is text.
result_cases = results.groupby("Date")[["Deaths"]].sum()
# The legend label must match the plotted quantity (it previously said 'Confirmed')
fig.add_trace(go.Scatter(x=result_cases.index, y=result_cases["Deaths"].values,
                    mode='lines+markers',
                    name='Deaths'))
fig.update_layout(title='COVID-19 Deaths Prediction',
                   xaxis_title='Days',
                   yaxis_title='Deaths')
fig.show()

In [72]:
fig = go.Figure()

# Predicted deaths for the US only
result_cases_us = results[results["Country/Region"] == "US"]

# The legend label must name the plotted country (it previously said 'India')
fig.add_trace(go.Scatter(x=result_cases_us["Date"], y=result_cases_us["Deaths"].values,
                    mode='lines+markers',
                    name='US'))
fig.update_layout(title='COVID-19 Deaths Prediction US',
                   xaxis_title='Days',
                   yaxis_title='Deaths')
fig.show()

In [73]:
# Predicted deaths for India and China on one figure
result_cases_in = results[results["Country/Region"] == "India"]
result_cases_ch = results[results["Country/Region"] == "China"]

fig = go.Figure()
for label, country_forecast in (("India", result_cases_in), ("China", result_cases_ch)):
    fig.add_trace(go.Scatter(x=country_forecast["Date"],
                             y=country_forecast["Deaths"].values,
                             mode='lines+markers',
                             name=label))
fig.update_layout(title='COVID-19 Deaths Prediction India and China',
                  xaxis_title='Days',
                  yaxis_title='Deaths')
fig.show()