In [38]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go
import pandas as pd
import numpy as np

# Load the daily case time series and add a parsed datetime column built from
# the ISO-formatted 'Date_YMD' strings.
cases = (
    pd.read_csv("data/case_time_series.csv")
    .assign(Case_Date=lambda frame: pd.to_datetime(frame['Date_YMD'], format='%Y-%m-%d'))
)

# cases.head()

# Hold back the three most recent rows so they can serve as days to predict.
cases = cases.drop(index=cases.index[-3:])

# cases.tail()

Unnamed: 0,Date,Date_YMD,Daily Confirmed,Total Confirmed,Daily Recovered,Total Recovered,Daily Deceased,Total Deceased,Case_Date
459,3 May 2021,2021-05-03,355769,20275414,318910,16600620,3439,221795,2021-05-03
460,4 May 2021,2021-05-04,382847,20658261,337699,16938319,3786,225581,2021-05-04
461,5 May 2021,2021-05-05,412624,21070885,330718,17269037,3979,229560,2021-05-05
462,6 May 2021,2021-05-06,414280,21485165,328347,17597384,3923,233483,2021-05-06
463,7 May 2021,2021-05-07,406902,21892067,327675,17925059,4233,237716,2021-05-07


In [73]:
# Load the ICMR testing numbers.
tests = pd.read_csv("data/tested_numbers_icmr_data.csv")

# 'Update Time Stamp' values look like "07/05/2021 09:00:00": a day/month/year
# date followed by a time. Keep only the date part and parse it with an explicit
# day-first format so an ambiguous value such as 07/05/2021 can never be read as
# month-first. This also avoids `infer_datetime_format`, which is deprecated in
# pandas 2.x, and the vectorised `.str` accessor tolerates missing timestamps.
tests['Testing_Date'] = pd.to_datetime(
    tests['Update Time Stamp'].str.split(' ', n=1).str[0],
    format='%d/%m/%Y',
)
tests.tail()


Unnamed: 0,Update Time Stamp,Tested As Of,Daily RTPCR Samples Collected_ICMR Application,Total RTPCR Samples Collected_ICMR Application,Total Samples Tested,Total Individuals Tested,Total Positive Cases,Tests conducted by Private Labs,Positive cases from samples reported,Sample Reported today,...,Registration (Above 45 Years),Total Doses Administered,Total Individuals Vaccinated,Total Sessions Conducted,Total Individuals Registered,Source,Source 2,Source 3,Source 4,Testing_Date
422,07/05/2021 09:00:00,6/5/2021,1001025.0,144569326.0,298601699.0,,,,,1826490,...,128634493.0,164973058.0,132087824.0,2411300.0,184811384.0,https://twitter.com/ICMRDELHI/status/139050530...,,,https://pib.gov.in/PressReleasePage.aspx?PRID=...,2021-05-07
423,08/05/2021 09:00:00,7/5/2021,980291.0,145549617.0,300410043.0,,,,,1808344,...,129757035.0,167346544.0,133112446.0,2437299.0,187851699.0,https://twitter.com/ICMRDELHI/status/139087731...,,,https://pib.gov.in/PressReleasePage.aspx?PRID=...,2021-05-08
424,09/05/2021 09:00:00,8/5/2021,934541.0,146484158.0,302275471.0,,,,,1865428,...,130795455.0,169439663.0,133980544.0,2454021.0,191070066.0,https://twitter.com/ICMRDELHI/status/139123636...,,,https://twitter.com/MoHFW_INDIA/status/1391290...,2021-05-09
425,10/05/2021 09:00:00,9/5/2021,580038.0,147064196.0,303750077.0,,,,,1474606,...,131359680.0,170176603.0,134404867.0,2470799.0,193769891.0,https://twitter.com/ICMRDELHI/status/139159328...,,,https://pib.gov.in/PressReleasePage.aspx?PRID=...,2021-05-10
426,11/05/2021 09:00:00,10/5/2021,1013002.0,148077198.0,305600187.0,,,,,1850110,...,132508810.0,172710066.0,135494474.0,2515519.0,197663279.0,https://twitter.com/ICMRDELHI/status/139195637...,,,https://pib.gov.in/PressReleasePage.aspx?PRID=...,2021-05-11


In [74]:
# Some dates occur more than once in tests; keep only the first row per Testing_Date.
# .copy() gives an independent frame so later column assignments act on tests itself.
tests = tests.drop_duplicates(subset=['Testing_Date'], keep='first').copy()
tests.head()

Unnamed: 0,Update Time Stamp,Tested As Of,Daily RTPCR Samples Collected_ICMR Application,Total RTPCR Samples Collected_ICMR Application,Total Samples Tested,Total Individuals Tested,Total Positive Cases,Tests conducted by Private Labs,Positive cases from samples reported,Sample Reported today,...,Registration (Above 45 Years),Total Doses Administered,Total Individuals Vaccinated,Total Sessions Conducted,Total Individuals Registered,Source,Source 2,Source 3,Source 4,Testing_Date
0,13/03/2020 00:00:00,13/03/2020,,,6500.0,5900.0,78,,,,...,,,,,,Press_Release_ICMR_13March2020.pdf,,,,2020-03-13
1,18/03/2020 18:00:00,18/03/2020,,,13125.0,12235.0,150,,,,...,,,,,,ICMR_website_update_18March_6PM_IST.pdf,,,,2020-03-18
2,19/03/2020 10:00:00,19/03/2020,,,13316.0,12426.0,168,,,,...,,,,,,ICMR_website_update_19March_10AM_IST_V2.pdf,,,,2020-03-19
4,20/03/2020 10:00:00,20/03/2020,,,14376.0,13486.0,206,,,,...,,,,,,ICMR_website_update_20March_10AM_IST.pdf,,,,2020-03-20
6,21/03/2020 10:00:00,21/03/2020,,,15701.0,14811.0,271,,,,...,,,,,,ICMR_website_update_21March_10AM_IST.pdf,,,,2020-03-21


In [75]:
# Column holding the daily RT-PCR sample counts; exposed below as 'Daily_Tests'.
rtpcr_col = "Daily RTPCR Samples Collected_ICMR Application"

# Remove the last three rows, then discard rows that have no daily sample count.
# Gotcha: this cell reads and rebinds `tests`, so running it again trims three
# more rows each time; re-run the loading cell first if you need to repeat it.
tests = tests.drop(index=tests.index[-3:]).dropna(subset=[rtpcr_col])
tests = tests.assign(Daily_Tests=tests[rtpcr_col])

In [76]:
# Inner-join cases and tests on the calendar date, so only dates present in
# both frames survive.
case_columns = cases[['Case_Date', 'Daily Confirmed', 'Daily Recovered']]
test_columns = tests[['Testing_Date', 'Daily_Tests']]
result = case_columns.merge(
    test_columns,
    how='inner',
    left_on='Case_Date',
    right_on='Testing_Date',
)
result.tail()

Unnamed: 0,Case_Date,Daily Confirmed,Daily Recovered,Testing_Date,Daily_Tests
335,2021-05-03,355769,318910,2021-05-03,584547.0
336,2021-05-04,382847,337699,2021-05-04,1065669.0
337,2021-05-05,412624,330718,2021-05-05,1037883.0
338,2021-05-06,414280,328347,2021-05-06,1014480.0
339,2021-05-07,406902,327675,2021-05-07,1001025.0


In [98]:
from sklearn.linear_model import LinearRegression

# Fit a one-feature linear model on every joined row:
# daily confirmed cases as a function of daily tests.
features = ['Daily_Tests']
target = ['Daily Confirmed']

x_train = result[features]
y_train = result[target]  # list selector keeps y two-dimensional, so predictions are (n, 1)

model = LinearRegression()
model.fit(x_train, y_train)

# tests for next 3 days = 934541, 580038, 1013002
# confirmed for next 3 days = 403808, 366455, 329491
# NOTE(review): the three test counts above equal the daily RT-PCR values on the
# Testing_Date rows 2021-05-09 .. 2021-05-11 shown in the tests.tail() output,
# while the training join pairs each Case_Date with the same Testing_Date and
# ends at 2021-05-07. Confirm the hold-out pairs line up on the same days.

# Build the hold-out inputs as a DataFrame carrying the training column names:
# the model was fitted on a DataFrame, and predicting from a bare list makes
# scikit-learn warn that X does not have valid feature names.
x_test = pd.DataFrame([[934541], [580038], [1013002]], columns=features)

y_pred = model.predict(x_test)
y_pred

array([[221985.39353552],
       [112872.33748296],
       [246135.03258212]])

In [99]:
from sklearn.metrics import mean_absolute_error

# Observed confirmed counts for the three held-out days.
y_test = [403808, 366455, 329491]

# Average absolute gap between the observed and the predicted daily counts.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean absolute error of our model", mae)

Mean absolute error of our model 172920.4121331323


In [107]:
from sklearn.linear_model import LinearRegression

# The second wave has behaved differently (mutations, new strains), so refit
# using only the rows dated 2021-03-01 or later.
filteredResults = result[result['Case_Date'] >= '2021-03-01']

features = ['Daily_Tests']
target = ['Daily Confirmed']

x_train = filteredResults[features]
y_train = filteredResults[target]  # list selector keeps y two-dimensional, so predictions are (n, 1)

model = LinearRegression()
model.fit(x_train, y_train)

# tests for next 3 days = 934541, 580038, 1013002
# confirmed for next 3 days = 403808, 366455, 329491
# NOTE(review): only the third held-out day is predicted here; scoring a single
# point says little about the model, so consider evaluating all three days.

# Build the hold-out input as a DataFrame carrying the training column name:
# the model was fitted on a DataFrame, and predicting from a bare list makes
# scikit-learn warn that X does not have valid feature names.
x_test = pd.DataFrame([[1013002]], columns=features)

y_pred = model.predict(x_test)
y_pred[0][0]

328187.01576283155

In [106]:
from sklearn.metrics import mean_absolute_error

# Observed confirmed count for the single held-out day being scored.
y_test = [329491]

# Absolute gap between the observed and the predicted daily count.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean absolute error of our model", mae)

Mean absolute error of our model 1303.98423716845
