## ARIMA Time Series Modeling for XPRIZE competition

In [32]:
import numpy as np
import pandas as pd
import scipy
from datetime import datetime
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA


import warnings
warnings.filterwarnings('ignore')
import glob

%matplotlib inline

In [2]:
# Dates of the daily reports to download, newest first: yesterday back to
# 22 Jan 2020, formatted as MM-DD-YYYY because that string is later used as
# the CSV file name.
first_report_day = dt.datetime(2020, 1, 22)
base = dt.datetime.today() - dt.timedelta(days=1)
numdays = (dt.datetime.today() - first_report_day).days
day_offsets = (dt.timedelta(days=back) for back in range(numdays))
date_list = [(base - offset).strftime("%m-%d-%Y") for offset in day_offsets]

In [3]:
import glob

# Base URL of the per-day report CSVs; each file is named <MM-DD-YYYY>.csv
path =r'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Download one frame per date in date_list (one HTTP request per file).
dfs = [pd.read_csv(path + date + ".csv") for date in date_list]

# Stack all daily reports into a single frame with a fresh 0..n-1 index.
big_frame = pd.concat(dfs, ignore_index=True)

In [4]:
# Preview the first rows of the concatenated daily reports.
big_frame.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,...,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Incidence_Rate,Case-Fatality_Ratio,Province/State,Country/Region,Last Update,Latitude,Longitude
0,,,,Afghanistan,2020-12-21 05:27:58,33.93911,67.709953,49817.0,2067.0,39006.0,...,Afghanistan,127.971033,4.149186,,,,,,,
1,,,,Albania,2020-12-21 05:27:58,41.1533,20.1683,53003.0,1088.0,28121.0,...,Albania,1841.788866,2.052714,,,,,,,
2,,,,Algeria,2020-12-21 05:27:58,28.0339,1.6596,95203.0,2666.0,63260.0,...,Algeria,217.10544,2.800332,,,,,,,
3,,,,Andorra,2020-12-21 05:27:58,42.5063,1.5218,7577.0,81.0,6997.0,...,Andorra,9806.510063,1.069025,,,,,,,
4,,,,Angola,2020-12-21 05:27:58,-11.2027,17.8739,16644.0,387.0,9592.0,...,Angola,50.641588,2.325162,,,,,,,


In [5]:
# Show column dtypes and non-null counts of the concatenated daily reports.
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1027155 entries, 0 to 1027154
Data columns (total 21 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   FIPS                 856594 non-null   float64
 1   Admin2               858152 non-null   object 
 2   Province_State       972520 non-null   object 
 3   Country_Region       1019538 non-null  object 
 4   Last_Update          1019538 non-null  object 
 5   Lat                  999388 non-null   float64
 6   Long_                999388 non-null   float64
 7   Confirmed            1027136 non-null  float64
 8   Deaths               1026713 non-null  float64
 9   Recovered            1026764 non-null  float64
 10  Active               1019076 non-null  float64
 11  Combined_Key         1019538 non-null  object 
 12  Incident_Rate        163307 non-null   float64
 13  Case_Fatality_Ratio  165121 non-null   float64
 14  Incidence_Rate       624438 non-null   float64
 15

In [6]:
# Discard rows whose Province_State cell holds the literal text
# "Province_State" (presumably header lines that ended up as data rows).
is_header_row = big_frame["Province_State"] == "Province_State"
big_frame = big_frame[~is_header_row]

In [7]:
# Parse Last_Update as timestamps, then keep only the calendar date so that
# rows can later be grouped per day.
last_update_ts = big_frame['Last_Update'].astype('datetime64[ns]')
big_frame['Last_Update'] = last_update_ts.dt.date

In [8]:
# Sum the numeric columns per (country, update date).
# numeric_only=True is passed explicitly: the frame still carries text columns
# (Admin2, Province_State, Combined_Key, ...), and since pandas 2.0 a bare
# .sum() no longer drops them silently but tries to add the strings together.
df_sum = (
    big_frame
    .groupby(["Country_Region","Last_Update"], as_index=False)
    .sum(numeric_only=True)
)
df_sum.head()

Unnamed: 0,Country_Region,Last_Update,FIPS,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,Incidence_Rate,Case-Fatality_Ratio,Latitude,Longitude
0,Afghanistan,2020-03-22,0.0,33.93911,67.709953,34.0,1.0,1.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,2020-03-23,0.0,33.93911,67.709953,41.0,1.0,1.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Afghanistan,2020-03-24,0.0,33.93911,67.709953,43.0,1.0,1.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,2020-03-25,0.0,33.93911,67.709953,76.0,2.0,2.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Afghanistan,2020-03-26,0.0,33.93911,67.709953,80.0,3.0,2.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Tag every aggregated row with a running record number (0..n-1) and
# renumber the index to match.
row_numbers = np.arange(len(df_sum))
df_sum = df_sum.assign(rec_id=row_numbers)
df_sum = df_sum.reset_index(drop=True)
df_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51435 entries, 0 to 51434
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country_Region       51435 non-null  object 
 1   Last_Update          51435 non-null  object 
 2   FIPS                 51435 non-null  float64
 3   Lat                  51435 non-null  float64
 4   Long_                51435 non-null  float64
 5   Confirmed            51435 non-null  float64
 6   Deaths               51435 non-null  float64
 7   Recovered            51435 non-null  float64
 8   Active               51435 non-null  float64
 9   Incident_Rate        51435 non-null  float64
 10  Case_Fatality_Ratio  51435 non-null  float64
 11  Incidence_Rate       51435 non-null  float64
 12  Case-Fatality_Ratio  51435 non-null  float64
 13  Latitude             51435 non-null  float64
 14  Longitude            51435 non-null  float64
 15  rec_id               51435 non-null 

In [10]:
# Columns that are not used by any later cell: identifiers, coordinates and rates.
unused_columns = [
    'FIPS',
    "Lat",
    "Long_",
    "Incident_Rate",
    "Incidence_Rate",
    "Case_Fatality_Ratio",
    "Case-Fatality_Ratio",
    "Latitude",
    "Longitude",
]
df_sum.drop(columns=unused_columns, inplace=True)

In [11]:
# Trim leading/trailing whitespace from the country names and write the
# cleaned values back into df_sum (aligned on the row index).
Country_Region = df_sum["Country_Region"].str.strip()
df_sum.update(Country_Region.to_frame(name="Country_Region"))

In [12]:
# Remove the literal characters '*', ')' and '(' from the country names.
# regex=False is spelled out because all three are regular-expression
# metacharacters and the default of `regex` differs between pandas versions;
# a literal replacement is what is wanted here.
C1 = df_sum.Country_Region.str.replace("*", "", regex=False)
C2 = C1.str.replace(")", "", regex=False)
C3 = C2.str.replace("(", "", regex=False)
df_sum.update(pd.DataFrame(C3,columns=["Country_Region"]))

In [13]:
# Order the rows by country, then by update date within each country.
sort_keys = ["Country_Region", "Last_Update"]
df_sum.sort_values(by=sort_keys, inplace=True)

In [14]:
# Switch to the column names used by the rest of the notebook.
column_names = {
    "Last_Update": "Dated",
    "Confirmed": "total_cases",
    "Deaths": "total_deaths",
}
df_sum.rename(columns=column_names, inplace=True)

In [15]:
# Aggregate again per (country, date): after the name clean-up above, rows
# that previously had different country spellings can share the same key.
group_keys = ["Country_Region", "Dated"]
df_clean = df_sum.groupby(group_keys, as_index=False).sum()

In [16]:
# Keep only the rows that have a country name.
has_country = df_clean["Country_Region"].notnull()
df_clean = df_clean[has_country]

In [17]:
# Put the rows in chronological order (all countries interleaved).
df_clean.sort_values(by="Dated", inplace=True)

In [18]:
# Preview the earliest rows of the cleaned per-country, per-date frame.
df_clean.head()

Unnamed: 0,Country_Region,Dated,total_cases,total_deaths,Recovered,Active,rec_id
9841,China,2020-02-23,703.0,0.0,703.0,0.0,9841
9842,China,2020-03-08,20632.0,219.0,20413.0,0.0,9842
9843,China,2020-03-12,279.0,3.0,276.0,0.0,9843
9844,China,2020-03-13,266.0,0.0,266.0,0.0,9844
9845,China,2020-03-14,12471.0,66.0,12404.0,1.0,9845


In [19]:
# Distinct country names, in order of first appearance in df_clean.
countries = df_clean["Country_Region"].unique()

In [20]:
# Index the frame by the date (as datetimes, index name 'Dated') and remove
# the now redundant 'Dated' column.
dated_index = pd.to_datetime(df_clean["Dated"])
df_clean.set_index(dated_index, inplace=True)
df_clean.drop(columns=["Dated"], inplace=True)

In [61]:

# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    """Walk-forward validate one ARIMA order and return its out-of-sample MSE.

    The first 66% of ``X`` forms the initial training window. For every
    remaining observation the model is refitted on all values seen so far,
    a one-step forecast is recorded, and the true observation is then added
    to the history before the next step.

    Parameters
    ----------
    X : list, numpy array or pandas Series
        Observations in time order. Only the values are used; an index, if
        present, is ignored.
    arima_order : tuple
        ``(p, d, q)`` order, passed unchanged to ``ARIMA(..., order=...)``.

    Returns
    -------
    The value of ``mean_squared_error`` between the held-out observations
    and their one-step forecasts.

    Raises
    ------
    ValueError
        If ``X`` is too short to leave at least one training observation.
    """
    # Work on a plain array: indexing a date-indexed Series with `test[t]`
    # depends on pandas' deprecated integer-position fallback, and a failure
    # there would be silently swallowed by the grid search in the caller.
    values = np.asarray(X)

    # prepare training dataset
    train_size = int(len(values) * 0.66)
    if train_size < 1:
        raise ValueError(
            "need at least 2 observations for a train/test split, got %d" % len(values)
        )
    train, test = values[:train_size], values[train_size:]
    history = list(train)

    # make predictions, one step ahead at a time
    predictions = []
    for actual in test:
        model_fit = ARIMA(history, order=arima_order).fit()
        predictions.append(model_fit.forecast()[0])
        # the true value becomes available before the next forecast
        history.append(actual)

    # calculate out of sample error
    return mean_squared_error(test, predictions)

def evaluate_models(dataset, p_values, d, q_values):
    """Grid-search ARIMA orders and return the one with the lowest MSE.

    Every combination ``(p, d, q)`` with ``p`` from ``p_values`` and ``q``
    from ``q_values`` (``d`` is fixed) is scored with
    ``evaluate_arima_model``. Combinations that raise an error while being
    evaluated are skipped.

    Parameters
    ----------
    dataset : numpy array or pandas Series
        Observations in time order; converted to ``float32`` before scoring.
    p_values : iterable of int
        Candidate autoregressive orders.
    d : int
        Differencing order used for every candidate.
    q_values : iterable of int
        Candidate moving-average orders.

    Returns
    -------
    tuple or None
        The best ``(p, d, q)``, or ``None`` when no candidate could be
        evaluated (including when either candidate list is empty).
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for q in q_values:
            order = (p, d, q)
            try:
                mse = evaluate_arima_model(dataset, order)
                if mse < best_score:
                    best_score, best_cfg = mse, order
            except Exception:
                # Best effort: an order that cannot be fitted is skipped.
                # Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
                # still stops a long-running search.
                continue
    return best_cfg



In [77]:
# Forecast window requested from every fitted model (both ends inclusive).
# ISO dates are used so the month/day order cannot be misread.
forecast_start = pd.Timestamp("2020-12-22")
forecast_end = pd.Timestamp("2021-02-01")

# One prediction frame per country; they are concatenated once after the loop
# instead of growing `out` inside it.
country_frames = []

for i in ['China','Canada']: #loop over each countries data
    # Rows of this country in date order (the index is the 'Dated' datetime
    # index), running totals of the numeric columns, then one row per
    # calendar day with gaps forward-filled.
    # The text column and rec_id are dropped before cumsum so that only
    # numbers are accumulated; the country name is put back afterwards.
    # NOTE(review): this accumulates total_cases over time - confirm that the
    # source values are per-day figures and not already running totals.
    df = (
        df_clean[df_clean['Country_Region'] == i]
        .sort_index()
        .drop(columns=['rec_id', 'Country_Region'])
        .cumsum()
        .resample('D').ffill()
    )
    df['Country_Region'] = i

    # Day-over-day change of the running total; the first value is NaN
    # (nothing to subtract from) and is excluded from modelling.
    df['new_cases'] = df.total_cases.diff()
    new_cases = df.new_cases.iloc[1:]

    # Choose the (p, 0, q) order by grid search, then fit on the full series.
    best_order = evaluate_models(new_cases, [1,2,3], 0, [1,2,3])
    model1 = ARIMA(new_cases, order=best_order)
    model1_fit = model1.fit()
    predictions = model1_fit.predict(start=forecast_start, end=forecast_end)

    # Build the output frame from the prediction series itself. Renaming the
    # series and its index keeps the values regardless of what name the
    # model attached to them (pd.DataFrame(series, columns=[other_name])
    # would select a column that does not exist).
    finaldf = (
        pd.Series(predictions)
        .rename('PredictedDailyNewCases')
        .rename_axis('Date')
        .reset_index()
    )
    finaldf['CountryName'] = i
    finaldf['RegionName'] = None
    finaldf = finaldf[['CountryName','RegionName','Date','PredictedDailyNewCases']]
    country_frames.append(finaldf)

# Each country keeps its own 0..n-1 row labels, as before.
out = pd.concat(country_frames)

In [78]:
# Display the combined predictions for all countries.
out

Unnamed: 0,CountryName,RegionName,Date,PredictedDailyNewCases
0,China,,2020-12-22,94678.428884
1,China,,2020-12-23,94582.649863
2,China,,2020-12-24,94534.827190
3,China,,2020-12-25,94444.252375
4,China,,2020-12-26,94392.982458
...,...,...,...,...
37,Canada,,2021-01-28,514654.421119
38,Canada,,2021-01-29,514594.913146
39,Canada,,2021-01-30,514535.414994
40,Canada,,2021-01-31,514475.926661


In [79]:
# Write the predictions to predictions.csv in the current working directory.
# No `index=` argument is given, so pandas' default applies and the row
# labels are written as the first (unnamed) column.
out.to_csv("predictions.csv")
