# ARIMA Forecasting model
---

In [1]:
import os
import numpy as np 
import pandas as pd

from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from tqdm.notebook import tqdm

import warnings 
warnings.filterwarnings('ignore')

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/covid19-global-forecasting-week-3/train.csv
/kaggle/input/covid19-global-forecasting-week-3/submission.csv
/kaggle/input/covid19-global-forecasting-week-3/test.csv
/kaggle/input/arimaparams/arima_params
/kaggle/input/arimaparams3/arima_params
/kaggle/input/arimaparams2/arima_params
/kaggle/input/arimaparams4/arima_params


Read intput and test dataframes from Kaggle. 

In [2]:
trainDf = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-3/train.csv", parse_dates=['Date'])
testDf  = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-3/test.csv", parse_dates=['Date'])  

# get rid of nan
trainDf['Province_State'] = trainDf['Province_State'].fillna('')
testDf['Province_State']  = testDf['Province_State'].fillna('')

Create a nested dictionary of countries and states

In [3]:
countries = dict()
for c in trainDf['Country_Region'].unique():
    countryDf = trainDf[trainDf['Country_Region'] == c]
    countries[c] = countryDf['Province_State'].unique()

Loss functions used for evaluating the model while training

In [4]:
def RMSLE(pred, target):
    return np.sqrt(np.mean(np.power(np.log(pred + 1) - np.log(target + 1), 2)))

def MSE(pred, target):
    return np.mean(np.power(pred  - target, 2))


Gicen a range of (P,Q,D) parameters, get the configuration with the best score.

In [5]:
def get_best_model(dataset,forcast_days, p, d, q):
    maxScore, maxCfg = float("inf"), (0,0,0)
    
    for pParam in range(p):
        for dParam in range(d):
            for qParam in range(q):
                order = (pParam,dParam,qParam)
                try:
                    loss = evaluate_arima(dataset, forcast_days, order)
                    if loss < maxScore:
                        maxScore, maxCfg = loss, order
                except:
                    continue
                    
    print('Cfg %s MSE %.3f' % (maxCfg, maxScore))
    return maxCfg, maxScore

Evaluate a configuration of ARIMA model.

In [6]:
def evaluate_arima(data, predDays, arimaPdq):

    # prepare training dataset
    split = int(data.shape[0] * 0.9)
    trainDays, testDays = data[:split], data[split:]
   
    # make predictions
    history   = trainDays
    model     = ARIMA(trainDays, order=arimaPdq)
    modelFit  = model.fit(transparams = True, maxiter = 800)
    
    predictions = modelFit.forecast(steps=testDays.shape[0])[0]
    
    model     = ARIMA(data, order=arimaPdq)
    modelFit  = model.fit(transparams = True, maxiter = 800)
    
    if np.isnan(modelFit.forecast(steps = predDays)[0]).sum()>0:
        return float('inf')

    error = RMSLE(predictions, testDays) 
    return error

In [7]:
testDf['ConfirmedCases'] = 0
testDf['Fatalities']     = 0

Iterate each country and make predictions

In [8]:
def get_arima_params():
    saveDf = testDf.copy()
    for country in tqdm(countries):
        for state in countries[country]:

            print('Country : '+country,'State : '+ state)
            # get country/state data
            dataDf    = trainDf[(trainDf['Province_State'] == state)&(trainDf['Country_Region'] == country)]
            targetDf  = testDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)]
            predDays  = targetDf.shape[0]

            dataDf = dataDf.sort_values(by='Date').reset_index(drop=True)


            if dataDf[dataDf['ConfirmedCases'] > 0].shape[0] > 0:
                dataDf = dataDf[dataDf['ConfirmedCases'] > 0]

            # confirmed cases
            history = dataDf['ConfirmedCases'].to_list()
            if len(history) == 1:
                history.append(history[0])

            # get best config   
            maxCfg, maxScore = get_best_model(np.array(history), predDays, 6, 6, 6)

            # create confirmed predictions
            model         = ARIMA(history, order=maxCfg)
            model_fit     = model.fit(maxiter = 900)
            predConfirmed = model_fit.forecast(steps=predDays)[0]
            
            # align dates
            finalPredConfirmed = dataDf[dataDf['Date'] > targetDf['Date'].min()]['ConfirmedCases']
            predConfirmed = np.concatenate([finalPredConfirmed, predConfirmed[:-finalPredConfirmed.shape[0]]])
            
            
            # fatalities
            history = dataDf['Fatalities'].to_list()
            if len(history) == 1:
                history.append(history[0])

            # get best config
            maxCfg, maxScore = get_best_model(np.array(history), predDays, 6, 6, 6)

            # make predictions
            model      = ARIMA(history, order=maxCfg)
            model_fit  = model.fit(maxiter = 900)
            predDeath  = model_fit.forecast(steps=predDays)[0]
            
            # align dates
            finalPredDeath = dataDf[dataDf['Date'] > targetDf['Date'].min()]['Fatalities']
            predDeath = np.concatenate([finalPredDeath, predDeath[:-finalPredDeath.shape[0]]])
            
            # save results
            tempDf = saveDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)]
            tempDf['Fatalities']     = predDeath
            tempDf['ConfirmedCases'] = predConfirmed
            saveDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)] = tempDf
    return saveDf

Load arima params because its faster from computer

In [9]:
def parse_arima_params():
    saveDf = testDf.copy()
    params = []
    with open("/kaggle/input/arimaparams4/arima_params", 'r') as f:
        lines = f.readlines();
        for i in tqdm(range(0,len(lines)-1, 3)):
#              print(lines[i])
            param = []
            line = lines[i].strip().split(':')
            line2 = lines[i+1]
            line3 = lines[i+2]
            
            
            country = line[1][1:-7] # country
            state = line[2][1:]        # state
            cfgConf = [int(line2[5]), int(line2[8]), int(line2[11])] 
            cfgDead = [int(line3[5]), int(line3[8]), int(line3[11])] 
            
            dataDf    = trainDf[(trainDf['Province_State'] == state)&(trainDf['Country_Region'] == country)]
            targetDf  = testDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)]
            predDays  = targetDf.shape[0]

            dataDf = dataDf.sort_values(by='Date').reset_index(drop=True)
            
            if dataDf[dataDf['ConfirmedCases'] > 0].shape[0] > 0:
                dataDf = dataDf[dataDf['ConfirmedCases'] > 0]
            
            # confirmed cases
            history = dataDf['ConfirmedCases'].to_list()
            if len(history) == 1:
                history.append(history[0])
            
            # create confirmed predictions
            model         = ARIMA(history, order=cfgConf)
            model_fit     = model.fit()
            predConfirmed = model_fit.forecast(steps=predDays)[0]
            
            # align dates
            finalPredConfirmed = dataDf[dataDf['Date'] > targetDf['Date'].min()]['ConfirmedCases']
            predConfirmed = np.concatenate([finalPredConfirmed, predConfirmed[:-finalPredConfirmed.shape[0]]])
            
            # fatalities
            history = dataDf['Fatalities'].to_list()
            if len(history) == 1:
                history.append(history[0])

            # make predictions
            model      = ARIMA(history, order=cfgDead)
            model_fit  = model.fit()
            predDeath  = model_fit.forecast(steps=predDays)[0]
            
            # align dates
            finalPredDeath = dataDf[dataDf['Date'] > targetDf['Date'].min()]['Fatalities']
            predDeath = np.concatenate([finalPredDeath, predDeath[:-finalPredDeath.shape[0]]])
            
            # save results
            tempDf = saveDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)]
            tempDf['Fatalities']     = predDeath
            tempDf['ConfirmedCases'] = predConfirmed
            saveDf[(testDf['Province_State'] == state)&(testDf['Country_Region']  == country)] = tempDf
            
    return saveDf


In [10]:
saveDf = get_arima_params()

HBox(children=(FloatProgress(value=0.0, max=180.0), HTML(value='')))

Country : Afghanistan State : 
Cfg (3, 0, 0) MSE 0.066
Cfg (3, 2, 2) MSE 0.103
Country : Albania State : 
Cfg (1, 2, 0) MSE 0.011
Cfg (0, 1, 0) MSE 0.027
Country : Algeria State : 
Cfg (5, 2, 4) MSE 0.151
Cfg (0, 2, 0) MSE 0.476
Country : Andorra State : 
Cfg (0, 1, 3) MSE 0.027
Cfg (1, 2, 0) MSE 0.095
Country : Angola State : 
Cfg (3, 0, 0) MSE 0.002
Cfg (0, 2, 0) MSE 0.000
Country : Antigua and Barbuda State : 
Cfg (0, 2, 2) MSE 0.266
Cfg (0, 0, 0) MSE 0.000
Country : Argentina State : 
Cfg (5, 2, 0) MSE 0.055
Cfg (0, 2, 2) MSE 0.051
Country : Armenia State : 
Cfg (0, 2, 0) MSE 0.030
Cfg (0, 2, 3) MSE 0.178
Country : Australia State : Australian Capital Territory
Cfg (5, 1, 0) MSE 0.004
Cfg (0, 2, 0) MSE 0.000
Country : Australia State : New South Wales
Cfg (2, 1, 0) MSE 0.032
Cfg (5, 2, 2) MSE 0.076
Country : Australia State : Northern Territory
Cfg (0, 2, 1) MSE 0.018
Cfg (0, 0, 0) MSE 0.000
Country : Australia State : Queensland
Cfg (1, 1, 4) MSE 0.019
Cfg (3, 2, 0) MSE 0.182
Coun

Save prediction

In [11]:
output=pd.DataFrame()
output['ForecastId']     = saveDf['ForecastId'].astype(np.int32)
output['ConfirmedCases'] = saveDf['ConfirmedCases'].astype(np.int32)
output['Fatalities']     = saveDf['Fatalities'].astype(np.int32)
output.to_csv('submission.csv', index=False)
output

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,110,4
1,2,110,4
2,3,120,4
3,4,170,4
4,5,174,4
...,...,...,...
13153,13154,26,1
13154,13155,27,1
13155,13156,27,1
13156,13157,28,1
