# **Installation**

In [47]:
pip install statsmodels --upgrade

Requirement already up-to-date: statsmodels in /usr/local/lib/python3.6/dist-packages (0.12.0)


# **Preprocess**



In [57]:
import pandas as pd
import numpy as np
from pandas import read_csv
from datetime import datetime
import io
import os
import matplotlib.pyplot as plt

# Working directory for the input CSV and for every file this notebook writes.
path = './'
# Folder that receives one CSV per country.
split_csv_folder = path + 'split_csv_country/'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(split_csv_folder, exist_ok=True)

In [58]:
def mape(actual, pred):
  """Return the mean absolute percentage error between two sequences, in percent.

  Both inputs are converted to numpy arrays. Wherever ``actual`` is 0 the
  denominator is replaced by 1, so the division never hits a zero.
  """
  actual_arr = np.array(actual)
  pred_arr = np.array(pred)
  # (actual_arr == 0) adds 1 exactly at the zero entries of the denominator.
  denominator = actual_arr + (actual_arr == 0)
  relative_error = np.abs((actual_arr - pred_arr) / denominator)
  return np.mean(relative_error) * 100

In [59]:
# Read the raw download and discard the 'countryterritoryCode' and 'geoId' columns.
unused_columns = ['countryterritoryCode', 'geoId']
train = read_csv(path + 'download.csv').drop(columns = unused_columns)

In [60]:
# Name the row index 'date'. Note that the frame still carries the default
# integer index produced by read_csv, so this index holds row numbers.
train.index.name = 'date'

# Replace every missing value in the numeric columns with 0.
zero_fill_columns = [
  'cases',
  'deaths',
  'popData2019',
  'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000',
]
# Assign the filled columns back to the frame. Calling
# train[col].fillna(0, inplace=True) is chained assignment on an intermediate
# Series: it raises a FutureWarning on pandas 2.x and has no effect on the
# frame once Copy-on-Write is enabled.
train[zero_fill_columns] = train[zero_fill_columns].fillna(0)

In [61]:
# Persist the cleaned frame (including its named index) as a CSV under `path`.
preprocessed_csv = path + 'proprocessed_data.csv'
train.to_csv(preprocessed_csv)

In [62]:
# Re-read the saved data and write one CSV per value of 'countriesAndTerritories'
# into split_csv_folder, keeping only the columns listed in columnName.
columnName = ['dateRep', 'day', 'month', 'year', 'cases', 'deaths', 'popData2019', 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000']
train = read_csv(path + 'proprocessed_data.csv')
# Group by the plain column label rather than a one-element list: with a list
# grouper pandas >= 2.0 yields 1-tuples as group keys, and concatenating a
# tuple into the file name below would raise TypeError.
for ctr, group in train.groupby('countriesAndTerritories'):
  group.to_csv((split_csv_folder + ctr + ".csv"), columns = columnName, index = False)

In [63]:
# Clean every per-country CSV in place: negative 'cases' values are replaced
# by their absolute value and the file is rewritten.
for file in os.listdir(split_csv_folder):
  if file.endswith(".csv"):
    df = pd.read_csv(split_csv_folder + file)
    # Vectorised sign fix. Assigning to the row yielded by df.iterrows() only
    # modifies a copy of that row, so a row-by-row loop would leave df (and
    # therefore the saved file) unchanged.
    df['cases'] = df['cases'].abs()
    df.to_csv(split_csv_folder + file, index = False)

# **Create a new CSV for saving the results**

In [64]:
# Result table: the unnamed first column holds seven date labels followed by a
# 'MAPE' row; one empty column is added per selected country file, named after
# the file without its '.csv' suffix. The empty table is then written to disk.
row_labels = ['10/9', '10/10', '10/11', '10/12', '10/13', '10/14', '10/15', 'MAPE']
result_df = pd.DataFrame({'': row_labels})
selected_files = {'Russia.csv', 'Greece.csv', 'India.csv', 'United_States_of_America.csv', 'Turkey.csv'}
for file in os.listdir(split_csv_folder):
  if file not in selected_files:
    continue
  result_df[file[:-4]] = ''
result_df.to_csv(path + 'result_regenerated.csv', index = False)

# **Autoregression**

In [65]:
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from math import sqrt
import numpy as np
# Best lag order and its hold-out MAPE per country,
# stored under the keys '<country>_lag' and '<country>_mape'.
model_para = {}

# Only these per-country files are modelled.
target_files = ('Russia.csv', 'Greece.csv', 'India.csv', 'United_States_of_America.csv', 'Turkey.csv')


def _ar_forecast(coef, seed, n_lags, steps):
  """Roll an autoregression forward `steps` times and return the raw predictions.

  coef   -- fitted parameters: coef[0] is the constant, coef[k] multiplies the
            value k steps back.
  seed   -- most recent observations, oldest first (at least n_lags of them).
  n_lags -- number of lagged values used per prediction.
  steps  -- number of predictions to produce.

  Each prediction is appended to the history unmodified, so later steps are
  computed from earlier predictions.
  """
  history = list(seed)
  forecasts = []
  for _ in range(steps):
    recent = history[len(history) - n_lags:]
    yhat = coef[0]
    for d in range(n_lags):
      yhat += coef[d + 1] * recent[n_lags - d - 1]
    forecasts.append(yhat)
    history.append(yhat)
  return forecasts


for file in os.listdir(split_csv_folder):
  if file not in target_files:
    continue
  country = file[:-4]
  df = pd.read_csv(split_csv_folder + file, header=0, index_col=0)
  # Reverse the row order before modelling
  # (presumably the files are stored newest-first -- TODO confirm).
  df = df.reindex(index = df.index[::-1])
  X = df['cases'].values
  X = X.astype('float32')
  # Hold out the last 10 observations for choosing the lag order.
  train = X[: len(X) - 10]
  test = X[len(X) - 10:]

  # Candidate lag orders are 1..49, fewer when the training series is shorter than 50.
  try_range = 50
  if len(train) < 50:
    try_range = len(train)

  # Named min_mape rather than `min` so the builtin min() is not shadowed
  # for the rest of the notebook.
  min_mape = 1000000
  opt_lag = -1
  for window in range(1, try_range):
    # Fit an autoregression with `window` lags on the training part.
    model = AutoReg(train, lags=window, old_names=False)
    model_fit = model.fit()
    coef = model_fit.params
    # Walk forward over the hold-out period, starting from the end of train.
    predictions = _ar_forecast(coef, train[len(train) - window:], window, len(test))
    mp = mape(test, predictions)
    if mp < min_mape:
      opt_lag = window
      min_mape = mp
  model_para[country + '_mape'] = min_mape
  model_para[country + '_lag'] = opt_lag

  # Refit on the training part with the selected lag order, then forecast
  # 7 steps past the end of the full series, seeded with its last opt_lag values.
  model = AutoReg(train, lags=opt_lag, old_names=False)
  model_fit = model.fit()
  coef = model_fit.params
  forecasts = _ar_forecast(coef, X[len(X) - opt_lag:], opt_lag, 7)
  for t, yhat in enumerate(forecasts):
    # Negative forecasts are reported as 0; the forecast history itself
    # keeps the unclipped values.
    if yhat < 0:
      yhat = 0
    # Write through result_df.loc[row, column]: chained indexing such as
    # result_df[column].loc[row] = ... can assign to a temporary copy.
    result_df.loc[t, country] = round(yhat)
  # Row 7 is the 'MAPE' row of the result table.
  result_df.loc[7, country] = min_mape
  result_df.to_csv(path + 'result_regenerated.csv', index = False)
  # pyplot.plot(test[:7])
  # pyplot.plot(result_df[country], color='red')
  # pyplot.show()