# Main file for classification models

In [None]:
# import libraries
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
from classification_helper_functions_ import DTW, ts_data, performance_table, convert, model_performance, train_test, train_test_split_classification, lstm, train_test_lstm, split_series, performance_table_lstm, plot_acutal_predict, naive_baseline
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima_model import ARIMA
import timeit
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

## RF Classification

In [None]:
# main function for RF classification
def run_rf(CONT, FREQ, PREDICT_HORIZON):
    """Evaluate a random-forest classifier over every experiment setting.

    For each country in ``CONT``, frequency in ``FREQ`` and horizon ``p`` in
    ``PREDICT_HORIZON`` the dataset is rebuilt with ``ts_data``. The split
    point starts at 70% of the rows; a fresh 100-tree forest is fitted on the
    split returned by ``train_test_split_classification`` and asked for ``p``
    one-row predictions, after which the split point moves forward by ``p``.

    Returns the value of ``performance_table`` applied to the per-country
    dictionary of collected metrics.
    """
    columns = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
               'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    results = {country: {column: [] for column in columns} for country in CONT}

    for country in CONT:
        for freq in FREQ:
            for horizon in PREDICT_HORIZON:
                # build the dataset (second and third return values are unused here)
                data, _, _ = ts_data(country=country, category='total',
                                     frequency=freq, model=None)
                split_at = int(len(data) * 0.7)
                holdout = len(data) - split_at
                actual, predicted = [], []

                started = timeit.default_timer()

                for _ in range(int(holdout / horizon)):
                    x_train, x_test, y_train, y_test = train_test_split_classification(
                        data, split_at, horizon)

                    # refit the forest on the current training split
                    forest = RandomForestClassifier(n_estimators=100)
                    forest.fit(x_train, y_train)

                    # classify the next `horizon` rows one at a time
                    for step in range(horizon):
                        predicted.extend(forest.predict([x_test.iloc[step]]))
                        actual.append(list(y_test)[step])

                    split_at += horizon

                elapsed = convert(timeit.default_timer() - started)

                # score predictions against the observed labels
                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('RF', elapsed, freq, horizon, accuracy, f1score, precision, recall)
                for column, value in zip(columns, row):
                    results[country][column].append(value)

    return performance_table(results)

## XGB

In [None]:
# main function for XGB
def run_xgb(CONT, FREQ, PREDICT_HORIZON):
    """Evaluate an XGBoost classifier over every experiment setting.

    For each country in ``CONT``, frequency in ``FREQ`` and horizon ``p`` in
    ``PREDICT_HORIZON`` the dataset is rebuilt with ``ts_data``. The split
    point starts at 70% of the rows; a fresh classifier is fitted on the split
    returned by ``train_test_split_classification`` and asked for ``p``
    one-row predictions, after which the split point moves forward by ``p``.

    Parameters
    ----------
    CONT : iterable
        Country/region identifiers passed to ``ts_data`` as ``country``.
    FREQ : iterable
        Frequencies passed to ``ts_data`` as ``frequency``.
    PREDICT_HORIZON : iterable of int
        Number of out-of-sample rows classified per refit.

    Returns
    -------
    The value of ``performance_table`` applied to the per-country dictionary
    of collected metrics.
    """
    MODEL_RESULTS = {
        country: {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [],
                  'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}
        for country in CONT
    }

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                # build the dataset (second and third return values are unused here)
                X, _, _ = ts_data(country=c, category='total', frequency=f, model=None)
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                t = []   # observed labels
                t_ = []  # predicted labels

                START_TIME = timeit.default_timer()

                for _ in range(int(test_s / p)):
                    x_train, x_test, y_train, y_test = train_test_split_classification(X, train_s, p)

                    # NOTE(review): the previous code first constructed
                    # XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1)
                    # and then immediately overwrote it with XGBClassifier(), so only
                    # the default configuration was ever trained. The dead
                    # construction is removed and the default kept so results do
                    # not change; confirm which configuration was intended.
                    clf = xgb.XGBClassifier()
                    clf.fit(np.array(x_train), y_train.ravel())

                    # materialise the labels once per refit instead of once per step
                    y_true = list(y_test)
                    for j in range(p):  # make classifications p-steps out of sample
                        x_test_temp = np.array([x_test.iloc[j]])
                        t_.extend(clf.predict(x_test_temp))
                        t.append(y_true[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                # calculate model performance for this (country, frequency, horizon)
                accuracy, f1score, precision, recall = model_performance(t_, t)

                MODEL_RESULTS[c]['MODEL'].append('XGB')
                MODEL_RESULTS[c]['RUN_TIME'].append(TIME)
                MODEL_RESULTS[c]['Frequency'].append(f)
                MODEL_RESULTS[c]['Prediction_Window'].append(p)
                MODEL_RESULTS[c]['ACCURACY'].append(accuracy)
                MODEL_RESULTS[c]['F1SCORE'].append(f1score)
                MODEL_RESULTS[c]['PRECISION'].append(precision)
                MODEL_RESULTS[c]['RECALL'].append(recall)
    return performance_table(MODEL_RESULTS)

## Naive Baseline

In [None]:
# main function for naive baseline classifier
def run_naive_baseline(CONT, FREQ, PREDICT_HORIZON):
    """Evaluate the naive baseline classifier over every experiment setting.

    For each country in ``CONT``, frequency in ``FREQ`` and horizon ``p`` in
    ``PREDICT_HORIZON`` the series is built with ``ts_data`` (``model`` set to
    ``'naive_baseline'``). The split point starts at 70% of the observations;
    ``train_test`` yields the current train/test pieces, ``naive_baseline`` is
    called once for each of the next ``p`` test index values, and the split
    point then moves forward by ``p``.

    Returns the value of ``performance_table`` applied to the per-country
    dictionary of collected metrics.
    """
    columns = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
               'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    results = {country: {column: [] for column in columns} for country in CONT}

    for country in CONT:
        for freq in FREQ:
            for horizon in PREDICT_HORIZON:
                # build the dataset
                series = ts_data(country=country, category='total',
                                 frequency=freq, model='naive_baseline')
                split_at = int(len(series) * 0.7)
                holdout = len(series) - split_at
                actual, predicted = [], []

                started = timeit.default_timer()

                for _ in range(int(holdout / horizon)):
                    train, test = train_test(series, split_at, horizon)

                    # classify the next `horizon` observations one at a time
                    for step in range(horizon):
                        predicted.append(naive_baseline(train, test.index[step]))
                        actual.append(test.iloc[step])

                    split_at += horizon

                elapsed = convert(timeit.default_timer() - started)

                # score predictions against the observed labels
                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('NAIVE BASELINE', elapsed, freq, horizon,
                       accuracy, f1score, precision, recall)
                for column, value in zip(columns, row):
                    results[country][column].append(value)

    return performance_table(results)

## LSTM

In [None]:
%%capture
def run_lstm(CONT, FREQ, PREDICT_HORIZON):
    """Evaluate the LSTM classifier over every experiment setting.

    For each country in ``CONT``, each non-skipped frequency in ``FREQ`` and
    horizon ``p`` in ``PREDICT_HORIZON`` the series is built with ``ts_data``
    and windowed with ``split_series`` (window length ``n_input``). The split
    point starts at 70% of the windows; a fresh model from ``lstm`` is fitted
    before each block of ``p`` single-window predictions, after which the
    split point moves forward by ``p``.

    The ``'1H'`` and ``'2H'`` frequencies are skipped because of their long
    runtime. Previously a skipped frequency executed ``return``, which made
    the whole function return ``None`` and silently dropped every remaining
    frequency and country; it now moves on to the next frequency.

    Parameters
    ----------
    CONT : iterable
        Country/region identifiers passed to ``ts_data`` as ``country``.
    FREQ : iterable
        Frequencies passed to ``ts_data`` as ``frequency``.
    PREDICT_HORIZON : iterable of int
        Number of out-of-sample windows classified per refit.

    Returns
    -------
    The value of ``performance_table`` applied to the per-country dictionary
    of collected metrics (lists stay empty for a country when every frequency
    was skipped).
    """
    # frequencies excluded from LSTM runs due to long runtime
    skipped_frequencies = ('1H', '2H')
    # model hyper-parameters; loop-invariant, so defined once
    n_input, n_nodes, n_epochs, n_activation, n_optimize = 14, 80, 100, 'relu', 'Adam'

    MODEL_RESULTS = {
        country: {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [],
                  'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}
        for country in CONT
    }

    for c in CONT:
        for f in FREQ:
            if f in skipped_frequencies:
                # skip only this frequency; keep processing the others
                continue

            for p in PREDICT_HORIZON:
                # build the dataset
                X = ts_data(country=c, category='total', frequency=f, model='naive_baseline')

                lstm_x, lstm_y, lstm_dt = split_series(X.values, X.index, n_input)
                train_s = int(len(lstm_x) * 0.7)
                test_s = len(lstm_x) - train_s
                t = []   # observed labels
                t_ = []  # predicted labels

                START_TIME = timeit.default_timer()

                for _ in range(int(test_s / p)):
                    # third return value is unused here
                    train, test, _ = train_test_lstm(lstm_x, lstm_y, lstm_dt, train_s, p)

                    # train model using the train set; add a trailing axis of size 1
                    train = train.reshape((train.shape[0], train.shape[1], 1))
                    model = lstm(n_input, n_nodes, n_activation, n_optimize)
                    model.fit(train, lstm_y[0:train_s], epochs=n_epochs, verbose=0)

                    # make classifications p-steps out of sample
                    for j in range(p):
                        if len(lstm_x) == train_s:
                            # no windows left beyond the training split
                            break

                        x_test_instance = lstm_x[train_s + j].reshape((1, n_input, 1))

                        t.append(test[j])
                        # NOTE(review): int() truncates toward zero; this assumes the
                        # model output is already a 0/1 label rather than a
                        # probability - confirm against the `lstm` helper.
                        t_.append(int(model.predict(x_test_instance, verbose=0)))

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                # calculate model performance for this (country, frequency, horizon)
                accuracy, f1score, precision, recall = model_performance(t_, t)

                MODEL_RESULTS[c]['MODEL'].append('LSTM')
                MODEL_RESULTS[c]['RUN_TIME'].append(TIME)
                MODEL_RESULTS[c]['Frequency'].append(f)
                MODEL_RESULTS[c]['Prediction_Window'].append(p)
                MODEL_RESULTS[c]['ACCURACY'].append(accuracy)
                MODEL_RESULTS[c]['F1SCORE'].append(f1score)
                MODEL_RESULTS[c]['PRECISION'].append(precision)
                MODEL_RESULTS[c]['RECALL'].append(recall)

    return performance_table(MODEL_RESULTS)

## KNN

In [None]:
def run_knn(CONT, FREQ, PREDICT_HORIZON):
    """Evaluate a 1-nearest-neighbour classifier using the ``DTW`` metric.

    For each country in ``CONT``, frequency in ``FREQ`` and horizon ``p`` in
    ``PREDICT_HORIZON`` the dataset is rebuilt with ``ts_data``. The split
    point starts at 70% of the rows; a fresh classifier is fitted on the split
    returned by ``train_test_split_classification`` and asked for ``p``
    one-row predictions, after which the split point moves forward by ``p``.

    Returns the value of ``performance_table`` applied to the per-country
    dictionary of collected metrics.
    """
    columns = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
               'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    results = {country: {column: [] for column in columns} for country in CONT}

    for country in CONT:
        for freq in FREQ:
            for horizon in PREDICT_HORIZON:
                # build the dataset (second and third return values are unused here)
                data, _, _ = ts_data(country=country, category='total',
                                     frequency=freq, model=None)
                split_at = int(len(data) * 0.7)
                holdout = len(data) - split_at
                actual, predicted = [], []

                started = timeit.default_timer()

                for _ in range(int(holdout / horizon)):
                    x_train, x_test, y_train, y_test = train_test_split_classification(
                        data, split_at, horizon)

                    # refit the nearest-neighbour model on the current training split
                    knn = KNeighborsClassifier(metric=DTW, n_neighbors=1)
                    knn.fit(np.array(x_train), y_train.ravel())

                    # classify the next `horizon` rows one at a time
                    for step in range(horizon):
                        row_features = np.array([x_test.iloc[step]])
                        predicted.extend(knn.predict(row_features))
                        actual.append(list(y_test)[step])

                    split_at += horizon

                elapsed = convert(timeit.default_timer() - started)

                # score predictions against the observed labels
                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('KNN', elapsed, freq, horizon, accuracy, f1score, precision, recall)
                for column, value in zip(columns, row):
                    results[country][column].append(value)

    return performance_table(results)

### Run all models

In [None]:
%%capture
# Experiment grid: regions, sampling frequencies and prediction horizons.
CONT = ['ENTIRE', 'NORTH_AMERICA']
FREQ = ['1H', '2H', '24H']
PREDICT_HORIZON = [1, 6, 12, 24]

# Run each model on the entire (global) and North America datasets, for every
# combination of frequency and prediction horizon.
TABLE_KNN = run_knn(CONT, FREQ, PREDICT_HORIZON)
TABLE_RF = run_rf(CONT, FREQ, PREDICT_HORIZON)
TABLE_XBG = run_xgb(CONT, FREQ, PREDICT_HORIZON)
TABLE_NAIVE_BASELINE = run_naive_baseline(CONT, FREQ, PREDICT_HORIZON)
TABLE_LSTM = run_lstm(CONT, FREQ, PREDICT_HORIZON)

# Stack the per-model tables into one frame and persist it without the index.
TABLES = [
    TABLE_RF,
    TABLE_XBG,
    TABLE_KNN,
    TABLE_NAIVE_BASELINE,
    TABLE_LSTM,
]
CLASSIFICATION_PERFORMANCE = pd.concat(TABLES)
CLASSIFICATION_PERFORMANCE.to_csv('Classification_Performance.csv', index=False)