# Main file for classification models

In [20]:
# Import libraries
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
from classification_helper_functions import ts_data, performance_table, convert, model_performance, train_test, train_test_split_classification, lstm, train_test_lstm, split_series, performance_table_lstm, naive_baseline
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima_model import ARIMA
import timeit
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
# Default experiment grid: frequency strings, prediction-window lengths and
# country/region keys. The run_* functions below take these three lists as arguments.
FREQ = ['24H']
PREDICT_HORIZON = [1, 6, 12, 24]
CONT = ['ENTIRE','NORTH_AMERICA']
# Called with model=None, ts_data returns three values; only X is inspected in the next cell.
X, dt, lb = ts_data(country = 'ENTIRE', category = 'total', frequency = '24H', model = None)

In [5]:
# Preview the first ten rows of X (y_class plus the x*/year_*/month_*/date_* columns shown below).
X.head(10)

Unnamed: 0,y_class,x1,x2,x3,year_2019,year_2020,month_1,month_2,month_3,month_4,...,date_22,date_23,date_24,date_25,date_26,date_27,date_28,date_29,date_30,date_31
0,0,2019,10,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2019,10,8,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2019,10,9,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2019,10,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2019,10,11,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,2019,10,12,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,2019,10,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,2019,10,14,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,2019,10,15,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,2019,10,16,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def run_rf(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark a random forest classifier over every region/frequency/window.

    For each combination the data is loaded with ``ts_data(model=None)`` and the
    initial split point is 70% of the rows. The remaining rows are consumed in
    windows of ``p`` rows: a fresh ``RandomForestClassifier(n_estimators=100)``
    is fitted per window, the ``p`` test rows are predicted one at a time, and
    the split point is advanced by ``p``. Fitting and prediction are timed
    together.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (rows per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``pd.concat`` / ``.to_csv``).
    """
    # Column order is kept exactly as before because performance_table
    # receives the dictionaries as-is.
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X, dt, lb = ts_data(country=c, category='total', frequency=f, model=None)
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p rows is never scored.
                for _ in range(test_s // p):
                    x_train, x_test, y_train, y_test = train_test_split_classification(X, train_s, p)
                    clf = RandomForestClassifier(n_estimators=100)
                    clf.fit(x_train, y_train)

                    for j in range(p):
                        # In-place growth instead of rebuilding both lists on
                        # every prediction (the old `t = t + [...]` was quadratic).
                        predicted.extend(clf.predict([x_test.iloc[j]]))
                        actual.append(y_test[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('RF', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)
    return performance_table(MODEL_RESULTS)

In [4]:
def run_xgb(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark an XGBoost classifier over every region/frequency/window.

    For each combination the data is loaded with ``ts_data(model=None)`` and the
    initial split point is 70% of the rows. The remaining rows are consumed in
    windows of ``p`` rows: a fresh ``xgb.XGBClassifier()`` is fitted per window,
    the ``p`` test rows are predicted one at a time, and the split point is
    advanced by ``p``. Fitting and prediction are timed together.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (rows per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``pd.concat`` / ``.to_csv``).
    """
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X, dt, lb = ts_data(country=c, category='total', frequency=f, model=None)
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p rows is never scored.
                for _ in range(test_s // p):
                    x_train, x_test, y_train, y_test = train_test_split_classification(X, train_s, p)

                    # Library defaults are used; an explicit configuration is
                    # kept here for reference:
                    # clf = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1)
                    clf = xgb.XGBClassifier()
                    clf.fit(np.array(x_train), y_train.ravel())

                    for j in range(p):
                        x_test_temp = np.array([x_test.iloc[j]])
                        # In-place growth instead of rebuilding both lists on
                        # every prediction (the old `t = t + [...]` was quadratic).
                        predicted.extend(clf.predict(x_test_temp))
                        actual.append(y_test[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('XGB', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)
    return performance_table(MODEL_RESULTS)

In [5]:
def run_naive_baseline(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark the ``naive_baseline`` predictor over every region/frequency/window.

    For each combination the series is loaded with
    ``ts_data(model='naive_baseline')`` and the initial split point is 70% of
    the rows. The remaining rows are consumed in windows of ``p`` rows: each
    window is split with ``train_test``, every test index in it is predicted
    with ``naive_baseline(train, index)``, and the split point is advanced by
    ``p``.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (rows per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``pd.concat`` / ``.to_csv``).
    """
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X = ts_data(country=c, category='total', frequency=f, model='naive_baseline')
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p rows is never scored.
                for _ in range(test_s // p):
                    train, test = train_test(X, train_s, p)

                    for j in range(p):
                        # In-place growth instead of rebuilding both lists on
                        # every prediction (the old `t = t + [...]` was quadratic).
                        predicted.append(naive_baseline(train, test.index[j]))
                        actual.append(test.iloc[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('NAIVE BASELINE', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)

    return performance_table(MODEL_RESULTS)

## LSTM

In [16]:
# With model='naive_baseline', ts_data returns a single object (no dt/lb tuple); run_lstm loads its data the same way.
X = ts_data(country = 'ENTIRE', category = 'total', frequency = '24H', model = 'naive_baseline')

In [17]:
# Preview the first ten rows; the output below shows a single y_class column indexed by ds.
X.head(10)

Unnamed: 0_level_0,y_class
ds,Unnamed: 1_level_1
2019-10-07,0
2019-10-08,0
2019-10-09,0
2019-10-10,0
2019-10-11,0
2019-10-12,0
2019-10-13,0
2019-10-14,0
2019-10-15,0
2019-10-16,0


In [18]:
%%capture
def run_lstm(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark the ``lstm`` model over every region/frequency/window.

    For each combination the series is loaded with
    ``ts_data(model='naive_baseline')`` and turned into samples with
    ``split_series`` using a look-back of ``n_input`` steps. The initial split
    point is 70% of those samples. The remaining samples are consumed in
    windows of ``p``: a new model is built and fitted per window, the ``p``
    following samples are predicted one at a time, and the split point is
    advanced by ``p``. Fitting and prediction are timed together.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (samples per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``.to_csv``).
    """
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    # Fixed hyperparameters; they do not depend on the loop variables, so they
    # are bound once instead of on every iteration.
    n_input, n_nodes, n_epochs, n_activation, n_optimize = 14, 80, 100, 'relu', 'Adam'

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X = ts_data(country=c, category='total', frequency=f, model='naive_baseline')

                lstm_x, lstm_y, lstm_dt = split_series(X.values, X.index, n_input)
                # Sizes are derived from the windowed samples, not the raw series.
                train_s = int(len(lstm_x) * 0.7)
                test_s = len(lstm_x) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p samples is never scored.
                for _ in range(test_s // p):
                    train, test, _window_dt = train_test_lstm(lstm_x, lstm_y, lstm_dt, train_s, p)

                    # Add a trailing feature axis of size 1 before fitting.
                    train = train.reshape((train.shape[0], train.shape[1], 1))
                    model = lstm(n_input, n_nodes, n_activation, n_optimize)
                    model.fit(train, lstm_y[0:train_s], epochs=n_epochs, verbose=0)

                    # Predict until completion of the prediction window.
                    for j in range(p):
                        if len(lstm_x) == train_s:
                            break

                        x_test_instance = lstm_x[train_s + j].reshape((1, n_input, 1))

                        actual.append(test[j])
                        raw_pred = model.predict(x_test_instance, verbose=0)
                        # .item() extracts the single element explicitly;
                        # int(ndarray) on an ndim>0 array is deprecated in
                        # recent NumPy releases.
                        # NOTE(review): int() truncates toward zero (0.9 -> 0);
                        # confirm truncation rather than rounding/thresholding
                        # is intended for the class label.
                        predicted.append(int(np.asarray(raw_pred).item()))

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('LSTM', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)

    return performance_table(MODEL_RESULTS)

In [19]:
%%capture
# Run the LSTM benchmark on the '24H' grid for both regions and write the
# resulting table to CSV (without the index column).
FREQ = ['24H']
PREDICT_HORIZON = [1, 6, 12, 24]
CONT = ['ENTIRE','NORTH_AMERICA']
TABLE_LSTM = run_lstm(CONT, FREQ, PREDICT_HORIZON)
# Output file name, assembled from a base name and the '.csv' extension.
filename = 'Classification_Performance_LSTM_V02'+'.csv'
TABLE_LSTM.to_csv(filename, index = False)





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




## KNN

In [2]:
def DTW(a, b):
    """Return the dynamic-time-warping distance between two sequences.

    Both inputs are flattened to one value per time step. The local cost of
    aligning two steps is their Euclidean distance (``cdist`` default), and the
    result is the minimum accumulated cost over all monotone alignments that
    start at the first elements and end at the last elements.

    Args:
        a: Array-like of numbers (ndarray, list, tuple, ...).
        b: Array-like of numbers; may differ in length from ``a``.

    Returns:
        The accumulated alignment cost as a NumPy float.
    """
    # Accept any array-like, not only ndarrays, and work on column vectors.
    a = np.asarray(a, dtype=float).reshape(-1, 1)
    b = np.asarray(b, dtype=float).reshape(-1, 1)
    an = a.shape[0]
    bn = b.shape[0]
    pointwise_distance = distance.cdist(a, b)

    # Plain ndarray instead of the discouraged np.matrix. The extra first row
    # and column of +inf make the boundary cells unreachable except via (0, 0).
    cumdist = np.full((an + 1, bn + 1), np.inf)
    cumdist[0, 0] = 0

    for ai in range(an):
        for bi in range(bn):
            # Cheapest predecessor: step in a only, step in b only, or both.
            minimum_cost = np.min([cumdist[ai, bi + 1],
                                   cumdist[ai + 1, bi],
                                   cumdist[ai, bi]])
            cumdist[ai + 1, bi + 1] = pointwise_distance[ai, bi] + minimum_cost

    return cumdist[an, bn]

In [7]:
def run_knn(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark a 1-nearest-neighbour classifier over every region/frequency/window.

    For each combination the data is loaded with ``ts_data(model=None)`` and the
    initial split point is 70% of the rows. The remaining rows are consumed in
    windows of ``p`` rows: a fresh ``KNeighborsClassifier(n_neighbors=1)`` is
    fitted per window, the ``p`` test rows are predicted one at a time, and the
    split point is advanced by ``p``. Progress and the accumulated results are
    printed after every combination.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (rows per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``.to_csv``).
    """
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X, dt, lb = ts_data(country=c, category='total', frequency=f, model=None)
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                print('c,f,p',c,f,p)
                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p rows is never scored.
                for _ in range(test_s // p):
                    x_train, x_test, y_train, y_test = train_test_split_classification(X, train_s, p)
                    # Default distance metric; the DTW-based variant is kept
                    # here for reference:
                    # clf = KNeighborsClassifier(metric=DTW, n_neighbors = 1)
                    clf = KNeighborsClassifier(n_neighbors=1)
                    clf.fit(np.array(x_train), y_train.ravel())

                    for j in range(p):
                        x_test_temp = np.array([x_test.iloc[j]])
                        # In-place growth instead of rebuilding both lists on
                        # every prediction (the old `t = t + [...]` was quadratic).
                        predicted.extend(clf.predict(x_test_temp))
                        actual.append(y_test[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('KNN', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)
                print(MODEL_RESULTS)
    return performance_table(MODEL_RESULTS)

In [8]:
# Run the KNN benchmark on the '1H'/'2H' grid (window lengths up to 72) for
# both regions and write the resulting table to CSV without the index column.
FREQ = ['1H','2H']
PREDICT_HORIZON = [1, 6, 12, 24, 72]
CONT = ['ENTIRE','NORTH_AMERICA']
TABLE_KNN = run_knn(CONT, FREQ, PREDICT_HORIZON)
TABLE_KNN.to_csv('Classification_Performance_TABLE_KNN_V01.csv', index = False)


c,f,p ENTIRE 1H 1
{'ENTIRE': {'MODEL': ['KNN'], 'RUN_TIME': ['0:01:13'], 'Frequency': ['1H'], 'Prediction_Window': [1], 'ACCURACY': [0.52], 'F1SCORE': [0.52], 'PRECISION': [0.52], 'RECALL': [0.52]}, 'NORTH_AMERICA': {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [], 'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}}
c,f,p ENTIRE 1H 6
{'ENTIRE': {'MODEL': ['KNN', 'KNN'], 'RUN_TIME': ['0:01:13', '0:00:14'], 'Frequency': ['1H', '1H'], 'Prediction_Window': [1, 6], 'ACCURACY': [0.52, 0.48], 'F1SCORE': [0.52, 0.48], 'PRECISION': [0.52, 0.48], 'RECALL': [0.52, 0.48]}, 'NORTH_AMERICA': {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [], 'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}}
c,f,p ENTIRE 1H 12
{'ENTIRE': {'MODEL': ['KNN', 'KNN', 'KNN'], 'RUN_TIME': ['0:01:13', '0:00:14', '0:00:11'], 'Frequency': ['1H', '1H', '1H'], 'Prediction_Window': [1, 6, 12], 'ACCURACY': [0.52, 0.48, 0.48], 'F1SCORE': [0.52, 0.48, 0.48], 'PRECISI

c,f,p NORTH_AMERICA 1H 24
{'ENTIRE': {'MODEL': ['KNN', 'KNN', 'KNN', 'KNN', 'KNN', 'KNN', 'KNN', 'KNN', 'KNN', 'KNN'], 'RUN_TIME': ['0:01:13', '0:00:14', '0:00:11', '0:00:05', '0:00:02', '0:00:21', '0:00:04', '0:00:02', '0:00:01', '0:00:01'], 'Frequency': ['1H', '1H', '1H', '1H', '1H', '2H', '2H', '2H', '2H', '2H'], 'Prediction_Window': [1, 6, 12, 24, 72, 1, 6, 12, 24, 72], 'ACCURACY': [0.52, 0.48, 0.48, 0.49, 0.42, 0.42, 0.37, 0.41, 0.4, 0.34], 'F1SCORE': [0.52, 0.48, 0.48, 0.5, 0.42, 0.42, 0.37, 0.41, 0.41, 0.35], 'PRECISION': [0.52, 0.48, 0.49, 0.5, 0.43, 0.42, 0.38, 0.42, 0.43, 0.36], 'RECALL': [0.52, 0.48, 0.48, 0.49, 0.42, 0.42, 0.37, 0.41, 0.4, 0.34]}, 'NORTH_AMERICA': {'MODEL': ['KNN', 'KNN', 'KNN', 'KNN'], 'RUN_TIME': ['0:01:15', '0:00:12', '0:00:06', '0:00:03'], 'Frequency': ['1H', '1H', '1H', '1H'], 'Prediction_Window': [1, 6, 12, 24], 'ACCURACY': [0.59, 0.58, 0.53, 0.56], 'F1SCORE': [0.58, 0.57, 0.51, 0.57], 'PRECISION': [0.58, 0.56, 0.5, 0.58], 'RECALL': [0.59, 0.58, 0.53,

## Logistic Regression

In [21]:
def run_lr(CONT, FREQ, PREDICT_HORIZON):
    """Benchmark logistic regression over every region/frequency/window.

    For each combination the data is loaded with ``ts_data(model=None)`` and the
    initial split point is 70% of the rows. The remaining rows are consumed in
    windows of ``p`` rows: a fresh ``LogisticRegression()`` is fitted per
    window, the ``p`` test rows are predicted one at a time, and the split
    point is advanced by ``p``. Progress and the accumulated results are
    printed after every combination.

    Args:
        CONT: Region keys; each is passed to ``ts_data`` as ``country`` and
            becomes a key of the results dictionary.
        FREQ: Frequency strings passed to ``ts_data`` as ``frequency``.
        PREDICT_HORIZON: Integer window lengths ``p`` (rows per window).

    Returns:
        The value of ``performance_table(MODEL_RESULTS)`` (callers in this file
        treat it as a DataFrame: ``.to_csv``).
    """
    metric_keys = ('MODEL', 'RUN_TIME', 'Frequency', 'Prediction_Window',
                   'ACCURACY', 'F1SCORE', 'PRECISION', 'RECALL')
    MODEL_RESULTS = {region: {key: [] for key in metric_keys} for region in CONT}

    for c in CONT:
        for f in FREQ:
            for p in PREDICT_HORIZON:
                X, dt, lb = ts_data(country=c, category='total', frequency=f, model=None)
                train_s = int(len(X) * 0.7)
                test_s = len(X) - train_s
                actual = []
                predicted = []

                START_TIME = timeit.default_timer()

                print('c,f,p',c,f,p)
                # Only complete windows are evaluated; a trailing remainder of
                # fewer than p rows is never scored.
                for _ in range(test_s // p):
                    x_train, x_test, y_train, y_test = train_test_split_classification(X, train_s, p)
                    clf = LogisticRegression()
                    clf.fit(np.array(x_train), y_train.ravel())

                    for j in range(p):
                        x_test_temp = np.array([x_test.iloc[j]])
                        # In-place growth instead of rebuilding both lists on
                        # every prediction (the old `t = t + [...]` was quadratic).
                        predicted.extend(clf.predict(x_test_temp))
                        actual.append(y_test[j])

                    train_s += p

                END_TIME = timeit.default_timer()
                TIME = convert(END_TIME - START_TIME)

                accuracy, f1score, precision, recall = model_performance(predicted, actual)

                row = ('LR', TIME, f, p, accuracy, f1score, precision, recall)
                for key, value in zip(metric_keys, row):
                    MODEL_RESULTS[c][key].append(value)
                print(MODEL_RESULTS)
    return performance_table(MODEL_RESULTS)

In [22]:
# Run the logistic-regression benchmark on the '1H'/'2H' grid for both regions
# and write the resulting table to CSV without the index column.
FREQ = ['1H','2H']
PREDICT_HORIZON = [1, 6, 12, 24, 72]
CONT = ['ENTIRE','NORTH_AMERICA']
# Stored under its own name: the previous copy-pasted `TABLE_KNN = run_lr(...)`
# silently replaced the KNN results table with the logistic-regression one.
TABLE_LR = run_lr(CONT, FREQ, PREDICT_HORIZON)
TABLE_LR.to_csv('Classification_Performance_TABLE_lr_V01.csv', index = False)

c,f,p ENTIRE 1H 1
{'ENTIRE': {'MODEL': ['LR'], 'RUN_TIME': ['2:01:02'], 'Frequency': ['1H'], 'Prediction_Window': [1], 'ACCURACY': [0.47], 'F1SCORE': [0.57], 'PRECISION': [0.8], 'RECALL': [0.47]}, 'NORTH_AMERICA': {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [], 'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}}
c,f,p ENTIRE 1H 6
{'ENTIRE': {'MODEL': ['LR', 'LR'], 'RUN_TIME': ['2:01:02', '0:20:00'], 'Frequency': ['1H', '1H'], 'Prediction_Window': [1, 6], 'ACCURACY': [0.47, 0.47], 'F1SCORE': [0.57, 0.57], 'PRECISION': [0.8, 0.81], 'RECALL': [0.47, 0.47]}, 'NORTH_AMERICA': {'MODEL': [], 'RUN_TIME': [], 'Frequency': [], 'Prediction_Window': [], 'ACCURACY': [], 'F1SCORE': [], 'PRECISION': [], 'RECALL': []}}
c,f,p ENTIRE 1H 12
{'ENTIRE': {'MODEL': ['LR', 'LR', 'LR'], 'RUN_TIME': ['2:01:02', '0:20:00', '0:09:54'], 'Frequency': ['1H', '1H', '1H'], 'Prediction_Window': [1, 6, 12], 'ACCURACY': [0.47, 0.47, 0.47], 'F1SCORE': [0.57, 0.57, 0.57], 'PRECISION': [0.

KeyboardInterrupt: 

### RUN MODELS

In [10]:
%%capture
# '24H' grid: run the random forest and XGBoost benchmarks for both regions.
FREQ = ['24H']
PREDICT_HORIZON = [1, 6, 12, 24]
CONT = ['ENTIRE','NORTH_AMERICA']

TABLE_RF = run_rf(CONT, FREQ, PREDICT_HORIZON)
TABLE_XBG = run_xgb(CONT, FREQ, PREDICT_HORIZON)

# Stack both result tables into one and save it without the index column.
TABLES = [TABLE_RF, TABLE_XBG]
CLASSIFICATION_PERFORMANCE_24H = pd.concat(TABLES)

CLASSIFICATION_PERFORMANCE_24H.to_csv('Classification_Performance_24H_XGB_RF.csv', index = False)

In [None]:
%%capture
# '1H'/'2H' grid (window lengths up to 72): run the random forest, XGBoost and
# naive-baseline benchmarks for both regions.
FREQ = ['1H','2H']
PREDICT_HORIZON = [1, 6, 12, 24, 72]
CONT = ['ENTIRE','NORTH_AMERICA']

TABLE_RF = run_rf(CONT, FREQ, PREDICT_HORIZON)
TABLE_XBG = run_xgb(CONT, FREQ, PREDICT_HORIZON)
TABLE_NAIVE_BASELINE = run_naive_baseline(CONT, FREQ, PREDICT_HORIZON)


# Stack the three result tables into one and save it to CSV without the index column.
TABLES = [TABLE_RF, TABLE_XBG, TABLE_NAIVE_BASELINE]
CLASSIFICATION_PERFORMANCE = pd.concat(TABLES)

CLASSIFICATION_PERFORMANCE.to_csv('Classification_Performance.csv', index = False)