In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from util_visualization import plot_feature_importances, plot_roc_curve
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_percentage_error, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve
from pylab import rcParams
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle
import importlib
import sys
import joblib
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import glob
import math

#For NN
from keras.models import Sequential
from keras import backend as K
import tensorflow_addons as tfa
from keras.layers import Dense, LSTM, BatchNormalization, RepeatVector, TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2
from time import time
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [188]:
#Import Dataset
labeled_index = pd.read_csv("../data/labeled_data/quarterly_labeled_features_standardized.csv")
labeled_index.set_index(labeled_index['date'], inplace=True)
quarters = np.sort(labeled_index.index.unique())

#Features Columns
features_columns = list(labeled_index.columns)
columns_to_remove = ['index', 'crash_label', 'date', 'Quarter']
features_columns = [column for column in features_columns if column not in columns_to_remove]
print(features_columns)

['volatility', 'price_change', 'volume_change', '000001.SS', '^BSESN', '^BVSP', '^FCHI', '^FTLC', '^GDAXI', '^GSPC', '^HSI', '^N225', '^SSMI', 'Crude_Oil_Index_Excess_Return', 'Ted_Rate', 'Gold_Price', 'housing', 'reserve', '10YGov_BondYield', 'current_acct', 'FX_Rate', 'turnover', 'Population', 'npl', 'Recession_Indicators', 'inflation', 'Unemployment', 'GDP']


## Split data

In [189]:
train_index = math.ceil(len(quarters) * .7)
val_index = math.ceil(len(quarters) * .85)
train_quarters, val_quarters, test_quarters = quarters[:train_index], quarters[train_index:val_index], quarters[val_index:]
train = labeled_index.loc[train_quarters]
val = labeled_index.loc[val_quarters]
test = labeled_index.loc[test_quarters]
X_train = train[features_columns]
y_train = train['crash_label']
X_val = val[features_columns]
y_val = val['crash_label']
X_test = test[features_columns]
y_test = test['crash_label']

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(620, 28) (620,) (130, 28) (130,) (130, 28) (130,)


## Oversampling

In [192]:
# Oversample the minority class (1) using SMOTE
oversampler = SMOTE()
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
print(X_train_resampled.shape, y_train_resampled.shape)

(1074, 28) (1074,)


## Reshape

In [193]:
""" Changes data to the format for LSTM training 
for sliding window approach 
Goal: capture temporal dependencies in the data """
# Prepare the list for the transformed data

def lstm_transform(X_data, y_data, num_steps=5):
    X, y = list(), list()
    # Loop of the entire data set
    for i in range(X_data.shape[0]):
        # compute a new (sliding window) index
        end_ix = i + num_steps
        # if index is larger than the size of the dataset, we stop
        if end_ix >= X_data.shape[0]:
            break
        # Get a sequence of data for x
        seq_X = np.array(X_data)[i:end_ix]
        # Get only the last element of the sequency for y
        seq_y = np.array(y_data)[end_ix]
        # Append the list with sequencies
        X.append(seq_X)
        y.append(seq_y)
    # Make final arrays
    x_transformed = np.array(X)
    y_transformed = np.array(y)
    return x_transformed, y_transformed

In [194]:
X_train_reshaped, y_train_reshaped = lstm_transform(X_train_resampled, y_train_resampled)

X_val_reshaped, y_val_reshaped = lstm_transform(X_val, y_val)

X_test_reshaped, y_test_reshaped = lstm_transform(X_test, y_test)

In [195]:
print(X_train_reshaped.shape, y_train_reshaped.shape, X_val_reshaped.shape, y_val_reshaped.shape, X_test_reshaped.shape, y_test_reshaped.shape)

(1069, 5, 28) (1069,) (125, 5, 28) (125,) (125, 5, 28) (125,)


## Modelling

In [219]:
# Many-to-One Training
# X_train_reshaped[1]: number of time steps
# X_train_reshaped[2]: number of features

model = Sequential()
model.add(LSTM(64, dropout=0.1, input_shape=(X_train_reshaped.shape[1],X_train_reshaped.shape[2])))
# model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])

In [220]:
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 64)                23808     
                                                                 
 dense_14 (Dense)            (None, 1)                 65        
                                                                 
Total params: 23,873
Trainable params: 23,873
Non-trainable params: 0
_________________________________________________________________


In [221]:
model_result = model.fit(X_train_reshaped, y_train_reshaped, batch_size=32, epochs=50, validation_data=(X_val_reshaped, y_val_reshaped))
loss, auc = model.evaluate(X_test_reshaped, y_test_reshaped)

Epoch 1/50


2024-03-27 01:06:31.263920: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2024-03-27 01:06:31.463738: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x32x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_of



loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x13x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x13x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceSh

Epoch 2/50

loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x29x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/0032d1ee-80fd-11ee-8227-6aecfccc70fe/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x29x1x1xi1>'


Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [222]:
y_prob = model.predict(X_test_reshaped)
y_pred = np.where(y_prob > 0.5, 1, 0) # Adjust the threshold as needed
accuracy = accuracy_score(y_test_reshaped.flatten(), y_pred.flatten())
rmse = np.sqrt(mean_squared_error(y_test_reshaped.flatten(), y_pred.flatten()))
mape = np.mean(np.abs((y_test_reshaped.flatten() - y_pred.flatten()) / y_test_reshaped.flatten())) * 100

conf_matrix = confusion_matrix(y_test_reshaped.flatten(), y_pred.flatten(), labels=[0, 1])

precision = precision_score(y_test_reshaped.flatten(), y_pred.flatten())
recall = recall_score(y_test_reshaped.flatten(), y_pred.flatten())
f1 = f1_score(y_test_reshaped.flatten(), y_pred.flatten())
fpr, tpr, thresholds = roc_curve(y_test_reshaped.flatten(), y_prob.flatten())



2024-03-27 01:07:46.041260: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2024-03-27 01:07:46.084531: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




## Evaluation

In [226]:
evaluation_result = {
    'Train_Start': pd.to_datetime(train_quarters).min(),
    'Train_End': pd.to_datetime(train_quarters).max(),
    'Val_Start': pd.to_datetime(val_quarters).min(),
    'Val_End': pd.to_datetime(val_quarters).max(),
    'Test_Start': pd.to_datetime(test_quarters).min(),
    'Test_End': pd.to_datetime(test_quarters).max(),
    'Confusion_Matrix': conf_matrix, #[[TN, FP],[FN,TP]]
    'Precision': precision, 
    'Recall': recall, 
    'F1': f1, 
    'Accuracy': accuracy, 
    'RMSE': rmse,
    'MAPE': mape,
    'AUC-ROC': auc,
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'loss': loss
    }
evaluation_result

{'Train_Start': Timestamp('1998-03-31 00:00:00'),
 'Train_End': Timestamp('2013-06-30 00:00:00'),
 'Val_Start': Timestamp('2013-09-30 00:00:00'),
 'Val_End': Timestamp('2016-09-30 00:00:00'),
 'Test_Start': Timestamp('2016-12-31 00:00:00'),
 'Test_End': Timestamp('2019-12-31 00:00:00'),
 'Confusion_Matrix': array([[95, 29],
        [ 0,  1]]),
 'Precision': 0.03333333333333333,
 'Recall': 1.0,
 'F1': 0.06451612903225806,
 'Accuracy': 0.768,
 'RMSE': 0.48166378315169184,
 'MAPE': nan,
 'AUC-ROC': 0.9677419066429138,
 'False Positive Rate': array([0.        , 0.00806452, 0.0483871 , 0.0483871 , 1.        ]),
 'True Positive Rate': array([0., 0., 0., 1., 1.]),
 'loss': 0.9457312226295471}

TODO: https://stackoverflow.com/questions/77032200/how-can-i-correct-my-time-series-lstm-rnn-for-binary-classification-favoring-cla