In [1]:
import pandas as pd
import numpy as np
from numpy import array
import shap
import matplotlib.pyplot as plt
from util_visualization import plot_feature_importances, plot_roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_percentage_error, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve
from pylab import rcParams
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle
import importlib
import sys
import joblib
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import glob

#LSTM
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
tf.random.set_seed(7)

2024-03-26 14:08:32.107317: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Import Data

In [2]:
#Import Dataset
labeled_index = pd.read_csv("../data/labeled_data/quarterly_labeled_features_standardized.csv")
labeled_index.set_index(labeled_index['date'], inplace=True)
quarters = np.sort(labeled_index.index.unique())

#Features Columns
features_columns = list(labeled_index.columns)
columns_to_remove = ['index', 'crash_label', 'date', 'Quarter']
features_columns = [column for column in features_columns if column not in columns_to_remove]
print(len(features_columns))

28


# Build the model

In [12]:
def create_sequences(X_train_resampled):
    x = int(len(X_train_resampled))
    factors = []
    
    for i in range(1, x + 1):
        if x % i == 0:
            factors.append(i)
    k = int(len(factors)/2) - 1
    return factors[k], int(x/factors[k])

X_train, X_temp, y_train, y_temp = train_test_split(labeled_index[features_columns], labeled_index['crash_label'], test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

# Oversample the minority class (1) using SMOTE
oversampler = SMOTE()
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
e = len(X_train_resampled)
x, y = create_sequences(X_train_resampled)
x1,y1 = create_sequences(X_test)
X_train_resampled = X_train_resampled.values
X_train_resampled = X_train_resampled.reshape(x, y, 28)

X_test = X_test.values
X_test = X_test.reshape(x1, y1, 28)

y_train_resampled = y_train_resampled.values
y_train_resampled = y_train_resampled.reshape(x, y, 1)

y_test = y_test.values
y_test = y_test.reshape(x1, y1, 1)

In [13]:
model = Sequential()
model.add(LSTM(units=128, return_sequences=True, input_shape=(X_train_resampled.shape[1], X_train_resampled.shape[2])))

In [14]:
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=64, return_sequences=True))

In [15]:
model.add(Dense(units=1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [36]:
model.summary()

In [20]:
histroy = model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=x, verbose=2)
loss = model.evaluate(X_test, y_test)

Epoch 1/50
1/1 - 0s - 170ms/step - loss: 0.0625
Epoch 2/50
1/1 - 0s - 85ms/step - loss: 0.0623
Epoch 3/50
1/1 - 0s - 87ms/step - loss: 0.0620
Epoch 4/50
1/1 - 0s - 81ms/step - loss: 0.0618
Epoch 5/50
1/1 - 0s - 76ms/step - loss: 0.0616
Epoch 6/50
1/1 - 0s - 76ms/step - loss: 0.0613
Epoch 7/50
1/1 - 0s - 77ms/step - loss: 0.0611
Epoch 8/50
1/1 - 0s - 78ms/step - loss: 0.0609
Epoch 9/50
1/1 - 0s - 85ms/step - loss: 0.0606
Epoch 10/50
1/1 - 0s - 77ms/step - loss: 0.0604
Epoch 11/50
1/1 - 0s - 74ms/step - loss: 0.0602
Epoch 12/50
1/1 - 0s - 75ms/step - loss: 0.0599
Epoch 13/50
1/1 - 0s - 72ms/step - loss: 0.0597
Epoch 14/50
1/1 - 0s - 75ms/step - loss: 0.0595
Epoch 15/50
1/1 - 0s - 73ms/step - loss: 0.0592
Epoch 16/50
1/1 - 0s - 75ms/step - loss: 0.0590
Epoch 17/50
1/1 - 0s - 75ms/step - loss: 0.0588
Epoch 18/50
1/1 - 0s - 72ms/step - loss: 0.0586
Epoch 19/50
1/1 - 0s - 77ms/step - loss: 0.0583
Epoch 20/50
1/1 - 0s - 74ms/step - loss: 0.0581
Epoch 21/50
1/1 - 0s - 74ms/step - loss: 0.0579


In [26]:
import math

trainScore = model.evaluate(X_train_resampled, y_train_resampled, verbose=0)
print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, math.sqrt(trainScore)))
testScore = model.evaluate(X_test, y_test, verbose=0)
print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, math.sqrt(testScore)))

Train Score: 0.05 MSE (0.23 RMSE)
Test Score: 0.14 MSE (0.37 RMSE)


In [40]:
X1_test = X_test.reshape(x1*y1, 28)
y1_test = y_test.reshape(x1*y1, 1)

In [41]:
y_prob = model.predict(X1_test)
y_pred = np.where(y_prob > 0.5, 1, 0) # Adjust the threshold as needed
accuracy = accuracy_score(y1_test, y_pred)
rmse = np.sqrt(mean_squared_error(y1_test, y_pred))
mape = np.mean(np.abs((y1_test - y_pred) / y1_test)) * 100

conf_matrix = confusion_matrix(y1_test, y_pred, labels=[0, 1])

precision = precision_score(y1_test, y_pred)
recall = recall_score(y1_test, y_pred)
f1 = f1_score(y1_test, y_pred)
auc_roc = roc_auc_score(y1_test, y_prob)
fpr, tpr, thresholds = roc_curve(y1_test, y_prob)

ValueError: Exception encountered when calling Sequential.call().

[1mCannot take the length of shape with unknown rank.[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=<unknown>, dtype=float32)
  • training=False
  • mask=None

In [32]:
y_pred.shape

(11, 12, 1)

In [45]:
y1_test.shape

(132, 1)