# **Recurrent Neural Network (RNN) Model**

After testing different models for bankruptcy prediction, we decided to implement RNN because RNNs are adept at handling sequential data, allowing us to capture temporal dependencies in financial indicators that could be crucial for identifying patterns leading to bankruptcy.

With the RNN we were able to obtain an accuracy of 90% which is still lower than the baseline of 93%.

In [21]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix


from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau

from keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE

from keras.regularizers import l2

In [2]:
# Read data
from google.colab import drive
drive.mount('/content/drive/')
file_path = '/content/drive/MyDrive/[1] DSI Drive/[2] Projects/Project 5/data/american_bankruptcy.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive/


In [3]:
df['status_label'] = np.where(df['status_label'] == 'alive', 1, 0)

In [4]:
df['status_label'].value_counts()

1    73462
0     5220
Name: status_label, dtype: int64

In [5]:
df.head()

Unnamed: 0,company_name,status_label,year,X1,X2,X3,X4,X5,X6,X7,...,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_1,1,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,1,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,1,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,1,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,1,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [11]:
df.columns

Index(['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5',
       'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16',
       'X17', 'X18'],
      dtype='object')

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE

# Assuming 'df' is your DataFrame

# Sort the DataFrame by company_name and year
df_sorted = df.sort_values(by=['company_name', 'year'])

# Group the data by company_name and create sequences
sequences = []
targets = []
for company, group in df_sorted.groupby('company_name'):
    features = group.drop(columns=['company_name', 'status_label', 'year']).values
    target = group['status_label'].values[-1]  # Get the last status_label for each company
    sequences.append(features)
    targets.append(target)

# Pad sequences to have the same length
max_sequence_length = max(len(sequence) for sequence in sequences)
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float64', padding='post')

# Convert targets to a numpy array
targets = np.array(targets)

# Flatten the sequences for SMOTE
sequences_flattened = sequences_padded.reshape(sequences_padded.shape[0], -1)

# Split the data into training and test sets
X_train_flat, X_test_flat, y_train, y_test = train_test_split(sequences_flattened, targets, test_size=0.2, random_state=42)

# Use SMOTE to oversample the minority class in the training set
smote = SMOTE(random_state=42)
X_train_res_flat, y_train_res = smote.fit_resample(X_train_flat, y_train)

# Reshape the data back into sequences
X_train_res = X_train_res_flat.reshape(-1, max_sequence_length, sequences_padded.shape[2])
X_test = X_test_flat.reshape(-1, max_sequence_length, sequences_padded.shape[2])

# Scale the data
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res.reshape(-1, X_train_res.shape[-1])).reshape(X_train_res.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

# Build the RNN model
model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train_res_scaled.shape[1], X_train_res_scaled.shape[2])),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train_res_scaled, y_train_res, validation_data=(X_test_scaled, y_test),
                    epochs=50, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Test accuracy: 0.8813


In [17]:
from keras.regularizers import l2

# Build the improved RNN model
model_improved = Sequential([
    LSTM(128, activation='relu', input_shape=(X_train_res_scaled.shape[1], X_train_res_scaled.shape[2]), return_sequences=True, kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    LSTM(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the improved model
model_improved.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the improved model
history_improved = model_improved.fit(X_train_res_scaled, y_train_res, validation_data=(X_test_scaled, y_test),
                                      epochs=50, batch_size=32, callbacks=[early_stopping])

# Evaluate the improved model
test_loss_improved, test_accuracy_improved = model_improved.evaluate(X_test_scaled, y_test)
print(f"Improved test accuracy: {test_accuracy_improved:.4f}")




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Improved test accuracy: 0.9025


In [20]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Build the improved RNN model
model_improved_2 = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train_res_scaled.shape[1], X_train_res_scaled.shape[2]), return_sequences=True),
    Dropout(0.3),
    BatchNormalization(),
    LSTM(32, activation='relu'),
    Dropout(0.3),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_improved_2.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history_improved_2 = model_improved_2.fit(X_train_res_scaled, y_train_res, validation_data=(X_test_scaled, y_test),
                                          epochs=50, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
test_loss, test_accuracy = model_improved_2.evaluate(X_test_scaled, y_test)
print(f"Improved test accuracy: {test_accuracy:.4f}")




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Improved test accuracy: 0.9025
