# 📚 Libraries & Data Structure

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, confusion_matrix, mean_squared_error, accuracy_score, make_scorer, f1_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import classification_report
import joblib
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.under_sampling import RandomUnderSampler
import tensorflow as tf
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight

In [None]:
loan_export = pd.read_csv('LoanExport.csv', low_memory=False)

In [None]:
loan_export.columns

In [None]:
loan_export.sample(7)

In [None]:
loan_export.info()

In [None]:
# Summary statistics computed twice: once with object columns excluded,
# once with numeric columns excluded.
numerical_describtion = loan_export.describe(exclude=[object])
categorical_describtion = loan_export.describe(exclude=[np.number])

In [None]:
def my_describe(df):
    """Print every cell of a summary frame, one line per (row label, column).

    Args:
        df: A DataFrame whose index holds the measure names and whose
            columns hold the described features (e.g. the result of
            ``DataFrame.describe``).
    """
    rule = "#" * 30
    for col_name in df.columns:
        # Banner line introducing the column.
        print(f"{rule} {col_name} {rule}")
        for stat in df.index:
            value = df.loc[stat, col_name]
            print(f"{stat} for {col_name} column is: {value}")


In [None]:
my_describe(numerical_describtion)

In [None]:
my_describe(categorical_describtion)

In [None]:
print("duplicated data: {}\n".format(loan_export.duplicated().sum()))

In [None]:
print("missing data: \n{}".format(loan_export.isna().sum()))

# 🔍 EDA

## 1. univariate

### 1. Categorical plots

In [None]:
# Take independent copies: later cells assign into `categorical`, and a plain
# column selection of `loan_export` would make those writes ambiguous
# (SettingWithCopyWarning) instead of clearly local to this frame.
categorical = loan_export[categorical_describtion.columns].copy()
numerical = loan_export[numerical_describtion.columns].copy()

In [None]:
def create_bar_plots(df, max_unique_values=21):
    """Draw one bar chart of value counts per column of ``df``.

    Columns with more than ``max_unique_values`` distinct values are not
    plotted; a "Skipping" message is printed for them instead.

    Args:
        df: DataFrame whose columns are plotted one figure at a time.
        max_unique_values: Largest number of distinct values a column may
            have and still be plotted.
    """
    title_prefix = 'Bar Plot for'
    for name in df.columns:
        series = df[name]
        n_unique = series.nunique()
        if n_unique <= max_unique_values:
            plt.figure(figsize=(8, 6))
            axes = series.value_counts().plot(kind='bar', color='skyblue')
            axes.set_title(f'{title_prefix} {name}')
            axes.set_xlabel(name)
            axes.set_ylabel("Count")
            plt.tight_layout()
            plt.show()
        else:
            print(f"Skipping '{name}' column due to high unique values ({n_unique})")

In [None]:
create_bar_plots(categorical)

In [None]:
def create_target_dist_bar_plots(data_frame, target_label='EverDelinquent', max_unique_values=21):
    """Plot category counts of each object-dtype column, grouped by target class.

    For every object-dtype column other than ``target_label`` one figure is
    drawn with a group of bars per category and one bar per target value.
    Columns with more than ``max_unique_values`` distinct values are skipped
    with a printed message.

    Args:
        data_frame: DataFrame containing the categorical columns and the target.
        target_label: Name of the target column used to split the counts.
        max_unique_values: Largest number of distinct values a column may
            have and still be plotted.
    """
    categorical_columns = [
        col for col in data_frame.columns
        if data_frame[col].dtype == 'object' and col != target_label
    ]

    for column in categorical_columns:
        unique_values = data_frame[column].nunique()
        if unique_values > max_unique_values:
            print(f"Skipping '{column}' column due to high unique values ({unique_values})")
            continue

        # Cross-tabulate so every category gets one bar per target class on a
        # shared x-axis. Drawing each class as a separate bar plot on the same
        # axes stacks the bars on top of each other and misaligns categories
        # whenever a category is missing from one of the classes.
        counts = pd.crosstab(data_frame[column], data_frame[target_label])
        colors = ['blue' if value == 0 else 'red' for value in counts.columns]

        _, ax = plt.subplots(figsize=(8, 6))
        counts.plot(kind='bar', color=colors, ax=ax)
        ax.set_title(f'Distribution of {column} by {target_label}')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')
        ax.legend(title=target_label)
        plt.tight_layout()
        plt.show()

In [None]:
create_target_dist_bar_plots(loan_export)

### 2. Numerical plots

In [None]:
numerical_describtion.columns

In [None]:
def create_numerical_plots(data_frame):
    """Draw a histogram, box plot and scatter plot for every numeric column.

    One three-panel figure is produced per numeric column of ``data_frame``.

    Args:
        data_frame: DataFrame whose numeric columns are plotted.
    """
    numerical_columns = [col for col in data_frame.columns if pd.api.types.is_numeric_dtype(data_frame[col])]

    for column in numerical_columns:
        # Same width as the per-target variant so three panels are not cramped.
        plt.figure(figsize=(18, 6))

        plt.subplot(1, 3, 1)
        sns.histplot(data_frame[column], bins=20, color='skyblue')
        plt.title(f'Histogram of {column}')

        plt.subplot(1, 3, 2)
        sns.boxplot(data_frame[column], palette='pastel')
        plt.title(f'Box Plot of {column}')

        plt.subplot(1, 3, 3)
        sns.scatterplot(data_frame[column])
        # This panel previously had no title, unlike its two neighbours.
        plt.title(f'Scatter Plot of {column}')

        plt.tight_layout()
        plt.show()



In [None]:
create_numerical_plots(loan_export)

In [None]:
def plot_correlation(data_frame):
    """Show an annotated heatmap of the pairwise correlations in ``data_frame``.

    Only numeric columns take part in the correlation.

    Args:
        data_frame: DataFrame whose numeric columns are correlated.
    """
    # Recent pandas versions raise in ``corr()`` when a non-numeric column is
    # present, so restrict the frame explicitly instead of relying on the
    # caller to pre-filter it.
    corr_matrix = data_frame.select_dtypes(include='number').corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".2f")
    plt.title("Correlation Matrix")
    plt.show()

In [None]:
plot_correlation(numerical)

In [None]:
def create_numerical_plots_vs_target(data_frame, target_label='EverDelinquent'):
    """Plot every numeric feature against the target column.

    For each numeric column other than ``target_label`` a three-panel figure
    is drawn: a histogram of the feature, a box plot of the feature per
    target value, and a scatter plot of the feature against the target.

    Args:
        data_frame: DataFrame containing the features and the target column.
        target_label: Name of the target column.
    """
    # Exclude the target itself: plotting it against itself adds a figure
    # that carries no information.
    numerical_columns = [
        col for col in data_frame.columns
        if col != target_label and pd.api.types.is_numeric_dtype(data_frame[col])
    ]

    for column in numerical_columns:
        plt.figure(figsize=(18, 6))

        plt.subplot(1, 3, 1)
        sns.histplot(data_frame[column], bins=20, color='skyblue')
        plt.title(f'Histogram of {column}')

        plt.subplot(1, 3, 2)
        sns.boxplot(x=data_frame[target_label], y=data_frame[column], palette='pastel')
        plt.title(f'Box Plot of {column} by {target_label}')

        plt.subplot(1, 3, 3)
        sns.scatterplot(data=data_frame, x=column, y=target_label, hue=target_label, palette='coolwarm')
        plt.title(f'Scatter Plot of {column} vs. {target_label}')

        plt.tight_layout()
        plt.show()

In [None]:
create_numerical_plots_vs_target(loan_export)

# 🧑‍💻 Preprocessing

### 1. Drop Irrelevant Columns

* Drop irrelevant columns:
    - The dataset contains information that is unavailable at the time of the loan application. We drop these columns before starting the analysis: `FirstPaymentDate`, `MaturityDate`, `MIP`, `OrigUPB`, `OrigInterestRate`, `PPM`.
    - Additional columns I decided to drop:
        - `LTV`: it is correlated with `OCLTV`, so one of the two must be dropped.
        - `SellerName`: it has 24,994 NaN values out of 291,451 rows.
        - `OrigLoanTerm`: it is correlated (both positively and negatively) with two other columns.
        - `PropertyState`: this information is already encoded in the `MSA` column.
        - `LoanSeqNum`: a unique ID assigned to each loan. As it provides no information, we drop this column.
        - `ServicerName`: it depends on loan activity, and since this information is not available at the time of the loan request, we drop this column.
        - `ProductType`: this column has the same value for all rows, so it provides no useful information.

In [None]:
loan_export_copy = loan_export.copy()

In [None]:
def drop_columns(df):
    """Remove the fixed set of unwanted loan columns from ``df`` in place.

    Args:
        df: DataFrame containing every column listed below.

    Returns:
        The same ``df`` object, with the listed columns removed.
    """
    unwanted = [
        'LTV', 'SellerName', 'OrigLoanTerm', 'FirstPaymentDate',
        'MaturityDate', 'MIP', 'OrigUPB', 'OrigInterestRate',
        'PPM', 'PropertyState', 'LoanSeqNum', 'ServicerName',
        'ProductType',
    ]
    df.drop(columns=unwanted, inplace=True)
    return df

In [None]:
loan_export_copy = drop_columns(loan_export_copy)

### 2. Missing Values

In [None]:
categorical.FirstTimeHomebuyer.value_counts()

In [None]:
# replace X with NA
a = categorical['FirstTimeHomebuyer'].values
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
categorical['FirstTimeHomebuyer'] = np.where(a == 'X', np.nan, a)

In [None]:
categorical.FirstTimeHomebuyer.value_counts()

In [None]:
categorical.FirstTimeHomebuyer.isna().sum()

In [None]:
categorical.MSA.value_counts()

In [None]:
categorical['MSA'] = categorical['MSA'].replace('X    ', 'X')

In [None]:
# replace X with NA
a = categorical['MSA'].values
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
categorical['MSA'] = np.where(a == 'X', np.nan, a)

In [None]:
categorical['MSA'].value_counts()

In [None]:
def replace_X_with_NAN(df):
    """Replace the placeholder ``'X'`` with NaN in every object-dtype column.

    A cell counts as a placeholder when it equals ``'X'`` after surrounding
    whitespace is stripped. ``df`` is modified in place.

    Args:
        df: DataFrame to clean.

    Returns:
        The same ``df`` object.
    """
    for name in df.select_dtypes(include=['object']).columns:
        is_placeholder = df[name].str.strip() == 'X'
        df.loc[is_placeholder, name] = np.nan
    return df

In [None]:
categorical = replace_X_with_NAN(categorical)

In [None]:
categorical['MSA'].value_counts()

In [None]:
def print_value_counts(data_frame):
    """Print the value counts of every column, separated by a rule of '#'.

    Args:
        data_frame: DataFrame whose columns are summarised.
    """
    separator = "#" * 50
    for name in data_frame.columns:
        counts = data_frame[name].value_counts()
        print(f"Column: {name}")
        print(counts)
        print(separator)

In [None]:
print_value_counts(categorical)

In [None]:
categorical.isna().sum()

In [None]:
numerical.isna().sum()

In [None]:
loan_export_copy = replace_X_with_NAN(loan_export_copy)

In [None]:
loan_export_copy.isna().sum()

In [None]:
columns_with_na = ['MSA', 'FirstTimeHomebuyer', 'PropertyType', 'NumBorrowers', 'PostalCode']

In [None]:
print_value_counts(loan_export_copy[columns_with_na])

In [None]:
create_bar_plots(loan_export_copy[columns_with_na])

In [None]:
def preprocess_loan_export(data_frame):
    """Fill or drop the missing values of the loan frame, in place.

    Missing values are replaced as follows:
    ``FirstTimeHomebuyer`` -> ``'N'``, ``PropertyType`` -> ``'Not Available'``,
    ``NumBorrowers`` -> ``'Not Available'``, ``MSA`` -> ``0``.
    Rows with a missing ``PostalCode`` are removed.

    Args:
        data_frame: DataFrame containing the five columns named above.

    Returns:
        The same ``data_frame`` object, modified in place.
    """
    fill_values = {
        'FirstTimeHomebuyer': 'N',
        'PropertyType': 'Not Available',
        'NumBorrowers': 'Not Available',
        'MSA': 0,
    }
    for column, value in fill_values.items():
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on the column selection: the in-place form
        # may act on a temporary copy (it is a no-op under pandas
        # Copy-on-Write), leaving the frame unchanged.
        data_frame[column] = data_frame[column].fillna(value)

    data_frame.dropna(subset=['PostalCode'], inplace=True)
    return data_frame

In [None]:
loan_export_copy = preprocess_loan_export(loan_export_copy)

In [None]:
loan_export_copy.info()

In [None]:
loan_export_copy.isna().sum()

### 3. Dealing With Categorical

In [None]:
cat = loan_export_copy.select_dtypes(include=['object'])

In [None]:
cat

In [None]:
to_integer = 'MSA', 'PostalCode', 'NumBorrowers'

In [None]:
def convert_columns_to_integer(data_frame, columns):
    """Convert the given columns to numeric, in place.

    Values that cannot be parsed become NaN (``errors='coerce'``); the result
    is downcast to the smallest integer dtype when possible.

    Args:
        data_frame: DataFrame to modify.
        columns: Iterable of column names to convert.

    Returns:
        The same ``data_frame`` object.
    """
    def _as_integer(series):
        return pd.to_numeric(series, errors='coerce', downcast='integer')

    for name in columns:
        data_frame[name] = _as_integer(data_frame[name])
    return data_frame

In [None]:
loan_export_copy = convert_columns_to_integer(loan_export_copy, to_integer)

In [None]:
loan_export_copy.info()

In [None]:
one_hot_encoding = ['FirstTimeHomebuyer', 'Occupancy', 'Channel', 'PropertyType', 'LoanPurpose', 'NumBorrowers']

In [None]:
def one_hot_encode_columns(data_frame, columns):
    """Return a copy of ``data_frame`` with ``columns`` one-hot encoded.

    Args:
        data_frame: Source DataFrame (left unmodified).
        columns: Names of the columns to expand into indicator columns.

    Returns:
        A new DataFrame produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(data_frame, columns=columns)

In [None]:
loan_export_copy = one_hot_encode_columns(loan_export_copy, one_hot_encoding)

In [None]:
loan_export_copy.info()

### 4. Preparing Data for Models

In [None]:
features, y = loan_export_copy.drop(['EverDelinquent'], axis=1), loan_export_copy.EverDelinquent.values

In [None]:
# Standardize every feature column.
# NOTE(review): the scaler is fitted on the full feature matrix, before any
# train/validation/test split, so held-out rows contribute to the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()
features = scaler.fit_transform(features)

# 🤖 Modeling

In [None]:
# Helper Functions
def plot_roc_curve(y_true, y_pred_probs):
    """Plot the ROC curve for the given labels and scores.

    The legend shows the area under the curve and a dashed diagonal is drawn
    for reference.

    Args:
        y_true: True binary labels.
        y_pred_probs: Scores or probabilities passed to ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_probs)
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    # Reference diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

def plot_learning_curve(model, X, y, cv, check_points):
    """Plot training and validation error against training-set size.

    Errors are the negated ``neg_mean_squared_error`` scores returned by
    ``learning_curve``, averaged over the cross-validation folds.

    Args:
        model: Estimator handed to ``learning_curve``.
        X: Feature matrix.
        y: Target vector.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        check_points: Number of evenly spaced training-size fractions between
            0.1 and 1.0 to evaluate.
    """
    fractions = np.linspace(0.1, 1.0, check_points)
    sizes, train_scores, val_scores = learning_curve(
        model, X, y, cv=cv, scoring='neg_mean_squared_error', train_sizes=fractions
    )

    # Scores are negated MSE; flip the sign and average across folds.
    mean_train_error = -train_scores.mean(axis=1)
    mean_val_error = -val_scores.mean(axis=1)

    curves = [
        (mean_train_error, 'r', 'Training error'),
        (mean_val_error, 'g', 'Validation error'),
    ]

    plt.figure()
    for errors, colour, label in curves:
        plt.plot(sizes, errors, 'o-', color=colour, label=label)
    plt.xlabel('Training Examples')
    plt.ylabel('Mean Squared Error')
    plt.title('Learning Curve')
    plt.legend(loc='best')
    plt.show()

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
    """
    class_names = ['Class 0', 'Class 1']  # Replace with class labels
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(
        annot=True, fmt='d', cmap='Blues',
        xticklabels=class_names, yticklabels=class_names,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    
def plot_train_val_tradeoff(y_train_true, y_train_pred, y_val_true, y_val_pred):
    """Compare the training and validation mean squared error side by side.

    Each error is a single number, so the two values are drawn as bars.
    (Line plots over an "Iterations" axis would show one isolated point per
    series and suggest a training history that does not exist.)

    Args:
        y_train_true: True targets of the training split.
        y_train_pred: Predictions for the training split.
        y_val_true: True targets of the validation split.
        y_val_pred: Predictions for the validation split.
    """
    train_error = mean_squared_error(y_train_true, y_train_pred)
    val_error = mean_squared_error(y_val_true, y_val_pred)

    split_names = ['Training', 'Validation']
    split_errors = [train_error, val_error]

    # The second panel shows the same errors labelled as a loss, for
    # illustration purposes.
    panels = [
        ('Mean Squared Error', 'Training vs. Validation Error'),
        ('Loss', 'Training vs. Validation Loss'),
    ]

    plt.figure(figsize=(10, 5))
    for position, (ylabel, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.bar(split_names, split_errors, color=['tab:red', 'tab:green'])
        plt.xlabel('Dataset')
        plt.ylabel(ylabel)
        plt.title(title)

    plt.tight_layout()
    plt.show()


def save_model_to_h5(model, filename):
    """Serialize ``model`` to ``filename`` using ``joblib.dump``.

    NOTE(review): despite the function name (and the ``.h5`` file names the
    callers in this file pass), nothing here writes HDF5; the file is
    whatever ``joblib.dump`` produces and presumably has to be read back
    with ``joblib.load``.

    Args:
        model: Object to persist.
        filename: Destination path.
    """
    joblib.dump(model, filename)


### 1. Logistic Regression

##### a. Split data and make baseline

In [None]:
# 70% train / 15% validation / 15% test. Stratify both splits so each part
# keeps the class ratio of the imbalanced target.
X_train, X_temp, y_train, y_temp = train_test_split(
    features, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True, stratify=y_temp
)

# Standardize with statistics taken from the training split only, so the
# validation and test rows do not influence the scaling parameters.
# Standardization is invariant to a prior affine rescaling of a column, so
# the result is the same whether or not `features` was already scaled as a
# whole beforehand.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train)
X_val = split_scaler.transform(X_val)
X_test = split_scaler.transform(X_test)

In [None]:
# Unregularized baseline. The string 'none' was deprecated in scikit-learn 1.2
# and removed in 1.4; the supported spelling is the `None` object.
LR_model = LogisticRegression(penalty=None, max_iter=1000, verbose=1)  # No regularization
LR_model.fit(X_train, y_train)

In [None]:
# evaluate on validation set
y_val_pred = LR_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# evaluate on test set
y_test_pred = LR_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

##### b. Display the baseline performance

In [None]:
y_train_pred_probs = LR_model.predict_proba(X_train)[:, 1]
y_val_pred_probs = LR_model.predict_proba(X_val)[:, 1]
y_train_pred = LR_model.predict(X_train)
y_val_pred = LR_model.predict(X_val)

In [None]:
plot_roc_curve(y_train, y_train_pred_probs)

In [None]:
plot_learning_curve(LR_model, X_train, y_train, 5, 10)

In [None]:
plot_confusion_matrix(y_train, y_train_pred)

In [None]:
plot_confusion_matrix(y_val, y_val_pred)

##### c. Search for the best fit line & regularize the model

In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}
# The default 'lbfgs' solver cannot fit an L1 penalty, so every 'l1'
# candidate in the grid would fail to fit. 'liblinear' supports both
# penalties searched here.
LR_model = LogisticRegression(solver='liblinear', max_iter=1000, verbose=1)
grid_search_LR = GridSearchCV(LR_model, param_grid, cv=5, scoring='accuracy')
grid_search_LR.fit(X_train, y_train)

In [None]:
LR_best_model = grid_search_LR.best_estimator_
y_val_pred = LR_best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Best Hyperparameters:", grid_search_LR.best_params_)

In [None]:
# Score the tuned logistic regression on the held-out test split.
y_test_pred = LR_best_model.predict(X_test)
test_accuracy_LR = accuracy_score(y_test, y_test_pred)
classification_rep = classification_report(y_test, y_test_pred)

# Report the accuracy computed just above; `test_accuracy` still holds the
# value from an earlier cell.
print("Test Accuracy:", test_accuracy_LR)
print("Classification Report:\n", classification_rep)

##### d. Select the final model

In [None]:
final_model_LR = grid_search_LR.best_estimator_

In [None]:
# save the model
save_model_to_h5(final_model_LR, 'final_logistic_model.h5')

### 2. Support Vector Machine

##### a. Split data and make baseline

In [None]:
SVC_model = SVC(verbose=2, probability=True, class_weight='balanced')
SVC_model.fit(X_train[:20000], y_train[:20000])

y_train_pred = SVC_model.predict(X_train[:20000])
y_val_pred = SVC_model.predict(X_val)

In [None]:
train_accuracy = accuracy_score(y_train[:20000], y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)

##### b. Display the baseline performance

In [None]:
y_train_pred_probs = SVC_model.predict_proba(X_train[:20000])[:, 1]
y_val_pred_probs = SVC_model.predict_proba(X_val)[:, 1]
y_train_pred = SVC_model.predict(X_train[:20000])
y_val_pred = SVC_model.predict(X_val)

In [None]:
plot_roc_curve(y_train[:20000], y_train_pred_probs)

In [None]:
plot_learning_curve(SVC_model, X_train[:20000], y_train[:20000], 3, 5)

In [None]:
plot_confusion_matrix(y_train[:20000], y_train_pred)

In [None]:
plot_confusion_matrix(y_val, y_val_pred)

As we can see, the predictions are skewed toward one of the classes; because the classes are imbalanced, we should assign a weight to each class.

##### c. Search for the best fit line & regularize the model

In [None]:
pca = PCA()
pca.fit(X_val)
# Plot the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance_ratio = np.cumsum(explained_variance_ratio)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(cumulative_explained_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio vs. Number of Components')
plt.grid()
plt.show()

In [None]:
# n_components = 22 # Choose the number of principal components
# pca = PCA(n_components=n_components)
# X_pca = pca.fit_transform(X_val)

In [None]:
param_grid = {
    'C': [0.1, 10],
    'kernel': ['poly', 'rbf'],
}
f1_scorer = make_scorer(f1_score)

In [None]:
# Cross-validated search over `param_grid`, scored with `f1_scorer`.
# NOTE(review): the search (and therefore the refitted best estimator) is
# trained on the validation split, not on the training split — confirm this
# is intentional before comparing its scores with the other models.
grid_search_SVC = GridSearchCV(
    SVC_model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=3, 
    verbose=2,
)
grid_search_SVC.fit(X_val, y_val)

In [None]:
SVC_best_model = grid_search_SVC.best_estimator_
y_train_pred = SVC_best_model.predict(X_train[20000:40000])
val_accuracy = accuracy_score(y_train[20000:40000], y_train_pred)
print("Validation Accuracy:", val_accuracy)
print("Best Hyperparameters:", grid_search_SVC.best_params_)

In [None]:
# Score the tuned SVC on the held-out test split.
y_test_pred = SVC_best_model.predict(X_test)
test_accuracy_SVC = accuracy_score(y_test, y_test_pred)
classification_rep = classification_report(y_test, y_test_pred)

# Report the accuracy computed just above; `test_accuracy` still holds the
# value from an earlier cell.
print("Test Accuracy:", test_accuracy_SVC)
print("Classification Report:\n", classification_rep)

##### d. Select the final model

In [None]:
final_model_SVC = grid_search_SVC.best_estimator_

In [None]:
# save the model
save_model_to_h5(final_model_SVC, 'final_SVC_model.h5')

### 3. Gaussian Discriminant Analysis

##### a. Split data and make baseline

In [None]:
GDA_model = LinearDiscriminantAnalysis()
GDA_model.fit(X_train, y_train)
y_train_pred = GDA_model.predict(X_train)
y_val_pred = GDA_model.predict(X_val)

In [None]:
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)

##### b. Display the baseline performance

In [None]:
# Second column of `predict_proba` plus hard predictions from the fitted
# GDA_model, for the training and validation splits.
y_train_pred_probs = GDA_model.predict_proba(X_train)[:, 1]
y_val_pred_probs = GDA_model.predict_proba(X_val)[:, 1]
y_train_pred = GDA_model.predict(X_train)
y_val_pred = GDA_model.predict(X_val)

In [None]:
plot_roc_curve(y_train, y_train_pred_probs)

In [None]:
plot_learning_curve(GDA_model, X_train, y_train, 5, 30)

In [None]:
plot_confusion_matrix(y_train, y_train_pred)

In [None]:
plot_confusion_matrix(y_val, y_val_pred)

##### c. Search for the best fit line & regularize the model

In [None]:
# Apply random under-sampling to balance the class distribution
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [None]:
# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers; pairing it
# with 'svd' makes those candidates fail to fit. Use two sub-grids so only
# valid solver/shrinkage combinations are searched.
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.1, 0.5, 0.9]},
]
grid_search_GDA = GridSearchCV(
    GDA_model,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
)
grid_search_GDA.fit(X_resampled, y_resampled)

In [None]:
GDA_best_model = grid_search_GDA.best_estimator_
y_val_pred = GDA_best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Best Hyperparameters:", grid_search_GDA.best_params_)

In [None]:
# Score the tuned discriminant-analysis model on the held-out test split.
y_test_pred = GDA_best_model.predict(X_test)
test_accuracy_GDA = accuracy_score(y_test, y_test_pred)
classification_rep = classification_report(y_test, y_test_pred)

# Report the accuracy computed just above; `test_accuracy` still holds the
# value from an earlier cell.
print("Test Accuracy:", test_accuracy_GDA)
print("Classification Report:\n", classification_rep)

##### d. Select the final model

In [None]:
final_model_GDA = grid_search_GDA.best_estimator_

In [None]:
save_model_to_h5(final_model_GDA, 'final_GDA_model.h5')

### 4. Feed-Forward Neural Network

##### a. Split data and make baseline

In [None]:
# Take the input width from the data instead of hard-coding it, so the network
# keeps matching the feature matrix if the preprocessing changes the number
# of columns.
n_features = X_train.shape[1]
NN_model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(n_features,)),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
NN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = NN_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

In [None]:
history.history

In [None]:
# evaluate on validation set: report the last epoch, which corresponds to the
# weights the model actually ends up with (averaging over all epochs mixes in
# the early, less trained states).
print("Validation Accuracy:", history.history['val_accuracy'][-1])

# evaluate on test set
loss, acc = NN_model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", acc)

##### b. Display the baseline performance

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
y_train_pred = (NN_model.predict(X_train) > 0.5).astype("int32").ravel()
y_val_pred = (NN_model.predict(X_val) > 0.5).astype("int32").ravel()

In [None]:
# `Sequential.predict_proba` was removed in TensorFlow 2.6; `predict` already
# returns the sigmoid output, one column per output unit.
y_pred_proba = NN_model.predict(X_train)[:, 0]
plot_roc_curve(y_train, y_pred_proba)

In [None]:
plot_confusion_matrix(y_train, y_train_pred)

In [None]:
plot_confusion_matrix(y_val, y_val_pred)

##### c. Search for the best fit line & regularize the model

In [None]:
# Take the input width from the data instead of hard-coding it.
n_features = X_train.shape[1]

# Calculate class weights manually: each class is weighted by
# total_samples / class_count, so the rarer class gets the larger weight.
class_counts = np.bincount(y_train)
total_samples = class_counts.sum()
class_weights = {cls: total_samples / count for cls, count in enumerate(class_counts)}

# Stop once the validation loss has not improved for 5 epochs and roll back
# to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Same layer widths as the baseline network, with L2 weight regularization on
# every hidden layer and dropout after each of them.
NN_model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(n_features,), kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(16, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
NN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = NN_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_val, y_val),
                       class_weight=class_weights, callbacks=[early_stopping])


In [None]:
NN_model.summary()

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.plot(history.history['accuracy'], label='Train acc')
plt.plot(history.history['val_accuracy'], label='Validation acc')
plt.title('Training and Validation Loss & Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss & Accuracy')
plt.legend()
plt.show()

In [None]:
# Evaluate the model on the test set
loss, accuracy = NN_model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

##### d. Select the final model

In [None]:
model_filename = 'NN_model.h5'
NN_model.save(model_filename)

# 📰 Summary

In [None]:
# Create a DataFrame to store the summary table: one row per model with its
# test accuracy (in percent) and the hyperparameters that produced it.
model_names = ['Logistic Regression', 'Support Vector Machine', 'Gaussian Discriminant Analysis', 'Fead-Forward Neural Network']

hyperparameters = [
    grid_search_LR.best_params_,
    grid_search_SVC.best_params_,
    grid_search_GDA.best_params_,
    {'n_features': n_features,  # the width the network was actually built with
    'l2_regularization_strength': 0.01,
    'dropout_rate': 0.5,
    'optimizer': 'adam',
    'loss_function': 'binary_crossentropy',
    'metrics': ['accuracy'],
    'epochs': 50,
    'batch_size': 64,
    'class_weights': class_weights,  # Calculated manually
    'early_stopping_patience': 5,
    }
]
test_accuracy = [
    test_accuracy_LR * 100,
    test_accuracy_SVC * 100,
    test_accuracy_GDA * 100,
    accuracy * 100
]
summary_df = pd.DataFrame({
    'Model': model_names,
    'Test_Accuracy': test_accuracy,
    'Hyperparameters': hyperparameters,
})

display(summary_df)