In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import random
# from tensorflow.keras.models import load_model
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
data = pd.read_csv('data/MetroPT3(AirCompressor).csv', index_col = False)
data.head()

# 1. Data Cleaning and Preprocessing 

According to the documentation, the following preprocessing steps have been conducted before publishing the data:

- Data segmentation
- Normalization
- Feature Extraction

Thus, we do not need to apply them in our work.

### 1) Overview 

In [None]:
print(f'number of null values: {data.isna().sum().sum()}')

In [None]:
print(f'number of duplicates: {data.duplicated().sum()}')

In [None]:
print(f'shape: {data.shape}')

### 2) drop unnecessary columns:


In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index written out with the CSV — TODO confirm).
data.drop(['Unnamed: 0'], axis = 1, inplace = True)

### 3) Add a label Column 
From the failure information table provided in the data description file (shown below), we will label the data so that we can evaluate the effectiveness of failure prediction algorithms: 

![alt text](image.png)

In [None]:
labeled_data = data.copy()
labeled_data['status'] = 0

#### Converting the timestamp column into pandas.DateTime data type


In [None]:
# converting the timestamp to datetime
labeled_data['timestamp'] = pd.to_datetime(labeled_data['timestamp'], format = '%Y-%m-%d %H:%M:%S')
print("current data type of timestamp: ", labeled_data['timestamp'].dtype)

In [None]:
#define function to convert time to pandas.dateTime 
def convert_time(X):
    """Parse an iterable of 'YYYY-MM-DD HH:MM:SS' strings into a list of pandas Timestamps."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return [pd.to_datetime(raw, format = timestamp_format) for raw in X]

failure_start_time = convert_time(["2020-04-18 00:00:00", "2020-05-29 23:30:00", "2020-06-05 10:00:00", "2020-07-15 14:30:00"])
failure_end_time = convert_time(["2020-04-18 23:59:00", "2020-05-30 06:00:00", "2020-06-07 14:30:00", "2020-07-15 19:00:00"])


In [None]:
# Flag every record whose timestamp falls inside a reported failure window (bounds inclusive)
for start, end in zip(failure_start_time, failure_end_time):
    in_window = labeled_data['timestamp'].between(start, end)
    labeled_data.loc[in_window, 'status'] = 1
    # sanity check: a count of 0 means no record matched this window
    print(f"number of failures between {start} and {end}: {labeled_data.loc[in_window, 'status'].sum()}")

print(f"number of failures: {labeled_data['status'].sum()}")

In [None]:
# show the first few rows labeled as failure (status == 1) to sanity-check the labeling
print(f"Example of Failure state \n {labeled_data[labeled_data['status']==1].head()}")


### 4) Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# order the records chronologically, then separate the target column from the features
labeled_data.sort_values('timestamp', inplace = True)
y = labeled_data.status
X = labeled_data.copy().drop(columns=['status'])

# Random 75/25 split with a fixed seed for reproducibility.
# NOTE(review): shuffle=True mixes rows from all time periods into both sets, so the
# chronological order produced by the sort above is not kept by the split — confirm that a
# random (rather than time-based) split is intended for this time series.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                   random_state=42, 
                                   test_size=0.25, 
                                   shuffle=True)

# report the resulting shapes
print(f'shape of X_train: {X_train.shape}')
print(f'shape of X_test: {X_test.shape}')
print(f'shape of y_train: {y_train.shape}')
print(f'shape of y_test: {y_test.shape}')

### 5) Balancing Data

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

The number of negative samples (normal cases) is far larger than the number of positive samples (around 22k positive samples versus 1100k negative samples), so we are dealing with an imbalanced dataset. This is expected in a predictive maintenance problem, where failures are rare.  
To address this issue, we ought to balance our data. There are various techniques to balance it. Here is a overview about some of them:  
* **Undersampling:** reduces the number of instances in the majority class to match the number of minority class instances by randomly selecting them. It is the fastest and most intuitive technique.
* **Oversampling:** increases the number of instances of the minority class by replicating or generating new instances.  
* **SMOTE (Synthetic Minority Oversampling Technique):** generates synthetic instances by interpolating between existing instances in the minority class.  
* **SMOTE Tomek:** identifies Tomek links (pairs of instances from different classes) and removes majority class instances from the pairs while oversampling the minority class using SMOTE. This technique takes significantly more time than the others.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

# Random undersampling is the strategy actually used here; the alternative samplers
# are kept below (commented out) for experimentation.
rus = RandomUnderSampler(random_state=42)
# ros = RandomOverSampler(random_state=42)
# smote = SMOTE(random_state=42)
# smote_tomek = SMOTETomek(random_state=42)

# Resample the training split only; X_test / y_test are not touched by this cell.
X_train_balanced, y_train_balanced = rus.fit_resample(X_train, y_train)

# reassemble the dataset for further preprocessing and EDA
balanced_train = X_train_balanced.copy()
balanced_train['status'] = y_train_balanced
balanced_train.head(3)

`balanced_train` is supposed to be balanced now. Let us check it:

In [None]:
y_train_balanced.value_counts()

In [None]:
# Class counts before/after undersampling, ordered by class label (0 = normal, 1 = failure)
# so that every pie slice is guaranteed to get the matching label. value_counts() alone
# orders by frequency, and the order of tied counts (the balanced case) is not guaranteed.
class_labels = {0: 'Negative', 1: 'Positive'}

# value counts from the imbalanced dataset
imbalanced_class_counts = y_train.value_counts().sort_index()

# value counts from the balanced dataset
balanced_class_counts = y_train_balanced.value_counts().sort_index()

# plot pie charts to show the class distribution difference
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, counts, title in zip(
    axes,
    (imbalanced_class_counts, balanced_class_counts),
    ('Before Undersampling', 'After Undersampling'),
):
    ax.pie(
        counts,
        # derive the labels from the index instead of assuming a fixed slice order
        labels = [class_labels[label] for label in counts.index],
        autopct = '%1.1f%%',
        startangle = 90,
        colors = ['lightpink', 'lightblue']
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
balanced_train.info()

### 6) Checking for outliers 


In [None]:
def identify_outliers(data, column):
    """Return the rows of `data` whose `column` value lies outside the 1.5 * IQR fences.

    The number of rows found is also printed.
    """
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    # strictly below the lower fence or strictly above the upper fence
    is_outlier = values.lt(q1 - fence) | values.gt(q3 + fence)
    outliers = data[is_outlier]
    print(f"Number of outliers in {column}: {outliers.shape[0]}")
    return outliers

# def remove_outliers(data, column):
#     Q1 = data[column].quantile(0.25)
#     Q3 = data[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     outliers_removed = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
#     num_removed = len(data) - len(outliers_removed)
#     print(f"Number of outliers removed from {column}: {num_removed}\n")
#     return outliers_removed

# First, identify outliers
for col in balanced_train:
    if col not in ['timestamp', 'status']:
        outliers = identify_outliers(balanced_train, col)

the features: ['COMP', 'DV_eletric','Towers', 'MPG','LPS','Pressure_switch','Oil_level','Caudal_impulses'] are binary features. So we do not remove outliers.

In [None]:
# Investigate the columns with the binary values
binary_cols = ['LPS', 'Pressure_switch', 'Oil_level', 'Caudal_impulses']
# Ensure the the binary data is binary
balanced_train[binary_cols] = balanced_train[binary_cols].apply(np.round)

In [None]:
# count the number of unique values in each column
for col in balanced_train.columns:
    print(f"number of unique values in {col}: {balanced_train[col].nunique()}")

# 2. Exploratory data analysis

### 1) Correlation

In [None]:
# Pairwise correlation between the columns of the balanced training set (target included).
# NOTE(review): balanced_train still contains the datetime 'timestamp' column at this point;
# how corr() treats a non-numeric column appears to depend on the pandas version — confirm
# whether it should be dropped explicitly before computing the correlation.
correlation = balanced_train.corr()
plt.figure(figsize = (18, 10))
sns.heatmap(correlation, annot = True, fmt='.2f', cmap = 'coolwarm')
plt.title('Correlation Matrix')
plt.show()

From the above correlation heatmap,  we can see that our target feature **"status"** has a strong correlation with these features: TP2, H1, DV_pressure, Oil_temparature, Motor_current, COMP, DV_electric and MPG.

### 2) Visualization

1. Outliers

In [None]:
# visualize all the features outliers in one plot 
sns.set(rc={'figure.figsize':(20,8.27)})
# sns.boxplot(data = balanced_train.drop(['timestamp', 'status'], axis = 1))
sns.boxplot(data = balanced_train.drop(['status'], axis = 1))
# plt.xticks(rotation = 45)
plt.title('Boxplot of all features')
plt.show()

2. Probability distribution


In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module='seaborn')

#visualize the probability distribution of all the features
def plot_col_distribution(data, n_cols = 4):
    """Plot a histogram with a KDE overlay for every column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted. It is not modified.
    n_cols : int, default 4
        Number of subplot columns; as many rows as needed are created.
    """
    columns = list(data.columns)
    if not columns:
        return  # nothing to draw
    n_rows = -(-len(columns) // n_cols)  # ceiling division
    # 5 x 2.5 inches per subplot, i.e. the 20 x 10 figure of a 4 x 4 grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (5 * n_cols, 2.5 * n_rows))
    axes = np.atleast_1d(axes).flatten()
    for ax, col in zip(axes, columns):
        # replace infinities on a temporary Series so the caller's frame is left untouched
        finite_values = data[col].replace([np.inf, -np.inf], np.nan)
        sns.histplot(finite_values, ax = ax, kde=True)
        ax.set_title(f'Distribution of {col}')
    # hide the grid cells that have no column to show
    for ax in axes[len(columns):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
    
plot_col_distribution(balanced_train.drop(['timestamp', 'status'], axis = 1))
# plot_col_distribution(balanced_train.drop(['status'], axis = 1))


3. Time series plot

In [None]:
balanced_train.iloc[:,:16]

In [None]:
# reorganize according to timestamp 
balanced_train.sort_values('timestamp', inplace = True)

In [None]:
# Plot the time series
balanced_train.iloc[:,:16].plot(
        subplots =True,
        layout=(6, 3),
        figsize=(22,22),
        fontsize=10, 
        linewidth=1,
        sharex = False, 
        title='Visualization of the Original Time Series')
plt.show()

# 3. Modeling

#### **Taking out the Timestamp**

In [None]:
X_train_balanced = X_train_balanced.drop(columns=['timestamp'])
X_test = X_test.drop(columns=['timestamp'])
X_train = X_train.drop(columns=['timestamp'])

In [None]:
y_train_balanced.value_counts()

#### **Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The per-feature min/max are learned from the balanced training data only.
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train_balanced)
# transform() only: refitting on the test set would rescale it with its own min/max,
# putting train and test features on different scales and leaking test statistics.
X_test_scaled = scaler.transform(X_test)

# wrap the arrays back into DataFrames carrying the column names of their source frame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_balanced.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_scaled.describe()

In [None]:
y_test.value_counts()

#### **Feature Importance**

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# univariate feature selection: every feature is scored against the target with f_classif
k = 7  # Adjust this value as needed
selector = SelectKBest(score_func=f_classif, k=k)
# X_selected holds the k best-scoring columns; this cell itself only uses selector.scores_
X_selected= selector.fit_transform(X_train_balanced, y_train_balanced)

features = X_train_balanced.columns

# Create a DataFrame with feature names and their F-statistic scores
feature_scores = pd.DataFrame({'Feature': features, 'F-Score': selector.scores_})
feature_scores = feature_scores.sort_values(by='F-Score', ascending=False)

# Plot the feature scores, highest first
plt.figure(figsize=(10, 6))
sns.barplot(x='F-Score', y='Feature', data=feature_scores, palette='viridis')
plt.title('Feature Importance (F-Scores)')
plt.xlabel('F-Score')
plt.ylabel('Feature')
plt.show()

#### **Removing highly correlated features (TP3, TP2, MPG, Pressure_switch and LPS)**

In [None]:
features_to_remove = ['TP3', 'TP2', 'MPG', 'Pressure_switch', 'LPS']

# Drop the same columns, in place, from every feature matrix used for modeling
for frame in (X_test, X_train_balanced, X_train, X_test_scaled, X_train_scaled):
    frame.drop(columns=features_to_remove, inplace=True)

X_test.head(3)

#### **Evaluation Protocol**

Reminder that the aim of this study is to predict failures and the need of maintenance in an urban metro public transportation service. To assess the fit and how good each model performed, we will evaluate it using the following metrics:

* **Accuracy:** measures the overall correctness of the model since our data is balanced:

$$ Accuracy = \frac{TP + TN}{TP + TN + FP + FN} $$

* **Precision:** the goal of it is to maximize the positive (failure) predictions when they were originally failures (TP) and minimize false calls (FP) which are non-failure values predicted as failure by the model:

$$ Precision = \frac{TP}{TP + FP} $$

* **F1 Score**: harmonic mean of Precision and Recall:

$$ F1 Score = 2 \times \frac{Precision \times Recall}{Precision + Recall} $$

However, we will mostly focus on the $F1$ and $Precision$ scores to evaluate and validate our models and to deduce what is the model that best fits our data and generalizes to unseen data.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

We maintain a dataframe `scores` to hold the scores of each model we will study:

In [None]:
scores = pd.DataFrame(columns=['model', 'accuracy', 'precision', 'f1'])
scores # should be empty

### 1) Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters (seeded for reproducibility)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# predict once per split and reuse the predictions for every metric
dt_train_preds = dt.predict(X_train)
dt_preds = dt.predict(X_test)

# metrics on the training split
train_accuracy_dt = accuracy_score(y_train, dt_train_preds)
train_precision_dt = precision_score(y_train, dt_train_preds)
train_f1_dt = f1_score(y_train, dt_train_preds)

# metrics on the test split
test_accuracy_dt = accuracy_score(y_test, dt_preds)
test_precision_dt = precision_score(y_test, dt_preds)
test_f1_dt = f1_score(y_test, dt_preds)

print('accuracy:')
print(f'  train: {train_accuracy_dt}')
print(f'  test: {test_accuracy_dt}')

print('\nprecision:')
print(f'  train: {train_precision_dt}')
print(f'  test: {test_precision_dt}')

print('\nf1:')
print(f'  train: {train_f1_dt}')
print(f'  test: {test_f1_dt}')

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    dt, X_train, y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, scoring='f1'
)

# Plot the learning curve
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training F1 Score')
plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label='Validation F1 Score')
plt.xlabel('Dataset Size')
plt.ylabel('Score')
plt.title('Learning Curve of Decision Tree')
plt.legend(loc='best')
plt.grid(True)
plt.show()

The naive Decision Tree solution performed great. However, we believe that we can still get better results for the precision and f1 scores.

Here, we are going to find the best Decision Tree Classifier. That is, we aim to find its parameters that best maximize the score. This is done by **Random Search**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space: integer hyperparameters are sampled from scipy's randint(low, high),
# whose upper bound is exclusive.
dt_params = {
    'criterion': ['gini'],
    'max_depth': randint(1, 50),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 10)
}

# 100 sampled configurations, each scored by 5-fold cross-validated F1;
# n_jobs=1 runs the fits sequentially.
random_search_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_params,
    n_iter=100,
    cv=5,
    n_jobs=1,
    random_state=42,
    scoring='f1'
)

random_search_dt.fit(X_train, y_train)

# best configuration and its mean cross-validated F1
best_params_dt = random_search_dt.best_params_
best_score_dt = random_search_dt.best_score_

print(f'best decision tree parameters: {best_params_dt}')
print(f'best score (F1): {best_score_dt}')

Now, we evaluate the best **Decision Tree** found by the **Random Search**:

In [None]:
# best_estimator_ was already refit by the search on the full (X_train, y_train) it was
# fitted with (refit=True is the RandomizedSearchCV default), so fitting it again on the
# same data would only repeat that work.
best_dt = random_search_dt.best_estimator_

# predict once per split and reuse the predictions for every metric
best_dt_train_preds = best_dt.predict(X_train)
best_dt_preds = best_dt.predict(X_test)

# accuracy
train_accuracy_best_dt = accuracy_score(y_train, best_dt_train_preds)
test_accuracy_best_dt = accuracy_score(y_test, best_dt_preds)

# precision
train_precision_best_dt = precision_score(y_train, best_dt_train_preds)
test_precision_best_dt = precision_score(y_test, best_dt_preds)

# f1 score
train_f1_best_dt = f1_score(y_train, best_dt_train_preds)
test_f1_best_dt = f1_score(y_test, best_dt_preds)

print(f'accuracy:')
print(f'  train: {train_accuracy_best_dt}')
print(f'  test: {test_accuracy_best_dt}')

print(f'\nprecision:')
print(f'  train: {train_precision_best_dt}')
print(f'  test: {test_precision_best_dt}')

print(f'\nf1:')
print(f'  train: {train_f1_best_dt}')
print(f'  test: {test_f1_best_dt}')

In [None]:
# classification report
print(classification_report(y_test, best_dt_preds))

# confusion matrix
cm = confusion_matrix(y_test, best_dt_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_dt.classes_)
disp.plot()

We can directly notice that the model captures many $True Positive$ instances (samples of the positive class, i.e. anomalies, that are predicted as positive), which is exactly the desired behaviour.

In [None]:
# add to scores dataframe
scores.loc[len(scores)] = ['Decision Tree', test_accuracy_best_dt, test_precision_best_dt, test_f1_best_dt]
scores

In [None]:
# make sure the output folders exist so the cell also works on a fresh checkout
os.makedirs('saved_models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# persist the tuned decision tree together with its test-set predictions
joblib.dump(best_dt, 'saved_models/decision_tree_model.joblib')
df_pred = pd.DataFrame({'Prediction': best_dt_preds})
df_pred.to_csv("predictions/decision_tree_model.csv", index=False)

### 2) Random Forest


A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. Trees in the forest use the best split strategy, i.e. equivalent to passing `splitter="best"` to the underlying DecisionTreeClassifier. The sub-sample size is controlled with the `max_samples` parameter if `bootstrap=True` (default), otherwise the whole dataset is used to build each tree.

**building a failure prediction model**

The goal is to employ the random forest classifier algorithm in order to predict failure labeled as 0 and 1 .

In [None]:
# Baseline random forest with default hyperparameters (seeded for reproducibility)
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# predict once per split and reuse the predictions for every metric
rf_train_preds = rf.predict(X_train)
rf_preds = rf.predict(X_test)

# metrics on the training split
train_accuracy_rf = accuracy_score(y_train, rf_train_preds)
train_precision_rf = precision_score(y_train, rf_train_preds)
train_f1_rf = f1_score(y_train, rf_train_preds)

# metrics on the test split
test_accuracy_rf = accuracy_score(y_test, rf_preds)
test_precision_rf = precision_score(y_test, rf_preds)
test_f1_rf = f1_score(y_test, rf_preds)

print('accuracy:')
print(f'train: {train_accuracy_rf}')
print(f'test: {test_accuracy_rf}')

print('\nprecision:')
print(f'train: {train_precision_rf}')
print(f'test: {test_precision_rf}')

print('\nf1:')
print(f'train : {train_f1_rf}')
print(f'test: {test_f1_rf}')


In [None]:
# classification report
print(classification_report(y_test, rf_preds))

# confusion matrix
cm = confusion_matrix(y_test, rf_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_)
disp.plot()

**hyperparameters of Random Forest** 

Random forest has several hyperparameters that influence its performance: 
- `n_estimators`: The number of trees in the forest. Increasing this value generally improves performance until a certain point

- ` max_depth`: The maximum depth of each tree. Deeper trees may capture more complex relationships but this may lead to an overfitting

- `min_samples_split`: The minimum number of samples required to split an internal node.

- `min_samples_leaf`:  The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least `min_samples_leaf` training samples in each of the left and right branches. This may have the effect of smoothing the model

-  `max_features`: The maximum number of features to consider when looking for the best split.
 

We will fine-tune the model to achieve the best possible performance by adjusting these hyperparameters. Grid search combined with cross-validation is a common technique for finding the best combination of hyperparameter values.

**Random search** is another powerful technique for optimizing the hyperparameters of a model. It works in a similar way to grid search cross-validation (GridSearchCV), but instead of searching over a predefined grid of hyperparameters, it samples them randomly from a distribution,  without becoming too computationally expensive.

In [None]:
# Randomized Search CV
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Tune on the balanced training set. Candidates are ranked by F1 on the failure class,
# the metric used by the other searches in this notebook; without an explicit `scoring`
# the search would rank them by accuracy instead.
Random_search = RandomizedSearchCV(rf, param_dist, cv=5, n_iter=10, n_jobs=-1, random_state=42, scoring='f1')
Random_search.fit(X_train_balanced, y_train_balanced)

# Mean cross-validated score of every sampled candidate
cv_results = Random_search.cv_results_['mean_test_score']

# Get the best params
best_params = Random_search.best_params_

# Refit the best configuration on the full (imbalanced) training set
rf_best = Random_search.best_estimator_
rf_best.fit(X_train, y_train)

# Print the best hyperparameters
print('Best Hyperparameters:')
print(best_params)

# Evaluate the model on the test set
y_pred_rf = rf_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)


# Print results
print("Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))

# Display confusion matrix
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
disp_forest = ConfusionMatrixDisplay(conf_matrix_rf, display_labels=rf_best.classes_)
disp_forest.plot(cmap='Blues', values_format='d')
plt.show()

In [None]:
# add to scores dataframe
scores_rf_best = {
    'accuracy': accuracy,
    'precision': precision,
    'f1': f1
}

scores_rf_best_df = pd.DataFrame(scores_rf_best, index=[0])
scores.loc[len(scores)] = ['Best Random Forest', accuracy, precision, f1]
scores

**Conclusion**

From the confusion matrices, we observe that the numbers of false positives and false negatives decreased slightly after hyperparameter tuning with Random Search: 
- class 0: 20 -> 19
- class 1: 23 -> 21


In [None]:
# make sure the output folders exist so the cell also works on a fresh checkout
os.makedirs('saved_models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# persist the tuned random forest together with its test-set predictions
joblib.dump(rf_best, 'saved_models/random_forest_model.joblib')
df_pred = pd.DataFrame({'Prediction': y_pred_rf})
df_pred.to_csv("predictions/random_forest_model.csv", index=False)

### 3) K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_balanced, y_train_balanced)

In [None]:
# predictions of the fitted KNN on the test set and on the balanced training set
y_pred = knn.predict(X_test)
y_pred_train = knn.predict(X_train_balanced)

# accuracy
train_accuracy = accuracy_score(y_train_balanced, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

# precision
train_precision = precision_score(y_train_balanced, y_pred_train)
test_precision = precision_score(y_test, y_pred)

# f1 score
train_f1 = f1_score(y_train_balanced, y_pred_train)
test_f1 = f1_score(y_test, y_pred)

# report train vs. test for each metric
print(f'accuracy:')
print(f'train: {train_accuracy}')
print(f'test: {test_accuracy}')

print(f'\nprecision:')
print(f'train: {train_precision}')
print(f'test: {test_precision}')

print(f'\nf1:')
print(f'train : {train_f1}')
print(f'test: {test_f1}')

Our model's accuracy is good, but the F1-score for the abnormal (failure) class is not good enough. Given that we are working on anomaly detection, it is better to focus on the F1-score than on the accuracy over the whole dataset.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# 'minkowski' is left out on purpose: with KNeighborsClassifier's default p=2 it is the
# same distance as 'euclidean', so it only repeated identical candidates in the grid.
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()

# rank the candidates by F1 on the positive (failure) class
f1_scorer = make_scorer(f1_score)

grid_search = GridSearchCV(knn, param_grid, scoring=f1_scorer, cv=5, n_jobs=-1)


grid_search.fit(X_train_balanced, y_train_balanced)

print("Best parameters found: ", grid_search.best_params_)
print("Best F1-score found: ", grid_search.best_score_)

# evaluate the best configuration on the held-out test set
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Final F1-Score: ", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

conf_matrix = confusion_matrix(y_test, y_pred)
disp_forest = ConfusionMatrixDisplay(conf_matrix, display_labels=best_knn.classes_)
disp_forest.plot(cmap='Blues', values_format='d')
plt.show()


In [None]:
scores.loc[len(scores)] = ['KNN', accuracy, precision, f1]
scores

In [None]:
# make sure the output folders exist so the cell also works on a fresh checkout
os.makedirs('saved_models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# persist the tuned KNN together with its test-set predictions
joblib.dump(best_knn, 'saved_models/knn_model.joblib')
df_pred = pd.DataFrame({'Prediction': y_pred})
df_pred.to_csv("predictions/knn_model.csv", index=False)

### 4) Naive Bayes

**Gaussian Naive Bayes**

Our features are numerical, and the distribution of each feature looks roughly like a sample from a Gaussian distribution. For these reasons, we will use $Gaussian Naive Bayes$:

In [None]:
from sklearn.naive_bayes import GaussianNB

# Baseline Gaussian Naive Bayes trained on the balanced training set
gnb = GaussianNB()
gnb.fit(X_train_balanced, y_train_balanced)

# predict once per split and reuse the predictions for every metric
gnb_train_preds = gnb.predict(X_train_balanced)
gnb_preds = gnb.predict(X_test)

# accuracy
train_accuracy_gnb = accuracy_score(y_train_balanced, gnb_train_preds)
test_accuracy_gnb = accuracy_score(y_test, gnb_preds)

# precision
train_precision_gnb = precision_score(y_train_balanced, gnb_train_preds)
test_precision_gnb = precision_score(y_test, gnb_preds)

# f1 score
train_f1_gnb = f1_score(y_train_balanced, gnb_train_preds)
test_f1_gnb = f1_score(y_test, gnb_preds)

print(f'accuracy:')
print(f'  train: {train_accuracy_gnb}')
print(f'  test: {test_accuracy_gnb}')

print(f'\nprecision:')
print(f'  train: {train_precision_gnb}')
print(f'  test: {test_precision_gnb}')

print(f'\nf1:')
print(f'  train: {train_f1_gnb}')
print(f'  test: {test_f1_gnb}')

In [None]:
# classification report
print(classification_report(y_test, gnb_preds))

# confusion matrix
cm = confusion_matrix(y_test, gnb_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=gnb.classes_)
disp.plot()

Although we made sure to remove dependent features, Naive Bayes is still performing badly. This can be due to several reasons:  
* So many outliers, which are among the main goal of this study. One outlier affects the mean and the variance. Thus, all the likelihood probabilities will be affected.  
* The imbalanced data (ground truth) that we had to balance alongside the fact that $Naive Bayes$ makes a strong assumption of independence between the features.
  
To address this issue, we will hyperparameter tune the $GaussianNB$ using the training data.

**Hyperparameterized Gaussian Naive Bayes**

The first parameter we treat is `var_smoothing`. It specifies the portion of the largest variance of all features that is added to the variances for calculation stability.  
The next step is to make our features more or less normally distributed, because real-life data is rarely normal. To do that, we use `PowerTransformer`.  
Last but not least, we cross-validate with a $Stratified K-Fold$ of five splits repeated three times.

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

In [None]:
from sklearn.preprocessing import PowerTransformer

# parameters grid
param_grid_nb = {
    # 100 candidate values from 1e0 down to 1e-9, evenly spaced on a log scale
    'var_smoothing': np.logspace(0, -9, num=100),
}

# initializing the random search
random_search_gnb = RandomizedSearchCV(
    estimator=gnb,
    param_distributions=param_grid_nb,
    n_iter=100,
    cv=cv_method,
    random_state=42,
    scoring='f1'
)

# Fit the power transform on the training data only and reuse it for the test set.
power_transformer = PowerTransformer()
X_train_transformed = power_transformer.fit_transform(X_train_balanced)
X_test_transformed = power_transformer.transform(X_test)

# Tune on the training data: searching on (X_test, y_test) would pick the hyperparameter
# using the very labels the model is evaluated on afterwards (test-set leakage).
random_search_gnb.fit(X_train_transformed, y_train_balanced)

best_params_gnb = random_search_gnb.best_params_
best_score_gnb = random_search_gnb.best_score_

print(f'best Gaussian Naive Bayes parameters: {best_params_gnb}')
print(f'best score (F1): {best_score_gnb}')

Great results in train so far.  
Now, we apply that actually on the test:

In [None]:
# Take the estimator selected by the random search and refit it on the
# (untransformed) balanced training set
best_gnb = random_search_gnb.best_estimator_
best_gnb.fit(X_train_balanced, y_train_balanced)

# predict once per split and reuse the predictions for every metric
best_gnb_train_preds = best_gnb.predict(X_train_balanced)
best_gnb_preds = best_gnb.predict(X_test)

# metrics on the training split
train_accuracy_best_gnb = accuracy_score(y_train_balanced, best_gnb_train_preds)
train_precision_best_gnb = precision_score(y_train_balanced, best_gnb_train_preds)
train_f1_best_gnb = f1_score(y_train_balanced, best_gnb_train_preds)

# metrics on the test split
test_accuracy_best_gnb = accuracy_score(y_test, best_gnb_preds)
test_precision_best_gnb = precision_score(y_test, best_gnb_preds)
test_f1_best_gnb = f1_score(y_test, best_gnb_preds)

print('accuracy:')
print(f'  train: {train_accuracy_best_gnb}')
print(f'  test: {test_accuracy_best_gnb}')

print('\nprecision:')
print(f'  train: {train_precision_best_gnb}')
print(f'  test: {test_precision_best_gnb}')

print('\nf1:')
print(f'  train: {train_f1_best_gnb}')
print(f'  test: {test_f1_best_gnb}')

Same with smoothing Gaussian Naive Bayes.  
One method to handle the data imbalance is to use $ComplementNB$ which takes that into consideration.

**Complement Naive Bayes**

In [None]:
from sklearn.naive_bayes import ComplementNB

# Complement Naive Bayes, trained and evaluated on the min-max scaled features
cnb = ComplementNB(fit_prior=False, norm=False)
cnb.fit(X_train_scaled, y_train_balanced)

# predict once per split and reuse the predictions for every metric
cnb_train_preds = cnb.predict(X_train_scaled)
cnb_preds = cnb.predict(X_test_scaled)

# metrics on the training split
train_accuracy_cnb = accuracy_score(y_train_balanced, cnb_train_preds)
train_precision_cnb = precision_score(y_train_balanced, cnb_train_preds)
train_f1_cnb = f1_score(y_train_balanced, cnb_train_preds)

# metrics on the test split
test_accuracy_cnb = accuracy_score(y_test, cnb_preds)
test_precision_cnb = precision_score(y_test, cnb_preds)
test_f1_cnb = f1_score(y_test, cnb_preds)

print('accuracy:')
print(f'  train: {train_accuracy_cnb}')
print(f'  test: {test_accuracy_cnb}')

print('\nprecision:')
print(f'  train: {train_precision_cnb}')
print(f'  test: {test_precision_cnb}')

print('\nf1:')
print(f'  train: {train_f1_cnb}')
print(f'  test: {test_f1_cnb}')

This one went worse than the previous techniques of Naive Bayes.

In [None]:
# add to scores dataframe
scores.loc[len(scores)] = ['Gaussian Naive Bayes', test_accuracy_gnb, test_precision_gnb, test_f1_gnb]
scores

As noticeable, Gaussian Naive Bayes did perform quite well. Yet, worse than previous classifiers. This is due to the nature of the features, half of them follow a similar to Normal distribution and the rest follow a Bernoulli distribution.

In [None]:
# make sure the output folders exist so the cell also works on a fresh checkout
os.makedirs('saved_models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

# Persist the baseline Gaussian NB (the model whose metrics were added to `scores`)
# together with ITS OWN test-set predictions, so the saved model and CSV agree.
joblib.dump(gnb, 'saved_models/naive_bayes_model.joblib')
df_pred = pd.DataFrame({'Prediction': gnb_preds})
df_pred.to_csv("predictions/naive_bayes_model.csv", index=False)

### 5) Support Vector Machines (SVM)

**Support Vector Classifier**

In [None]:
from sklearn.svm import SVC

# baseline support vector classifier (`degree` is only used by the 'poly' kernel)
svm = SVC(kernel='rbf', degree=3)
svm.fit(X_train_balanced, y_train_balanced)

# SVC prediction is expensive on a large training set: predict once per split
# and reuse the labels for every metric below
svm_train_preds = svm.predict(X_train_balanced)
svm_preds = svm.predict(X_test)

# accuracy
train_accuracy_svm = accuracy_score(y_train_balanced, svm_train_preds)
test_accuracy_svm = accuracy_score(y_test, svm_preds)

# precision
train_precision_svm = precision_score(y_train_balanced, svm_train_preds)
test_precision_svm = precision_score(y_test, svm_preds)

# f1 score
train_f1_svm = f1_score(y_train_balanced, svm_train_preds)
test_f1_svm = f1_score(y_test, svm_preds)

print(f'accuracy:')
print(f'  train: {train_accuracy_svm}')
print(f'  test: {test_accuracy_svm}')

print(f'\nprecision:')
print(f'  train: {train_precision_svm}')
print(f'  test: {test_precision_svm}')

print(f'\nf1:')
print(f'  train: {train_f1_svm}')
print(f'  test: {test_f1_svm}')

**Hyperparameter-Tuned SVC**

In [None]:
# parameters grid
# `degree` is only used by the polynomial kernel, so it is searched for 'poly'
# alone; pairing it with 'rbf' / 'linear' would spend cross-validated fits on
# models that are identical to each other.
param_grid_svc = [
    {'kernel': ['rbf', 'linear']},
    {'kernel': ['poly'], 'degree': [1, 2, 3, 4, 5]},
]

# initializing the random search
# n_iter=7 covers every distinct candidate of the grid above (2 + 5)
random_search_svc = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_grid_svc,
    n_iter=7,
    cv=5,
    random_state=42,
    scoring='f1'
)

# fitting the random search
random_search_svc.fit(X_train_balanced, y_train_balanced)

best_params_svc = random_search_svc.best_params_
best_score_svc = random_search_svc.best_score_

print(f'best SVC parameters: {best_params_svc}')
print(f'best score (F1): {best_score_svc}')

In [None]:
# The search refits the winning configuration on the full training data
# (RandomizedSearchCV's default refit=True), so `best_estimator_` is already
# fitted and does not need a second, costly `fit` call.
best_svc = random_search_svc.best_estimator_

# predict once per split and reuse the labels for every metric below
best_svc_train_preds = best_svc.predict(X_train_balanced)
best_svc_preds = best_svc.predict(X_test)

# accuracy
train_accuracy_best_svc = accuracy_score(y_train_balanced, best_svc_train_preds)
test_accuracy_best_svc = accuracy_score(y_test, best_svc_preds)

# precision
train_precision_best_svc = precision_score(y_train_balanced, best_svc_train_preds)
test_precision_best_svc = precision_score(y_test, best_svc_preds)

# f1 score
train_f1_best_svc = f1_score(y_train_balanced, best_svc_train_preds)
test_f1_best_svc = f1_score(y_test, best_svc_preds)

print(f'accuracy:')
print(f'  train: {train_accuracy_best_svc}')
print(f'  test: {test_accuracy_best_svc}')

print(f'\nprecision:')
print(f'  train: {train_precision_best_svc}')
print(f'  test: {test_precision_best_svc}')

print(f'\nf1:')
print(f'  train: {train_f1_best_svc}')
print(f'  test: {test_f1_best_svc}')

In [None]:
# per-class precision / recall / F1 of the tuned SVC on the test split
svc_report = classification_report(y_test, best_svc_preds)
print(svc_report)

# confusion matrix, drawn with the estimator's class labels on the axes
cm = confusion_matrix(y_test, best_svc_preds)
disp = ConfusionMatrixDisplay(cm, display_labels=best_svc.classes_)
disp.plot()

In [None]:
# append the `*_svm` (not `*_best_svc`) test metrics as a new row of the comparison table
svm_scores_row = ['SVM', test_accuracy_svm, test_precision_svm, test_f1_svm]
scores.loc[len(scores)] = svm_scores_row
scores

In [None]:
# Save the tuned estimator: the predictions written below come from `best_svc`,
# so the model stored under the same name must be `best_svc` as well.
joblib.dump(best_svc, 'saved_models/svm_model.joblib')
df_pred = pd.DataFrame({'Prediction': best_svc_preds})
df_pred.to_csv("predictions/svm_model.csv", index=False)

### 6) Artificial Neural Networks (ANN)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Baseline feed-forward network: two hidden ReLU layers and a sigmoid output unit.
model = Sequential()

# take the input width from the training matrix instead of hard-coding it,
# consistent with create_model() further down in this section
model.add(Dense(128, input_shape=(X_train_balanced.shape[1], ), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import backend as K

# Callback configured to watch `val_loss` with a patience of 10 epochs and to restore the best weights.
# NOTE(review): this callback is not passed to any `fit(...)` call in this section, and the
# `model.fit(...)` call below supplies no validation data for `val_loss` — confirm whether
# early stopping was meant to be used.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Precision metric for Keras
def precision_m(y_true, y_pred):
    """Return true positives divided by predicted positives for the given tensors.

    Both tensors are cast to float32; products and predictions are clipped to
    [0, 1] and rounded before being summed. K.epsilon() in the denominator
    avoids a division by zero when nothing is predicted positive.
    """
    truth = K.cast(y_true, 'float32')
    scores = K.cast(y_pred, 'float32')
    hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
    flagged = K.sum(K.round(K.clip(scores, 0, 1)))
    return hits / (flagged + K.epsilon())

# Recall metric for Keras
def recall_m(y_true, y_pred):
    """Return true positives divided by actual positives for the given tensors.

    Both tensors are cast to float32; products and labels are clipped to
    [0, 1] and rounded before being summed. K.epsilon() in the denominator
    avoids a division by zero when there are no positive labels.
    """
    truth = K.cast(y_true, 'float32')
    scores = K.cast(y_pred, 'float32')
    hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
    actual = K.sum(K.round(K.clip(truth, 0, 1)))
    return hits / (actual + K.epsilon())

# F1 metric for Keras
def f1_m(y_true, y_pred):
    """Return the harmonic mean of precision_m and recall_m on the same tensors.

    K.epsilon() is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [None]:
# compile with accuracy plus the custom precision / recall / F1 metrics, then train
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=['accuracy', precision_m, recall_m, f1_m],
)
# NOTE(review): no callbacks or validation data are passed here, so the
# `early_stopping` callback defined earlier is not used — confirm this is intended.
model.fit(x=X_train_balanced, y=y_train_balanced, epochs=100)

# raw sigmoid outputs for the test split
preds = model.predict(X_test)

In [None]:
# round the sigmoid outputs to hard 0/1 labels
preds_test = np.round(model.predict(X_test))
# sklearn metrics take (y_true, y_pred); keep that order everywhere
accuracy = accuracy_score(y_test, preds_test)
print('the accuracy score for the neural network model in testing is: ', accuracy)

preds_train = np.round(model.predict(X_train_balanced))
accuracy = accuracy_score(y_train_balanced, preds_train)
print('the accuracy score for the neural network model in training is: ', accuracy)

# Evaluate the model
print("Final F1-Score: ", f1_score(y_test, preds_test))
print("Classification Report:\n", classification_report(y_test, preds_test))

In [None]:
from scikeras.wrappers import KerasClassifier

def create_model(optimizer='adam', init='glorot_uniform', activation='relu', neurons=256):
    """Build and compile a binary classifier with two hidden layers.

    Parameters
    ----------
    optimizer : optimizer passed to `compile`.
    init : kernel initializer used by every Dense layer.
    activation : activation of the two hidden layers (the output layer is sigmoid).
    neurons : width of the first hidden layer; the second uses `neurons // 2`.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    The input width is read from the notebook-level `X_train_balanced`.
    """
    n_features = X_train_balanced.shape[1]
    layers = [
        Dense(neurons, input_shape=(n_features,), kernel_initializer=init, activation=activation),
        Dense(neurons // 2, kernel_initializer=init, activation=activation),
        Dense(1, kernel_initializer=init, activation='sigmoid'),
    ]
    net = Sequential(layers)
    net.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return net


In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space for the network: candidate values per hyper-parameter name
param_grid = dict(
    optimizer=['adam', 'sgd'],
    init=['glorot_uniform', 'normal', 'uniform'],
    activation=['relu', 'tanh'],
    neurons=[64, 128, 256],
    batch_size=[16, 32, 64],
    epochs=[50, 100, 150],
)


In [None]:
# scikit-learn compatible wrapper around create_model; the keyword arguments are
# the starting values of the hyper-parameters that the search overrides
model = KerasClassifier(model=create_model, verbose=0, neurons=256, init='glorot_uniform', activation='relu')

# random_state pins which n_iter candidates are drawn from param_grid, as in the
# seeded SVC search.
# NOTE(review): network weight initialisation is still unseeded in this cell, so
# scores may vary between runs — confirm whether a global seed is set elsewhere.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

random_search_result = random_search.fit(X_train_balanced, y_train_balanced)

print("Best parameters found: ", random_search_result.best_params_)
print("Best F1-score found: ", random_search_result.best_score_)

# best configuration found by the search; predict the test split with it
best_model = random_search_result.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Score the tuned network's test predictions with each metric in turn
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in zip(
    ("Accuracy:", "Precision:", "Recall:", "F1-Score:"),
    (accuracy, precision, recall, f1),
):
    print(metric_label, metric_value)
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# predictions of the tuned network on the (balanced) training split
preds_train = np.round(best_model.predict(X_train_balanced))

print('Evaluation of the model on training data:')
# same four metrics as for the test split, now against the training labels
accuracy, precision, recall, f1 = (
    scorer(y_train_balanced, preds_train)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_label, metric_value in zip(
    ("Accuracy:", "Precision:", "Recall:", "F1-Score:"),
    (accuracy, precision, recall, f1),
):
    print(metric_label, metric_value)

In [None]:
# persist the Keras model held by the tuned wrapper
best_model.model_.save('saved_models/ann_model.keras')

# one-column frame of predicted test labels, written without the index
df_pred = pd.Series(y_pred, name='Prediction').to_frame()
df_pred.to_csv(path_or_buf="predictions/ann_model.csv", index=False)

# 4. Comparative Analysis

In [None]:
saved_models_dir = 'saved_models'
saved_predictions_dir = 'predictions'

# file extensions recognised as saved models
model_extensions = ('.joblib', '.keras')

# Map every saved model's name (file name without extension) to a placeholder.
# The model files themselves are not loaded here; only their names are collected.
# sorted() keeps the model order reproducible, since os.listdir() returns
# directory entries in arbitrary order.
models = {}

for filename in sorted(os.listdir(saved_models_dir)):

    if not filename.endswith(model_extensions):
        # Skip if the file is not a supported model type
        continue

    model_name = os.path.splitext(filename)[0]

    # empty-string placeholder, kept as the value for every model name
    models[model_name] = ""


In [None]:
def evaluate_model(y_test, model_name, predictions_dir=None):
    """Score a model's saved predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels the saved predictions are compared with.
    model_name : str
        Base name of the predictions file; `<model_name>.csv` is read and its
        'Prediction' column is used.
    predictions_dir : str, optional
        Directory that holds the prediction CSV files. Defaults to the
        notebook-level `saved_predictions_dir`.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1, confusion matrix, predictions)

    Raises
    ------
    FileNotFoundError
        If no predictions file exists for `model_name`. Without this check the
        function would go on to use an undefined `preds` variable.
    """
    if predictions_dir is None:
        predictions_dir = saved_predictions_dir

    saved_predictions_path = os.path.join(predictions_dir, model_name + '.csv')

    if not os.path.exists(saved_predictions_path):
        raise FileNotFoundError(
            f"no saved predictions for model '{model_name}' at {saved_predictions_path}"
        )

    preds = pd.read_csv(saved_predictions_path)['Prediction']

    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)

    return accuracy, precision, recall, f1, cm, preds

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Score every saved model from its stored predictions: one record per model.
results = []
for name in models:
    # evaluate_model takes (y_test, model_name); it reads the predictions from
    # disk, so neither the model object nor X_test is passed
    accuracy, precision, recall, f1, cm, preds = evaluate_model(y_test, name)
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': cm,
        'predictions': preds
    })

# build the frame once from the list of records
results_df = pd.DataFrame(results)

In [None]:
# One bar chart per metric, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 16))

axes = axes.flatten()

metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
best_models = {}
worst_models = {}

for i, metric in enumerate(metrics):
    sns.barplot(x='Model', y=metric, data=results_df, palette='viridis', ax=axes[i])
    axes[i].set_title(f'Model Comparison - {metric}')
    axes[i].set_xlabel('Model')
    axes[i].set_ylabel(metric)

    # Find best and worst model for each metric.
    # The names are stored under `best_name` / `worst_name` so that the fitted
    # estimator bound to `best_model` earlier in the notebook is not overwritten.
    best_name = results_df.loc[results_df[metric].idxmax(), 'Model']
    worst_name = results_df.loc[results_df[metric].idxmin(), 'Model']
    best_models[metric] = best_name
    worst_models[metric] = worst_name

# rotate the model names so long labels do not overlap
for ax in axes:
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Adjust layout
plt.tight_layout()
plt.show()

# Print best and worst models for each metric
for metric, best_name in best_models.items():
    print(f'Best model for {metric}: {best_name}')

for metric, worst_name in worst_models.items():
    print(f'Worst model for {metric}: {worst_name}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# False-positive and false-negative rate per model, from the stored confusion matrices.
fpr_list = []
fnr_list = []
# kept under its own name so the `models` dict built earlier is not replaced by a list
model_names = []

for result in results:
    # for a 2x2 matrix ravel() yields TN, FP, FN, TP in that order
    TN, FP, FN, TP = result['Confusion Matrix'].ravel()

    fpr_list.append(FP / (FP + TN))
    fnr_list.append(FN / (FN + TP))
    model_names.append(result['Model'])

# Create bar charts for FPR and FNR
fig, rate_axes = plt.subplots(1, 2, figsize=(14, 6))

# Bar chart for FPR
sns.barplot(x=model_names, y=fpr_list, ax=rate_axes[0], palette='Blues_d')
rate_axes[0].set_title('False Positive Rate (FPR)')
rate_axes[0].set_ylabel('Rate')
rate_axes[0].set_xlabel('Model')

# Bar chart for FNR
sns.barplot(x=model_names, y=fnr_list, ax=rate_axes[1], palette='Blues_d')
rate_axes[1].set_title('False Negative Rate (FNR)')
rate_axes[1].set_ylabel('Rate')
rate_axes[1].set_xlabel('Model')

# a distinct loop variable avoids rebinding the axes array while iterating over it
for rate_ax in rate_axes:
    plt.setp(rate_ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()


In [None]:
# Value assigned to each confusion-matrix outcome
profit_tp = 100
profit_tn = 0
loss_fp = 10
loss_fn = 700

# Net value per model: gains from TP / TN minus costs of FP / FN
profits_losses = [
    (tp * profit_tp + tn * profit_tn) - (fp * loss_fp + fn * loss_fn)
    for tn, fp, fn, tp in (
        matrix.ravel() for matrix in results_df['Confusion Matrix']
    )
]

results_df['Profit_Loss'] = profits_losses

# bar chart of the net value per model
plt.figure(figsize=(12, 8))
sns.barplot(data=results_df, x='Model', y='Profit_Loss', palette='viridis')
plt.xlabel('Model')
plt.ylabel('Profit and Loss')
plt.title('Model Comparison - Profit and Loss')
plt.xticks(rotation=45)
plt.show()

# Rows with the highest and lowest net value
profit_column = results_df['Profit_Loss']
best_profit_model = results_df.loc[profit_column.idxmax()]
worst_profit_model = results_df.loc[profit_column.idxmin()]
print(f"Best Profit Model: {best_profit_model['Model']} with Profit and Loss: {best_profit_model['Profit_Loss']}")
print(f"Worst Profit Model: {worst_profit_model['Model']} with Profit and Loss: {worst_profit_model['Profit_Loss']}")
