In [1]:
# Relevant libraries
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') 
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_auc_score, roc_curve

In [2]:
# Read the four preprocessed CSV splits (scaled features and targets) in one pass.
x_train_scaled, x_test_scaled, y_train, y_test = (
    pd.read_csv('Preprocessed Data/' + csv_name)
    for csv_name in ('X_train_scaled.csv', 'X_test_scaled.csv', 'y_train.csv', 'y_test.csv')
)

# Print each frame's (rows, columns) shape, one per line.
print(
    x_train_scaled.shape,
    x_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)


(57510, 16)
(10642, 16)
(57510, 1)
(10642, 1)


# Load the Preprocessed Data

In [4]:
# To avail the classes notebook
%run Phase_3_Project_Classes.ipynb 

# Path to the folder where the data is saved
output_path = "Preprocessed Data/"

# Load the datasets needed for modeling
X_train_scaled = pd.read_csv(output_path + 'X_train_scaled.csv')
X_test_scaled = pd.read_csv(output_path + 'X_test_scaled.csv')
y_train = pd.read_csv(output_path + 'y_train.csv')
y_test = pd.read_csv(output_path + 'y_test.csv')

# Verify that the data has been loaded correctly
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")



X_train_scaled shape: (57510, 16)
X_test_scaled shape: (10642, 16)
y_train shape: (57510, 1)
y_test shape: (10642, 1)


In [5]:
# Tally how many training rows fall into each target class
class_counts = y_train.value_counts()
print(class_counts)


Status_Group
0.0             28755
1.0             28755
Name: count, dtype: int64


# Modelling

## Model 1: Logistic Regression: Baseline Model
The Logistic Regression model is implemented as a baseline to establish a point of comparison for evaluating other models. It uses the default configuration with standard hyperparameters. It serves as a starting point without applying advanced tuning or feature engineering, providing a benchmark for model performance.

In [8]:
# Build the trainer from the scaled feature matrices and target frames loaded above.
# ModelTrainer is not defined in this notebook; it is expected to come from the
# %run of Phase_3_Project_Classes.ipynb.
trainer = ModelTrainer(X_train_scaled, y_train, X_test_scaled, y_test)


# Fit the baseline logistic regression through the trainer and keep what it returns
# (assumed to be the fitted estimator - TODO confirm in the classes notebook).
logistic_model = trainer.logistic_regression()

## Model 2: Cross-validated Untuned Decision Tree Classifier
The untuned Decision Tree Classifier is trained using the training dataset without any hyperparameter optimization.
The model is cross-validated during training to evaluate its performance on multiple subsets of the data. This serves as a foundation for the later comparison with a tuned decision tree classifier.

In [10]:
# Fit the decision tree with no hyperparameter search, via the shared trainer.
# The returned object is kept for the evaluation section below
# (assumed to be the fitted estimator - TODO confirm in the classes notebook).
untuned_dt_model = trainer.untuned_decision_tree()

## Model 3: Tuned and Cross-Validated Decision Tree Classifier
This model leverages hyperparameter tuning and cross-validation to optimize the Decision Tree Classifier's performance. It uses GridSearchCV to systematically explore combinations of hyperparameters.

In [None]:
# Run the hyperparameter search for the decision tree via the shared trainer.
# NOTE: this is the expensive cell - the captured output reports 5 folds over
# 2304 candidates (11520 fits). Keep the result and avoid re-running it.
tuned_dt_model = trainer.tuned_decision_tree()

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


# Model Evaluations

## Model 1: Logistic Regression

In [None]:
# Build the evaluator from the same train/test splits that were handed to the trainer.
evaluator = ModelEvaluator(X_train_scaled, y_train, X_test_scaled, y_test)

# Evaluate the logistic regression that was already fitted in the "Model 1"
# training cell. It is reused here rather than calling
# trainer.logistic_regression() a second time, so the model being scored is
# exactly the one that was trained above.
logistic_results = evaluator.evaluate(logistic_model)

# Print evaluation results
print("Logistic Regression Evaluation:")
print(logistic_results)

## Model 2: Cross-validated Untuned Decision Tree Classifier

In [None]:
# Evaluate the untuned decision tree that was already fitted in the "Model 2"
# training cell; refitting it here would only repeat work and could score a
# different tree from the one trained above.
untuned_dt_results = evaluator.evaluate(untuned_dt_model)

# Print evaluation results
print("\nUntuned Decision Tree Evaluation:")
print(untuned_dt_results)



## Model 3: Tuned and Cross-Validated Decision Tree Classifier

In [None]:
# Evaluate the tuned decision tree produced by the "Model 3" training cell.
# The grid search behind it is expensive (the training cell's output reports
# 11520 fits), so the fitted result is reused instead of launching the whole
# search a second time just to score it.
tuned_dt_results = evaluator.evaluate(tuned_dt_model)

# Print evaluation results
print("\nTuned Decision Tree Evaluation:")
print(tuned_dt_results)


## Choosing the best Model

In [None]:
# Compare the three models on the held-out test split with ROC curves.
#
# The models plotted are the objects produced by the training cells above
# (logistic_model, untuned_dt_model, tuned_dt_model). The previous version of
# this cell referenced Baseline_Model, decision_tree and best_dt_model, none of
# which are defined anywhere in this notebook, so it failed with NameError on a
# fresh kernel.
# NOTE(review): this assumes each trainer method returns a fitted estimator that
# exposes predict_proba - confirm in Phase_3_Project_Classes.ipynb.

# y_test is loaded as a one-column DataFrame; flatten it to the 1-D label
# vector that roc_curve expects.
y_test_labels = y_test.to_numpy().ravel()

# ROC-AUC Curve for Logistic Regression (Model 1)
fpr_lr, tpr_lr, _ = roc_curve(y_test_labels, logistic_model.predict_proba(X_test_scaled)[:, 1])
roc_auc_lr = auc(fpr_lr, tpr_lr)

# ROC-AUC Curve for Decision Tree (Untuned Model 2)
fpr_dt, tpr_dt, _ = roc_curve(y_test_labels, untuned_dt_model.predict_proba(X_test_scaled)[:, 1])
roc_auc_dt = auc(fpr_dt, tpr_dt)

# ROC-AUC Curve for Tuned Decision Tree (Model 3)
fpr_tuned, tpr_tuned, _ = roc_curve(y_test_labels, tuned_dt_model.predict_proba(X_test_scaled)[:, 1])
roc_auc_tuned = auc(fpr_tuned, tpr_tuned)

# Plot all ROC curves on the same graph
plt.figure(figsize=(10, 8))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {roc_auc_lr:.2f})', color='blue')
plt.plot(fpr_dt, tpr_dt, label=f'Untuned Decision Tree (AUC = {roc_auc_dt:.2f})', color='red')
plt.plot(fpr_tuned, tpr_tuned, label=f'Tuned Decision Tree (AUC = {roc_auc_tuned:.2f})', color='green')
# Diagonal = performance of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random Classifier')

# Add plot details
plt.title('Comparison of ROC Curves for All Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.show()


Model 3 (Tuned Decision Tree) is the best model because of its smooth curve and its proximity to the top-left corner, indicating it has the highest overall classification performance.

Model 2 (Untuned Decision Tree) is better than Model 1 (Logistic Regression), but still underperforms compared to the tuned decision tree: its curve has sharp turns and it shows signs of overfitting.

Model 1 (Logistic Regression) is the least effective of the three, but still provides a stable performance, though it's not as well-suited for this problem as the tuned decision tree.

The Tuned Decision Tree (Model 3) should be chosen as the best model based on its ROC curve performance.


## Conclusions

#### 1. Model Performance:
Model 1 (Logistic Regression): Achieved moderate accuracy but struggled with false positives and false negatives. The absence of test_labels for validation hindered model performance assessment.

Model 2 (Untuned Decision Tree): Overfitted on the training data and performed poorly on the test set. test_labels were missing, affecting performance evaluation.

Model 3 (Tuned Decision Tree): Showed improvement in generalization, but again, the lack of test_labels prevented a proper performance evaluation.

#### 2. Class Imbalance:
Class imbalance was effectively managed through a combination of SMOTE and class weights, but the missing test_labels compromised the ability to assess model performance effectively.

#### 3. Test Labels Issue:
The lack of test_labels for the test set prevented reliable evaluation of the models. This needs to be addressed to ensure accurate model assessment moving forward.




## Recommendations

#### 1. Feature Engineering:
Continue with feature engineering, but ensure test_labels are available for evaluating how the new features impact model performance.

#### 2. Additional Data Collection:
Make sure test_labels are included in future data sets, as they are essential for accurate model evaluation and performance tracking.

#### 3. Model Enhancements:
Advanced models like Random Forests or Gradient Boosting Machines should be tested, but ensure proper validation using test_labels.

#### 4. Evaluation Improvements:
Ensure that test_labels are available for all test sets to compute key performance metrics and make informed decisions about model improvements.



## Next Steps

- Deployment: Prepare to deploy the tuned decision tree model once test_labels are included in the testing phase for accurate performance evaluation.
- Iterative Data Collection and Testing: Update datasets with test_labels to ensure proper validation and further model tuning.
- Future Exploration: Explore ensemble methods and ensure test_labels are always available for robust evaluation of model performance.
