
<div style="background-color: #3b8d99; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
 Libraries
</div>



In [1]:
# pandas: For loading, manipulating, and analyzing data in tabular form.
import pandas as pd

# matplotlib.pyplot: For creating static, interactive, and dynamic visualizations.
import matplotlib.pyplot as plt

# seaborn: A statistical data visualization library built on top of matplotlib, offering easier and more visually appealing plots.
import seaborn as sns

# numpy : For numerical operations if needed for tasks like array computations or mathematical functions.
import numpy as np

import warnings
warnings.filterwarnings('ignore')

<div style="background-color: #3b8d99; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
 Load dataset
</div>

In [3]:
# Load the raw CSV into a DataFrame and preview the first rows.
# The path is relative, so the file must be in the notebook's working directory.
df=pd.read_csv('Student Depression Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33,Visakhapatnam,Student,5,0,8.97,2,0,5-6 hours,Healthy,B.Pharm,Yes,3,1.0,No,1
1,8,Female,24,Bangalore,Student,2,0,5.9,5,0,5-6 hours,Moderate,BSc,No,3,2.0,Yes,0
2,26,Male,31,Srinagar,Student,3,0,7.03,5,0,Less than 5 hours,Healthy,BA,No,9,1.0,Yes,0
3,30,Female,28,Varanasi,Student,3,0,5.59,2,0,7-8 hours,Moderate,BCA,Yes,4,5.0,Yes,1
4,32,Female,25,Jaipur,Student,4,0,8.13,3,0,5-6 hours,Moderate,M.Tech,Yes,1,1.0,No,0


In [5]:
# Summarize column names, non-null counts, and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  int64  
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  int64  
 6   Work Pressure                          27901 non-null  int64  
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  int64  
 9   Job Satisfaction                       27901 non-null  int64  
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [7]:
# (number of rows, number of columns)
df.shape

(27901, 18)

In [9]:
# Count missing (NaN) values per column
df.isnull().sum()

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [7]:
# Replace the missing (NaN) Financial Stress entries with the column mean.
# NOTE(review): the mean is taken over the full dataset, before the train/test split.
financial_stress_mean = df['Financial Stress'].mean()
df['Financial Stress'] = df['Financial Stress'].fillna(financial_stress_mean)

In [9]:
# Drop the row identifier: 'id' is a per-record key, not a predictive feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent, so re-running it does not raise a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [11]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [13]:
# List the column names currently in the frame
df.columns

Index(['Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [15]:
# Number of records per Gender category
df['Gender'].value_counts()

Gender
Male      15547
Female    12354
Name: count, dtype: int64

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0
mean,25.8223,3.141214,0.00043,7.656104,2.943837,0.000681,7.156984,3.139867,0.585499
std,4.905687,1.381465,0.043992,1.470707,1.361148,0.044394,3.707642,1.437269,0.492645
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,21.0,2.0,0.0,6.29,2.0,0.0,4.0,2.0,0.0
50%,25.0,3.0,0.0,7.77,3.0,0.0,8.0,3.0,1.0
75%,30.0,4.0,0.0,8.92,4.0,0.0,10.0,4.0,1.0
max,59.0,5.0,5.0,10.0,5.0,4.0,12.0,5.0,1.0


<div style="background-color: #3b8d99; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
 Data Preprocessing
    
</div>

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor,BaggingClassifier,BaggingRegressor,GradientBoostingClassifier,GradientBoostingRegressor,StackingClassifier,StackingRegressor
from sklearn.metrics import r2_score, mean_absolute_error,classification_report, accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [21]:
# Target: the binary Depression label; features: every other column
y = df['Depression']
x = df.drop(columns='Depression')

In [23]:
# Two-stage split with a fixed seed: hold out 20% as the test set, then carve
# 20% of the remainder off as the validation set.
split_options = dict(test_size=0.2, random_state=2025)
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, **split_options)
x_train, x_val, y_train, y_val = train_test_split(x_train_full, y_train_full, **split_options)

<div style="background-color: #3b8d99; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
 Model Selection
    
</div>

In [25]:
# Column groups that drive the ColumnTransformer.
# 'number' selects every numeric dtype regardless of bit width; the bare 'int'
# alias resolves to the platform default integer and can miss int64 columns
# on some platforms.
numerical_col = x_train.select_dtypes(include='number').columns
categorical_col = x_train.select_dtypes(include=['object', 'category']).columns

## Scaling And Imputation


- Scaling standardizes numerical features to ensure they are on the same scale.
- Encoding transforms categorical variables into numerical format (via one-hot encoding).

- Mean Imputation (Numerical): Suitable for continuous data like Financial Stress.
- Mode Imputation (Categorical): Handles categorical columns effectively, as missing values are replaced with the most frequent category.

In [27]:
# Numerical pipeline: impute missing values, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # 'median' or 'most_frequent' are alternative strategies
    ('scaler', StandardScaler())  # RobustScaler or MinMaxScaler are drop-in alternatives
])


# Categorical pipeline: impute with the most frequent category, then one-hot encode.
# The encoder step keeps its original name 'scaler' so existing references to
# the step name continue to work.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # OrdinalEncoder is the feature-side alternative; LabelEncoder is intended for target labels
    ('scaler', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

## Combine into ColumnTransformer

In [29]:
# Route each column group through its own pipeline
column_transformers = [
    ('num', numerical_pipeline, numerical_col),
    ('cat', categorical_pipeline, categorical_col),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

<div style="background-color: #4CAF50; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
  Logistic Regression
</div>

In [52]:
# Logistic Regression baseline: shared preprocessing followed by a default-configured classifier
lr_steps = [
    ('preprocessing', preprocessor),
    ('modeling', LogisticRegression()),
]
lr_pipeline = Pipeline(steps=lr_steps)

# Fit on the training split, then predict the training and validation splits
lr_pipeline.fit(x_train, y_train)
train_predictions = lr_pipeline.predict(x_train)
val_predictions = lr_pipeline.predict(x_val)

print('**Logistic Regression Evaluation**')

# Training-split metrics. R² is computed on the predicted class labels;
# these variable names are reused by the model-comparison table.
r2_lr_train, accuracy_lr_train, f1_lr_train = (
    r2_score(y_train, train_predictions),
    accuracy_score(y_train, train_predictions),
    f1_score(y_train, train_predictions),
)
train_report = classification_report(y_train, train_predictions)

print('r2_lr_train=', r2_lr_train)
print('accuracy_lr_train =', accuracy_lr_train)
print('f1_lr_train =', f1_lr_train)
print('Train classification report\n', train_report)

# Validation-split metrics
r2_lr_val, accuracy_lr_val, f1_lr_val = (
    r2_score(y_val, val_predictions),
    accuracy_score(y_val, val_predictions),
    f1_score(y_val, val_predictions),
)
val_report = classification_report(y_val, val_predictions)

print('r2_lr_val=', r2_lr_val)
print('accuracy_lr_val =', accuracy_lr_val)
print('f1_lr_val =', f1_lr_val)
print('Validation classification report\n', val_report)

**Logistic Regression Evaluation**
r2_lr_train= 0.39755411069388613
accuracy_lr_train = 0.8536626344086021
f1_lr_train = 0.8766696559210837
Train classification report
               precision    recall  f1-score   support

           0       0.84      0.80      0.82      7424
           1       0.86      0.89      0.88     10432

    accuracy                           0.85     17856
   macro avg       0.85      0.85      0.85     17856
weighted avg       0.85      0.85      0.85     17856

r2_lr_val= 0.3599108909245067
accuracy_lr_val = 0.8454301075268817
f1_lr_val = 0.8718900854066097
Validation classification report
               precision    recall  f1-score   support

           0       0.83      0.78      0.81      1820
           1       0.86      0.89      0.87      2644

    accuracy                           0.85      4464
   macro avg       0.84      0.84      0.84      4464
weighted avg       0.84      0.85      0.84      4464



## Logistic Regression Interpretation

- The logistic regression model demonstrates a good balance between precision and recall, with an overall accuracy of around **85%** on both the training and validation sets. The R² score (about **0.40** on training and **0.36** on validation) indicates that there may be room for improvement in capturing the variance in the data.
- If we change the `test_size` parameter to **0.3**, the R² score may increase, potentially approaching **0.4**. However, increasing the test size reduces the training data available to the model, which can influence both the R² score and the classification metrics. Overall, the classification metrics indicate that the model performs well in distinguishing between the two classes.

<div style="background-color: #4CAF50; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
  KNN
</div>

In [72]:
# KNN baseline: shared preprocessing followed by a default-configured KNeighborsClassifier
knn_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modeling', KNeighborsClassifier()),
])

# Fit on the training split, then predict the training and validation splits
knn_pipeline.fit(x_train, y_train)
train_predictions = knn_pipeline.predict(x_train)
val_predictions = knn_pipeline.predict(x_val)

print('**KNN Evaluation**')

# Training-split metrics (names are reused by the model-comparison table)
r2_knn_train = r2_score(y_train, train_predictions)
accuracy_knn_train = accuracy_score(y_train, train_predictions)
f1_knn_train = f1_score(y_train, train_predictions)

for label, value in (
    ('r2_knn_train=', r2_knn_train),
    ('accuracy_knn_train =', accuracy_knn_train),
    ('f1_knn_train =', f1_knn_train),
    ('Train classification report\n', classification_report(y_train, train_predictions)),
):
    print(label, value)

# Validation-split metrics
r2_knn_val = r2_score(y_val, val_predictions)
accuracy_knn_val = accuracy_score(y_val, val_predictions)
f1_knn_val = f1_score(y_val, val_predictions)

for label, value in (
    ('r2_knn_val=', r2_knn_val),
    ('accuracy_knn_val =', accuracy_knn_val),
    ('f1_knn_val =', f1_knn_val),
    ('Validation classification report\n', classification_report(y_val, val_predictions)),
):
    print(label, value)

**KNN Evaluation**
r2_knn_train= 0.4711018484239474
accuracy_knn_train = 0.8715277777777778
f1_knn_train = 0.8931532370749884
Train classification report
               precision    recall  f1-score   support

           0       0.88      0.80      0.84      7424
           1       0.87      0.92      0.89     10432

    accuracy                           0.87     17856
   macro avg       0.87      0.86      0.87     17856
weighted avg       0.87      0.87      0.87     17856

r2_knn_val= 0.1994247809678974
accuracy_knn_val = 0.8066756272401434
f1_knn_val = 0.8422014993600293
Validation classification report
               precision    recall  f1-score   support

           0       0.79      0.71      0.75      1820
           1       0.82      0.87      0.84      2644

    accuracy                           0.81      4464
   macro avg       0.80      0.79      0.80      4464
weighted avg       0.81      0.81      0.80      4464



## KNN Interpretation

### Fitting Condition
The KNN model demonstrates a reasonable fit with an **R² Score** of **0.4711** on the training set. This indicates that approximately **47.11%** of the variance in the dependent variable can be explained by the independent variables. There may be room for improvement in capturing the variance in the data.

### R² Score
The R² score on the validation set is only **0.1994**, well below the training score. This gap indicates that the model does not generalize consistently and may not fully capture the underlying patterns in the data.

### Improving Results
To improve the model's performance, consider the following strategies:
- **Hyperparameter Tuning**: Experiment with different values for the number of neighbors (k) and distance metrics.
- **Feature Scaling**: Normalize or standardize the feature set to improve distance calculations.

Overall, the classification metrics indicate that the model performs reasonably well in distinguishing between the two classes.

# Hyperparameter Tuning

In [81]:
# Fresh KNN pipeline to be tuned by grid search
knn_steps = [
    ('preprocessing', preprocessor),
    ('modeling', KNeighborsClassifier()),
]
knn_pipeline = Pipeline(steps=knn_steps)

# Candidate hyperparameters; the 'modeling__' prefix targets the classifier step
param_grid = dict(
    modeling__n_neighbors=[3, 5, 7, 9, 11],
    modeling__metric=['euclidean', 'manhattan'],
)

# cv=5: 5-fold cross-validation on the training split (train on 4 folds, score on the 5th, rotating).
# n_jobs=-1: run the fits in parallel on all available CPU cores.
grid_search = GridSearchCV(estimator=knn_pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Report the winning configuration and its mean cross-validation score
print('best parameters:', grid_search.best_params_)
print('best cross-validation:', grid_search.best_score_)

# Evaluate the best pipeline found by the search on both splits
best_knn_model = grid_search.best_estimator_
train_predictions = best_knn_model.predict(x_train)
val_predictions = best_knn_model.predict(x_val)

print('**KNN Evaluation**')

# Training-split metrics (names are reused by the model-comparison table)
r2_t_knn_train, accuracy_t_knn_train, f1_t_knn_train = (
    r2_score(y_train, train_predictions),
    accuracy_score(y_train, train_predictions),
    f1_score(y_train, train_predictions),
)

print('r2_t_knn_train=', r2_t_knn_train)
print('accuracy_t_knn_train =', accuracy_t_knn_train)
print('f1_t_knn_train =', f1_t_knn_train)
print('Train classification report\n', classification_report(y_train, train_predictions))

# Validation-split metrics
r2_t_knn_val, accuracy_t_knn_val, f1_t_knn_val = (
    r2_score(y_val, val_predictions),
    accuracy_score(y_val, val_predictions),
    f1_score(y_val, val_predictions),
)

print('r2_t_knn_val=', r2_t_knn_val)
print('accuracy_t_knn_val =', accuracy_t_knn_val)
print('f1_t_knn_val =', f1_t_knn_val)
print('Validation classification report\n', classification_report(y_val, val_predictions))

best parameters: {'modeling__metric': 'manhattan', 'modeling__n_neighbors': 11}
best cross-validation: 0.8313729047261708
**KNN Evaluation**
r2_t_knn_train= 0.4106958694732388
accuracy_t_knn_train = 0.8568548387096774
f1_t_knn_train = 0.8823637702503682
Train classification report
               precision    recall  f1-score   support

           0       0.87      0.77      0.82      7424
           1       0.85      0.92      0.88     10432

    accuracy                           0.86     17856
   macro avg       0.86      0.84      0.85     17856
weighted avg       0.86      0.86      0.86     17856

r2_t_knn_val= 0.2587953649980881
accuracy_t_knn_val = 0.8210125448028673
f1_t_knn_val = 0.855854230561068
Validation classification report
               precision    recall  f1-score   support

           0       0.83      0.71      0.76      1820
           1       0.82      0.90      0.86      2644

    accuracy                           0.82      4464
   macro avg       0.82      0.8

## **KNN Interpretation After Tuning**

#### **Fitting Condition:**
- The KNN model shows a slightly reduced fit after tuning, with an R² Score of **0.4107** on the training set, explaining approximately **41.07%** of the variance. This suggests that the tuning parameters resulted in a slight decrease in variance capture.

#### **R² Score:**
- The validation R² Score is **0.2588**, up from **0.1994** before tuning. The smaller gap between the training and validation scores indicates that the tuned model generalizes somewhat better than the pre-tuning model.

#### **Hyperparameter Tuning Outcome:**
- The hyperparameter tuning did not yield a significant improvement in performance. This suggests that KNN may not be the most suitable model for this dataset.

#### **Next Steps:**
- Consider trying another model, such as Decision Trees, Ensemble Methods, or Boosting techniques, to explore better performance and generalization.



<div style="background-color: #4CAF50; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
  Decision Tree
</div>

In [83]:
# Decision Tree baseline: shared preprocessing followed by an otherwise-default tree.
# random_state is fixed so the fitted tree, and therefore the reported metrics,
# are reproducible across runs.
DT_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modeling', DecisionTreeClassifier(random_state=2025))
])

# Fit on the training split, then predict the training and validation splits
DT_pipeline.fit(x_train, y_train)
train_predictions = DT_pipeline.predict(x_train)
val_predictions = DT_pipeline.predict(x_val)

print('*DT Evaluation*')

# Training-split metrics (names are reused by the model-comparison table)
r2_dt_train = r2_score(y_train, train_predictions)
accuracy_dt_train = accuracy_score(y_train, train_predictions)
f1_dt_train = f1_score(y_train, train_predictions)

print('r2_dt_train=', r2_dt_train)
print('accuracy_dt_train =', accuracy_dt_train)
print('f1_dt_train =', f1_dt_train)
print('Train classification report\n', classification_report(y_train, train_predictions))

# Validation-split metrics
r2_dt_val = r2_score(y_val, val_predictions)
accuracy_dt_val = accuracy_score(y_val, val_predictions)
f1_dt_val = f1_score(y_val, val_predictions)

print('r2_dt_val=', r2_dt_val)
print('accuracy_dt_val =', accuracy_dt_val)
print('f1_dt_val =', f1_dt_val)
print('Validation classification report\n', classification_report(y_val, val_predictions))

*DT Evaluation*
r2_dt_train= 1.0
accuracy_dt_train = 1.0
f1_dt_train = 1.0
Train classification report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7424
           1       1.00      1.00      1.00     10432

    accuracy                           1.00     17856
   macro avg       1.00      1.00      1.00     17856
weighted avg       1.00      1.00      1.00     17856

r2_dt_val= 0.035228009509401303
accuracy_dt_val = 0.7670250896057348
f1_dt_val = 0.8036253776435045
Validation classification report
               precision    recall  f1-score   support

           0       0.72      0.71      0.71      1820
           1       0.80      0.80      0.80      2644

    accuracy                           0.77      4464
   macro avg       0.76      0.76      0.76      4464
weighted avg       0.77      0.77      0.77      4464



## Interpretation

### Fitting Condition
The Decision Tree Classifier exhibits a **perfect fit** on the training set with an **R² Score** of **1.0**, explaining **100%** of the variance. However, the **R² Score** on the validation set is only **0.0352**, indicating that the model is **overfitting**. This suggests that while the model captures the training data perfectly, it fails to generalize to unseen data.

### Classification Report
- **Training Set**: Precision, Recall, and F1-Score are all **1.00** for both classes, reflecting perfect classification.
- **Validation Set**: Precision is **0.72** for class 0 and **0.80** for class 1, with Recall at **0.71** and **0.80** respectively, showing some missed cases.

### Improving Results
To enhance the model's performance and address overfitting, consider the following strategies:
- **Hyperparameter Tuning**: Adjust parameters such as `max_depth` and `min_samples_split` to find a better balance between bias and variance and reduce overfitting.

# Hyperparameter Tuning

In [85]:
# Fresh Decision Tree pipeline to be tuned by grid search.
# random_state is fixed so the search and the refit best tree are reproducible.
DT_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modeling', DecisionTreeClassifier(random_state=2025))
])

# Candidate hyperparameters that limit tree growth; the 'modeling__' prefix
# targets the classifier step of the pipeline
param_grid = {
    'modeling__max_depth': [None, 5, 10, 15],
    'modeling__min_samples_split': [2, 5, 10],
    'modeling__min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search on the training split, parallelized over all cores
grid_search = GridSearchCV(DT_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Report the winning configuration and its mean cross-validation score
print('Best parameters:', grid_search.best_params_)
print('Best cross-validation score:', grid_search.best_score_)

# Evaluate the best pipeline found by the search on both splits
best_DT_model = grid_search.best_estimator_
train_predictions = best_DT_model.predict(x_train)
val_predictions = best_DT_model.predict(x_val)

print('*DT Evaluation*')

# Training-split metrics (names are reused by the model-comparison table)
r2_t_dt_train = r2_score(y_train, train_predictions)
accuracy_t_dt_train = accuracy_score(y_train, train_predictions)
f1_t_dt_train = f1_score(y_train, train_predictions)

print('r2_t_dt_train=', r2_t_dt_train)
print('accuracy_t_dt_train =', accuracy_t_dt_train)
print('f1_t_dt_train =', f1_t_dt_train)
print('Train classification report\n', classification_report(y_train, train_predictions))

# Validation-split metrics
r2_t_dt_val = r2_score(y_val, val_predictions)
accuracy_t_dt_val = accuracy_score(y_val, val_predictions)
f1_t_dt_val = f1_score(y_val, val_predictions)

print('r2_t_dt_val=', r2_t_dt_val)
print('accuracy_t_dt_val =', accuracy_t_dt_val)
print('f1_t_dt_val =', f1_t_dt_val)
print('Validation classification report\n', classification_report(y_val, val_predictions))

Best parameters: {'modeling__max_depth': 5, 'modeling__min_samples_leaf': 1, 'modeling__min_samples_split': 2}
Best cross-validation score: 0.8278452182458983
*DT Evaluation*
r2_t_dt_train= 0.31409241458641834
accuracy_t_dt_train = 0.8333893369175627
f1_t_dt_train = 0.8621983417481125
Train classification report
               precision    recall  f1-score   support

           0       0.83      0.75      0.79      7424
           1       0.83      0.89      0.86     10432

    accuracy                           0.83     17856
   macro avg       0.83      0.82      0.83     17856
weighted avg       0.83      0.83      0.83     17856

r2_t_dt_val= 0.2587953649980881
accuracy_t_dt_val = 0.8210125448028673
f1_t_dt_val = 0.8541704690636978
Validation classification report
               precision    recall  f1-score   support

           0       0.81      0.73      0.77      1820
           1       0.83      0.89      0.85      2644

    accuracy                           0.82      4464
  

## Interpretation After Tuning

1. **Model Fit Analysis:**
   - The R² score of 0.31 indicates that the model explains only 31% of the variance in the training data. While the accuracy is relatively high (83%), the gap between precision and recall suggests potential issues with class balance or model complexity.
   - Validation accuracy (about 82%) is close to training accuracy (about 83%), so the tuned tree no longer shows the severe overfitting of the untuned tree.

2. **Good Fit vs. Overfit:**
   - The model appears to be a good fit: the validation metrics (accuracy 0.82, F1 0.85) closely track the training metrics (accuracy 0.83, F1 0.86), which indicates that it generalizes well rather than overfitting.

<div style="background-color: #4CAF50; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
  Ensemble Techniques (Random Forest)
</div>

In [87]:
# Random Forest: shared preprocessing followed by an otherwise-default forest.
# random_state is fixed so bootstrap sampling and feature selection, and
# therefore the reported metrics, are reproducible across runs.
Random_forest_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modeling', RandomForestClassifier(random_state=2025))
])

# Fit on the training split, then predict the training and validation splits
Random_forest_pipeline.fit(x_train, y_train)
train_predictions = Random_forest_pipeline.predict(x_train)
val_predictions = Random_forest_pipeline.predict(x_val)

print('*Random Forest Evaluation*')

# Training-split metrics (names are reused by the model-comparison table)
r2_rf_train = r2_score(y_train, train_predictions)
accuracy_rf_train = accuracy_score(y_train, train_predictions)
f1_rf_train = f1_score(y_train, train_predictions)

print('r2_rf_train=', r2_rf_train)
print('accuracy_rf_train =', accuracy_rf_train)
print('f1_rf_train =', f1_rf_train)
print('Train classification report\n', classification_report(y_train, train_predictions))

# Validation-split metrics
r2_rf_val = r2_score(y_val, val_predictions)
accuracy_rf_val = accuracy_score(y_val, val_predictions)
f1_rf_val = f1_score(y_val, val_predictions)

print('r2_rf_val=', r2_rf_val)
print('accuracy_rf_val =', accuracy_rf_val)
print('f1_rf_val =', f1_rf_val)
print('Validation classification report\n', classification_report(y_val, val_predictions))

*Random Forest Evaluation*
r2_rf_train= 1.0
accuracy_rf_train = 1.0
f1_rf_train = 1.0
Train classification report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7424
           1       1.00      1.00      1.00     10432

    accuracy                           1.00     17856
   macro avg       1.00      1.00      1.00     17856
weighted avg       1.00      1.00      1.00     17856

r2_rf_val= 0.3190936144037505
accuracy_rf_val = 0.8355734767025089
f1_rf_val = 0.8644756277695717
Validation classification report
               precision    recall  f1-score   support

           0       0.82      0.76      0.79      1820
           1       0.84      0.89      0.86      2644

    accuracy                           0.84      4464
   macro avg       0.83      0.82      0.83      4464
weighted avg       0.83      0.84      0.83      4464



## Interpretation
### Fit Analysis
- **Training Results**: The model performs perfectly on the training set, indicated by an R² score of 1.0 and perfect classification metrics. This often suggests that the model may be **overfitting** to the training data.
- **Validation Results**: The significantly lower R² score (0.3191) on the validation set, along with the precision, recall, and F1-score values, indicates that the model does not generalize well to unseen data. This reinforces the possibility of **overfitting**.

## Recommendations for Improvement
1. **Hyperparameter Tuning**: Experiment with different hyperparameter settings using techniques like grid search or random search.
2. **Ensemble Methods**: Consider using ensemble methods like bagging or boosting to improve generalization.

<div style="background-color: #4CAF50; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
  Ensemble Techniques (Bagging)
</div>

In [89]:
# Bagging: shared preprocessing followed by an otherwise-default BaggingClassifier.
# random_state is fixed so the bootstrap resampling, and therefore the reported
# metrics, are reproducible across runs.
bagging_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('modeling', BaggingClassifier(random_state=2025))
])

# Fit the model on the training split
bagging_pipeline.fit(x_train, y_train)

# Predict the training and validation splits
train_predictions = bagging_pipeline.predict(x_train)
val_predictions = bagging_pipeline.predict(x_val)

# Evaluation
print('*Bagging Classifier Evaluation*')

# Training-split metrics (names are reused by the model-comparison table)
r2_bagging_train = r2_score(y_train, train_predictions)
accuracy_bagging_train = accuracy_score(y_train, train_predictions)
f1_bagging_train = f1_score(y_train, train_predictions)

# Labels name the bagging metrics; they previously carried the Random Forest
# 'rf' prefix copied from the preceding cell.
print('r2_bagging_train=', r2_bagging_train)
print('accuracy_bagging_train =', accuracy_bagging_train)
print('f1_bagging_train =', f1_bagging_train)
print('Train classification report\n', classification_report(y_train, train_predictions))

# Validation-split metrics
r2_bagging_val = r2_score(y_val, val_predictions)
accuracy_bagging_val = accuracy_score(y_val, val_predictions)
f1_bagging_val = f1_score(y_val, val_predictions)

print('r2_bagging_val=', r2_bagging_val)
print('accuracy_bagging_val =', accuracy_bagging_val)
print('f1_bagging_val =', f1_bagging_val)
print('Validation classification report\n', classification_report(y_val, val_predictions))

*Bagging Classifier Evaluation*
r2_rf_train= 0.9481246363972922
accuracy_rf_train = 0.9873991935483871
f1_rf_train = 0.9891769685891577
Train classification report
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      7424
           1       0.99      0.99      0.99     10432

    accuracy                           0.99     17856
   macro avg       0.99      0.99      0.99     17856
weighted avg       0.99      0.99      0.99     17856

r2_rf_val= 0.25508470349620116
accuracy_rf_val = 0.8201164874551972
f1_rf_val = 0.8488612836438924
Validation classification report
               precision    recall  f1-score   support

           0       0.78      0.77      0.78      1820
           1       0.84      0.85      0.85      2644

    accuracy                           0.82      4464
   macro avg       0.81      0.81      0.81      4464
weighted avg       0.82      0.82      0.82      4464



## Interpretation 
- **Fit Status**: The model shows signs of **overfitting**, as it performs very well on the training data but has lower performance metrics on the validation set.

### Recommendations for Improvement
1. **Hyperparameter Tuning**: Experiment with parameters such as `n_estimators` or `max_samples`.
2. **Increase Data**: Incorporate more diverse training data if possible.
3. **Ensemble Techniques**: Combine with other methods (e.g., Random Forest) to enhance robustness.

<div style="background-color: #3b8d99; color: white; padding: 20px; border-radius: 5px; font-size: 30px; text-align: center; width: fit-content; margin: 0 auto;">
 Comparing The models
    
</div>

In [117]:
import pandas as pd

# One row per (model, split): label, accuracy, F1, R² — all taken from the
# metric variables computed in the model cells above.
comparison_rows = [
    ('Logistic Regression (Train)', accuracy_lr_train, f1_lr_train, r2_lr_train),
    ('Logistic Regression (val)', accuracy_lr_val, f1_lr_val, r2_lr_val),
    ('Decision Tree (Train)', accuracy_dt_train, f1_dt_train, r2_dt_train),
    ('Decision Tree (val)', accuracy_dt_val, f1_dt_val, r2_dt_val),
    ('Tuned DT (train)', accuracy_t_dt_train, f1_t_dt_train, r2_t_dt_train),
    ('Tuned DT (val)', accuracy_t_dt_val, f1_t_dt_val, r2_t_dt_val),
    ('KNN (Train)', accuracy_knn_train, f1_knn_train, r2_knn_train),
    ('KNN (val)', accuracy_knn_val, f1_knn_val, r2_knn_val),
    ('Tuned KNN (Train)', accuracy_t_knn_train, f1_t_knn_train, r2_t_knn_train),
    ('Tuned KNN (val)', accuracy_t_knn_val, f1_t_knn_val, r2_t_knn_val),
    ('Random Forest (Train)', accuracy_rf_train, f1_rf_train, r2_rf_train),
    ('Random Forest (val)', accuracy_rf_val, f1_rf_val, r2_rf_val),
    ('Bagging (Train)', accuracy_bagging_train, f1_bagging_train, r2_bagging_train),
    ('Bagging (val)', accuracy_bagging_val, f1_bagging_val, r2_bagging_val),
]

# Transpose the rows into the column-oriented dictionary the DataFrame is built from
model_names, accuracies, f1_scores, r2_scores = zip(*comparison_rows)
results = {
    'Model': list(model_names),
    'Accuracy': list(accuracies),
    'F1 Score': list(f1_scores),
    'R² Score': list(r2_scores),
}
results_df = pd.DataFrame(results)

# Show floats with six decimals (this changes the global pandas display option)
pd.set_option('display.float_format', '{:.6f}'.format)
results_df

Unnamed: 0,Model,Accuracy,F1 Score,R² Score
0,Logistic Regression (Train),0.853663,0.87667,0.397554
1,Logistic Regression (val),0.84543,0.87189,0.359911
2,Decision Tree (Train),1.0,1.0,1.0
3,Decision Tree (val),0.767025,0.803625,0.035228
4,Tuned DT (train),0.833389,0.862198,0.314092
5,Tuned DT (val),0.821013,0.85417,0.258795
6,KNN (Train),0.871528,0.893153,0.471102
7,KNN (val),0.806676,0.842201,0.199425
8,Tuned KNN (Train),0.856855,0.882364,0.410696
9,Tuned KNN (val),0.821013,0.855854,0.258795


# Interpretation

- The best model is **Logistic Regression**: it maintains a good balance between training and validation metrics.
- **KNN** (after tuning) also performs well without notable overfitting, so it is also a good choice.
   