# Classification task: predicting whether a client will subscribe to a term deposit

## Imports

In [14]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, KMeansSMOTE
from imblearn.pipeline import Pipeline as imblearn_Pipeline

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use the fully qualified option name: the bare 'float_format' only works through
# pandas' partial (regex) matching and would break if another option containing
# that text is ever added.
pd.set_option('display.float_format', '{:,.2f}'.format)

%load_ext autoreload
%autoreload all
from process_bank_deposit import (get_train_test_data, get_preprocessor,
                                  get_auc, get_confusion_matrix)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the data

In [2]:
# Load the raw bank dataset from the CSV file stored relative to the notebook.
raw_data = pd.read_csv('./data/bank_data.csv')

In [5]:
# Split the raw data with the project helper; 'y' is presumably the name of the
# target column (confirm against process_bank_deposit.get_train_test_data).
data_dict = get_train_test_data(raw_data, 'y')

# Pull the individual train / test pieces out of the returned dictionary.
train_inputs = data_dict['train_inputs']
train_targets = data_dict['train_targets']
test_inputs = data_dict['test_inputs']
test_targets = data_dict['test_targets']

## Resampling Methods

Let's try applying resampling methods to balance our dataset. Resampling must be applied only to the training data, never to the validation or test sets.

In [8]:
# scale_pos_weight = (number of negatives) / (number of positives) in the training targets.
# Assumes train_targets is 0/1 encoded, so that its sum is the positive count - TODO confirm.
n_positive = sum(train_targets)
n_negative = len(train_targets) - n_positive
scale_pos_weight = np.round(n_negative / n_positive, 3)
scale_pos_weight

7.876

In [17]:
def get_resample_eval_results(resampling_method, preprocessor, clf, inputs, targets,
                              cv=3, n_jobs=None):
    """
    Evaluate a resampling method / classifier pair using cross-validated AUROC.

    The preprocessor, the resampler and the classifier are chained into a single
    imbalanced-learn pipeline that is passed to ``cross_validate``, so the whole
    chain is fitted again for every fold.

    Parameters
    ----------
    resampling_method : sampler object
        Resampler placed between the preprocessor and the classifier
        (e.g. ``ADASYN(random_state=42)``).
    preprocessor : transformer
        First pipeline step, applied before resampling.
    clf : estimator
        Classifier used as the final pipeline step.
    inputs, targets
        Features and labels handed to ``cross_validate`` as ``X`` and ``y``.
    cv : int or cross-validation splitter, default=3
        Forwarded to ``cross_validate``; the default keeps the original 3 folds.
    n_jobs : int or None, default=None
        Forwarded to ``cross_validate`` to optionally evaluate folds in parallel.

    Returns
    -------
    dict
        ``'Resampling Method'`` and ``'Classifier'`` (the text before the first
        ``'('`` in each object's string form), plus ``'AUROC on train'`` and
        ``'AUROC on validation'`` (mean score over the folds).
    """
    # Human-readable labels: keep only the class name part of the repr.
    results_dict = {'Resampling Method': str(resampling_method).split('(')[0],
                    'Classifier': str(clf).split('(')[0]}

    model_pipeline = imblearn_Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('resampling', resampling_method),
        ('classifier', clf)
    ])

    cv_scores = cross_validate(model_pipeline,
                               X=inputs,
                               y=targets,
                               scoring='roc_auc',
                               cv=cv,
                               n_jobs=n_jobs,
                               return_train_score=True)

    # cross_validate reports the held-out fold under 'test_score'; here it plays
    # the role of the validation score.
    results_dict['AUROC on train'] = np.mean(cv_scores['train_score'])
    results_dict['AUROC on validation'] = np.mean(cv_scores['test_score'])
    return results_dict

In [33]:
# Linear and distance-based models.
log_reg = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
)
knn = KNeighborsClassifier(n_neighbors=15)

# Single tree and bagged trees, both limited to depth 5.
dt = DecisionTreeClassifier(
    max_depth=5,
    class_weight='balanced',
    random_state=42,
)
rf = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    random_state=42,
)

# Gradient-boosted models.
# NOTE(review): scale_pos_weight was computed from the original training targets;
# when these models are fitted on resampled data the positive class may end up
# weighted twice - confirm this is intended.
xgb = XGBClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)

lgb = LGBMClassifier(
    n_estimators=50,
    learning_rate=0.03,
    max_depth=7,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    verbose=0,
)

In [22]:
# Resamplers to compare; each one is created with the same fixed seed.
resampling_methods = [
    sampler_cls(random_state=42)
    for sampler_cls in (ADASYN, SVMSMOTE, SMOTEENN, SMOTETomek)
]

In [34]:
results = []

# Each group pairs a list of classifiers with the tree_base_model flag that is
# passed to get_preprocessor for them.
model_groups = [
    ([log_reg, knn], False),
    ([dt, rf, xgb, lgb], True),
]

# Evaluate every classifier with every resampling method, group by group.
for classifiers, is_tree_based in model_groups:
    for clf in classifiers:
        for method in resampling_methods:
            preprocessor = get_preprocessor(train_inputs,
                                            tree_base_model=is_tree_based)
            res = get_resample_eval_results(method, preprocessor, clf,
                                            train_inputs, train_targets)
            results.append(res)

In [25]:
# One row per (resampling method, classifier) pair; shade the validation AUROC column.
results_df = pd.DataFrame(results)
results_df.style.background_gradient(subset=['AUROC on validation'])

Unnamed: 0,Resampling Method,Classifier,AUROC on train,AUROC on validation
0,ADASYN,LogisticRegression,0.792682,0.786038
1,SVMSMOTE,LogisticRegression,0.793194,0.788559
2,SMOTEENN,LogisticRegression,0.794866,0.789396
3,SMOTETomek,LogisticRegression,0.79332,0.787523
4,ADASYN,KNeighborsClassifier,0.923395,0.709011
5,SVMSMOTE,KNeighborsClassifier,0.877301,0.741219
6,SMOTEENN,KNeighborsClassifier,0.876897,0.734995
7,SMOTETomek,KNeighborsClassifier,0.909308,0.723554
8,ADASYN,DecisionTreeClassifier,0.750485,0.739681
9,SVMSMOTE,DecisionTreeClassifier,0.776253,0.771757


**Model performance analysis and conclusion**
- After applying different resampling methods, the AUROC scores decreased for all models compared to the original imbalanced dataset.
- Every model now has an AUROC ≤ 0.789, which is lower than the results before resampling.
- `LogisticRegression` consistently performs worse than before resampling, regardless of the resampling method used.
- For the other classifiers, `SVMSMOTE` produced the best results, but they still performed worse than the models trained on the original dataset without balancing.

**Possible reasons for performance drop:**
- Resampling methods generate synthetic samples for the minority class, but these samples may not accurately represent real-world distributions.
- The models may struggle to generalize because synthetic data introduce noise, making classification harder.