In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn import metrics
from imblearn.over_sampling import SMOTE

# classification algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# metrics
from sklearn.metrics import roc_auc_score, confusion_matrix,precision_score, recall_score, accuracy_score, balanced_accuracy_score, classification_report,roc_curve,f1_score    



# TODO:
# import black
# import jupyter_black
# jupyter_black.load(
#     lab=True,
#     line_length=100,
#     verbosity="INFO",
#     target_version=black.TargetVersion.PY310,
# )

In [2]:
# read the data   # TODO: function
path_to_train_data = "../data/train_file.xlsx"
df = pd.read_excel(path_to_train_data)
df.head() 

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,1,failure,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,0,nonexistent,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,0,nonexistent,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,0,nonexistent,no


In [3]:
df.drop_duplicates(keep="last", inplace=True)  # remove duplicate

#### Remove some features or their categories

The following *features* will be removed:
* **duration**: This feature is highly correlated with the dependent variable "y". The data suggest that longer contact times are associated with a higher probability of subscribing to a fixed-term deposit. However, the duration of a contact is only known after the contact has been completed and the customer has made his decision. If we want to use this model for predictive inference in production, where predictions need to be made before the contact takes place, including "duration" as a feature is impractical. Therefore, this feature should be excluded from the training data to ensure that the model can be used effectively for real-time prediction.
* **day_of_week**: EDA has shown that this feature does not have a significant impact on the customer"s decision. Given its minimal impact, including it as a feature would not significantly improve the predictive performance of the model. Removing this feature from the training data helps to simplify the model and focus on more important features.

In [4]:
features_to_remove = ["duration", "day_of_week"]
df_adjusted = df.drop(features_to_remove, axis=1)

**Dealing with unknown categories:** the *"unknown"* categories for such features, such as "job", "education", "default", "housing", "loan" will be removed, as they don"t provide significant predictive value.

In [5]:
df_adjusted = df_adjusted.query('job != "unknown" & education != "unknown" & default != "unknown" & housing != "unknown"')

**Combining basic education categories:** to simplify the dataset and improve model performance, all basic education categories ("basic.4y", "basic.6y", "basic.9y") are combined into a single, more general category "education.basic". This will reduce the complexity of the education feature and help the model to generalize better by treating all levels of basic education as equivalent.

In [6]:
df_adjusted["education"] = df_adjusted["education"].replace(["basic.4y", "basic.6y", "basic.9y"], "education.basic")
df_adjusted.sample(n=3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
25043,35,technician,single,high.school,no,yes,no,cellular,aug,2,0,nonexistent,no
63,36,unemployed,married,university.degree,no,no,no,cellular,jul,2,0,nonexistent,no
17494,33,admin.,married,university.degree,no,no,no,cellular,jul,1,0,nonexistent,no


**Binning age:** given the wide distribution of ages in the dataset, we will split this category into four quantile-based bins. This approach will group the ages into four equally sized bins, which will help to normalize the distribution and potentially improve the performance of the model by reducing the effect of outliers.

In [7]:
bins_nmb = 4
age_order = ["young", "young_adult", "middle_aged", "late_middle_aged"]
bins_age = pd.qcut(df_adjusted["age"], q=4, labels=age_order)
df_adjusted.insert(1, "bins_age", bins_age) # Min/Max in each bin: [(16.999, 31.0] < (31.0, 37.0] < (37.0, 45.0] < (45.0, 91.0]]
# remove age column from dataframe
df_adjusted.drop("age", axis=1, inplace=True)

In [8]:
# df_adjusted.bins_age.unique()

#### Pipeline definition with encoding and scaling categorical and numerical features

In [9]:
# encoding with LabelEncoder
label_encoder = LabelEncoder()
df_adjusted["contact"] = label_encoder.fit_transform(df_adjusted["contact"])

# encoding with binary values
binary_mapping = {"yes": 1, "no": 0}
columns_to_map = ["default", "loan", "housing", "y"]
for column in columns_to_map:
    df_adjusted[column] = df_adjusted[column].map(binary_mapping)

In [10]:
# df_adjusted.head()

In [11]:
# hierarchical order for some ordinal features
education_order = ["illiterate", "education.basic", "high.school", "professional.course", "university.degree"]
month_order = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
poutcome_order = ["nonexistent", "failure", "success"]

In [12]:
# define ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # ordinal encoding
        ("bins_age_enc", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[age_order])),
            ('scaler', StandardScaler())
        ]), ['bins_age']),
        ("education", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[education_order])),
            ('scaler', StandardScaler())
        ]), ['education']),
        ("month", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[month_order])),
            ('scaler', StandardScaler())
        ]), ['month']),
        ("poutcome", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[poutcome_order])),
            ('scaler', StandardScaler())
        ]), ['poutcome']),
        
        # LabelEncoder will be applied separately
        ("contact", "passthrough", ["contact"]),
        
        # binary encoding will be applied separately
        ("binary", "passthrough", ["default", "loan", "housing"]),
        
        # One-Hot encoding for job and marital
        ("job_marital", OneHotEncoder(), ["job", "marital"]),
        
        # Standard scaling of the rest numeric features
        ("scaling", StandardScaler(), ["previous", "campaign"])
    ],
    remainder="passthrough"  # This will leave the other columns unchanged
)

In [13]:
# Split the data
X = df_adjusted.drop("y", axis=1)
y = df_adjusted["y"]

# Define the pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor)
])

# Fit and transform the data
X_preprocessed = pipeline.fit_transform(X)
X_preprocessed

array([[-0.4065239 ,  1.18919997,  2.01043812, ...,  0.        ,
         1.51481141, -0.18901436],
       [ 1.37650524, -1.29334559,  0.13653145, ...,  0.        ,
        -0.37415795, -0.56005385],
       [-0.4065239 ,  1.18919997, -0.80042189, ...,  0.        ,
        -0.37415795, -0.18901436],
       ...,
       [-0.4065239 , -0.4658304 , -0.33194522, ...,  0.        ,
        -0.37415795,  0.18202513],
       [-0.4065239 ,  0.36168479,  0.60500812, ...,  0.        ,
        -0.37415795, -0.56005385],
       [ 1.37650524, -1.29334559, -0.33194522, ...,  0.        ,
        -0.37415795, -0.18901436]])

#### Data splitting and balancing

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y,test_size=0.1)
print(f"Training set shape: {X_train.shape} --- {y_train.shape}")
print(f"Testing set shape: {X_test.shape} --- {y_test.shape}")

Training set shape: (21882, 25) --- (21882,)
Testing set shape: (2432, 25) --- (2432,)


##### Since the target class is very imbalanced, it would be better to balance it before training. 

In [16]:
smt = SMOTE(sampling_strategy="minority", random_state=42)
X_train, y_train = smt.fit_resample(X_train, y_train)
print(f"Resampled training dataset shape: {X_train.shape} --- {y_train.shape}")

Resampled training dataset shape: (38260, 25) --- (38260,)


#### Model selection

In [18]:
# define the models
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

In [19]:
# models evaluation using cross-validation
skf = StratifiedKFold(n_splits=5)  # StratifiedKFold is better for unbalanced dataset
for model_name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
    cv_scores = cross_val_score(pipeline, X, y, cv=skf, scoring="f1")
    print(f"{model_name} --- {cv_scores} --- CV F1 Score: {cv_scores.mean()}")

LogisticRegression --- [0.15472779 0.17039106 0.17440225 0.1492109  0.21921516] --- CV F1 Score: 0.1735894329979696
DecisionTreeClassifier --- [0.29704797 0.29636364 0.28216504 0.27824463 0.30728709] --- CV F1 Score: 0.29222167438004365
RandomForest --- [0.29082774 0.30618893 0.28886439 0.29533116 0.33264463] --- CV F1 Score: 0.3027713687092126
GradientBoosting --- [0.29648241 0.32889964 0.2993865  0.27272727 0.32009627] --- CV F1 Score: 0.30351841893057185
SVM --- [0.31313131 0.32512315 0.27930175 0.28423773 0.31812577] --- CV F1 Score: 0.3039839416456577
KNN --- [0.30900243 0.30822711 0.27885714 0.25899281 0.29966704] --- CV F1 Score: 0.2909493066089281


In [None]:


# Define evaluation metrics
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, precision, recall, f1

In [17]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline

# numeric_features = ["bins_age", "previous", "campaign"]
# fig, axs = plt.subplots(2, 2, figsize=(12, 6))
# axs = axs.flatten()
# for i, feature in enumerate(numeric_features):
#     sns.histplot(data=df_adjusted, x=feature, discrete=True, ax=axs[i], kde=True)
#     axs[i].grid(True)

# # to prevent overlap
# plt.tight_layout()