In [2]:
import sys
sys.path.append("../") # go to parent dir

import numpy as np
import pandas as pd
from typing import Dict, List, Any
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn import metrics
from imblearn.over_sampling import SMOTE

# classification algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# metrics
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix,precision_score, recall_score, accuracy_score, balanced_accuracy_score, classification_report, precision_recall_curve, roc_curve,f1_score    

import joblib

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from utils import read_xlsx_file

# TODO:
# import black
# import jupyter_black
# jupyter_black.load(
#     lab=True,
#     line_length=100,
#     verbosity="INFO",
#     target_version=black.TargetVersion.PY310,
# )

ModuleNotFoundError: No module named 'utils'

In [2]:
# read the data 
path_to_train_data = "../data/train_file.xlsx"
df = read_xlsx_file(path_to_train_data)
df.head() 

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,1,failure,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,0,nonexistent,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,0,nonexistent,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,0,nonexistent,no


In [3]:
df.drop_duplicates(keep="last", inplace=True)  # remove duplicate

#### Remove some features or their categories

The following *features* will be removed:
* **duration**: This feature is highly correlated with the dependent variable "y". The data suggest that longer contact times are associated with a higher probability of subscribing to a fixed-term deposit. However, the duration of a contact is only known after the contact has been completed and the customer has made his decision. If we want to use this model for predictive inference in production, where predictions need to be made before the contact takes place, including "duration" as a feature is impractical. Therefore, this feature should be excluded from the training data to ensure that the model can be used effectively for real-time prediction.
* **day_of_week**: EDA has shown that this feature does not have a significant impact on the customer"s decision. Given its minimal impact, including it as a feature would not significantly improve the predictive performance of the model. Removing this feature from the training data helps to simplify the model and focus on more important features.

In [4]:
features_to_remove = ["duration", "day_of_week"]
df_adjusted = df.drop(features_to_remove, axis=1)

**Dealing with unknown categories:** the *"unknown"* categories for such features, such as "job", "education", "default", "housing", "loan" will be removed, as they don"t provide significant predictive value.

In [5]:
df_adjusted = df_adjusted.query('job != "unknown" & education != "unknown" & default != "unknown" & housing != "unknown"')
df_adjusted.reset_index(inplace=True)

**Combining basic education categories:** to simplify the dataset and improve model performance, all basic education categories ("basic.4y", "basic.6y", "basic.9y") are combined into a single, more general category "education.basic". This will reduce the complexity of the education feature and help the model to generalize better by treating all levels of basic education as equivalent.

In [6]:
df_adjusted["education"] = df_adjusted["education"].replace(["basic.4y", "basic.6y", "basic.9y"], "education.basic")
df_adjusted.sample(n=3)

Unnamed: 0,index,age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
15650,21245,31,housemaid,single,high.school,no,no,yes,cellular,jul,2,0,nonexistent,no
19463,26366,34,technician,single,professional.course,no,yes,no,cellular,may,11,0,nonexistent,no
3816,5196,34,unemployed,married,university.degree,no,yes,no,telephone,nov,1,0,nonexistent,no


**Binning age:** given the wide distribution of ages in the dataset, we will split this category into four quantile-based bins. This approach will group the ages into four equally sized bins, which will help to normalize the distribution and potentially improve the performance of the model by reducing the effect of outliers.

In [7]:
bins_nmb = 5
age_order = ["young", "young_adult", "middle_aged", "late_middle_aged", "middle_old_age"]
bins_age = pd.cut(df_adjusted["age"], bins=bins_nmb, labels=age_order)
df_adjusted.insert(1, "bins_age", bins_age) # Min/Max in each bin: [(16.926, 31.8] < (31.8, 46.6] < (46.6, 61.4] < (61.4, 76.2] < (76.2, 91.0]]
# remove age column from dataframe
df_adjusted.drop("age", axis=1, inplace=True)

In [8]:
# bins_age

#### Pipeline definition with encoding and scaling categorical and numerical features

In [9]:
# encoding with LabelEncoder
label_encoder = LabelEncoder()
df_adjusted["contact"] = label_encoder.fit_transform(df_adjusted["contact"])

# encoding with binary values
binary_mapping = {"yes": 1, "no": 0}
columns_to_map = ["default", "loan", "housing", "y"]
for column in columns_to_map:
    df_adjusted[column] = df_adjusted[column].map(binary_mapping)

In [10]:
df_adjusted.head()

Unnamed: 0,index,bins_age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
0,1,young_adult,entrepreneur,married,university.degree,0,0,0,1,nov,2,1,failure,0
1,2,middle_old_age,retired,married,education.basic,0,0,0,0,jul,1,0,nonexistent,1
2,3,young_adult,admin.,married,university.degree,0,1,0,1,may,2,0,nonexistent,0
3,4,middle_aged,retired,divorced,university.degree,0,0,0,0,jun,2,0,nonexistent,0
4,5,young,admin.,single,university.degree,0,0,0,0,aug,2,0,nonexistent,0


In [11]:
# hierarchical order for some ordinal features
education_order = ["illiterate", "education.basic", "high.school", "professional.course", "university.degree"]
month_order = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
poutcome_order = ["nonexistent", "failure", "success"]

In [12]:
# define ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        # ordinal encoding
        ("bins_age_enc", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[age_order])),
            ('scaler', StandardScaler())
        ]), ['bins_age']),
        ("education", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[education_order])),
            ('scaler', StandardScaler())
        ]), ['education']),
        ("month", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[month_order])),
            ('scaler', StandardScaler())
        ]), ['month']),
        ("poutcome", Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=[poutcome_order])),
            ('scaler', StandardScaler())
        ]), ['poutcome']),
        
        # LabelEncoder was applied separately
        ("contact", "passthrough", ["contact"]),
        
        # binary encoding was applied separately
        ("binary", "passthrough", ["default", "loan", "housing"]),
        
        # One-Hot encoding for job and marital
        ("job_marital", OneHotEncoder(), ["job", "marital"]),
        
        # Standard scaling of the rest numeric features
        ("scaling", StandardScaler(), ["previous", "campaign"])
    ],
    remainder="passthrough"  # leave the other columns unchanged
)

In [13]:
# Split the data into training and test sets
X = df_adjusted.drop("y", axis=1)
y = df_adjusted["y"]

#### Data splitting and balancing

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
print(f"Training set shape: {X_train.shape} --- {y_train.shape}")
print(f"Testing set shape: {X_test.shape} --- {y_test.shape}")

Training set shape: (21882, 13) --- (21882,)
Testing set shape: (2432, 13) --- (2432,)


#### Model selection

In [15]:
# define the models
models = {
    "LogisticRegression": LogisticRegression(solver="liblinear", max_iter=1000),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVC": SVC(),
    "KNN": KNeighborsClassifier()
}

##### Model evaluation using cross-validation

In [16]:
def show_conf_mtx(y_gold, y_pred, model_name):
    conf_mtx = confusion_matrix(y_gold, y_pred)
    plt.figure(figsize=(4, 2))
    sns.heatmap(conf_mtx, annot=True, cmap="Blues", fmt='d', cbar=False, annot_kws={"fontsize":8})
    plt.title(f'{model_name}', fontsize=10, pad=10)
    plt.xlabel('Predicted', fontsize=8)
    plt.ylabel('True', fontsize=8)
    plt.show()

In [17]:
# models evaluation using cross-validation
def get_mean_val(scores_dict: Dict[str, np.ndarray], val_name: str):
    if val_name not in scores_dict:
        raise KeyError(f"{val_name} is not found in the scores dictionary.")
    return scores_dict[val_name].mean()
    
results = {}
skf = StratifiedKFold(n_splits=5) # StratifiedKFold is used by default for classification tasks

metrics = [
    'accuracy',
    'f1',
    "roc_auc"]

# for model_name, model in models.items():    
#     pipeline = ImbPipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('smote', SMOTE()),  # since the data is very imbalanced, it's better to balance them
#         ('classifier', model)
#     ])

#     cv_scores_dict = {}
#     cv_scores = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=metrics)
#     results[model_name] = {"fit_time_mean": get_mean_val(cv_scores, "fit_time"),
#                            "accuracy_mean": get_mean_val(cv_scores, "test_accuracy"),
#                            "f1_mean": get_mean_val(cv_scores, "test_f1"),
#                            "roc_auc_mean": get_mean_val(cv_scores, "test_roc_auc")
#                           }

In [18]:
# results

In [19]:
# # best model based on F1 score
# best_model_name = max(results, key=lambda model: results[model]["accuracy_mean"])
# best_model_name 

##### Train the best model on the full training data

In [20]:
# pipeline = ImbPipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('smote', SMOTE()),
#     ('classifier', models[best_model_name])
# ])
# pipeline.fit(X_train, y_train)

# # model evaluation on the test date
# y_pred = pipeline.predict(X_test)

# print(classification_report(y_test, y_pred))

In [21]:
##### TODO: remove
best_model_name = "GradientBoosting"

In [22]:
# params = {
#     'learning_rate': trial.suggest_float('learning_rate', 0.08, 1.0),
#     'n_estimators': trial.suggest_int('n_estimators', 300, 500),
#     'max_depth':trial.suggest_int('max_depth', 3, 7),
#     'max_features': trial.suggest_int('max_features', 5, 15),
#     'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
#     'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
#     'random_state': 42
#   }

# feature_name = "min_samples_split"


# train_results = []
# test_results = []
# for min_samples_split in min_samples_split_l:
#     model = GradientBoostingClassifier(learning_rate=1.0,
#                                        n_estimators=500,
#                                        max_depth=6,
#                                        max_features=15,
#                                       )
#     # Define the pipeline with SMOTE and the preprocessor
#     pipeline = ImbPipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('smote', SMOTE()),
#         ('classifier', model)
#         ])
    
#     pipeline.fit(X_train, y_train)
#     train_pred = pipeline.predict(X_train)
#     roc_auc = roc_auc_score(y_train, train_pred)
#     train_results.append(roc_auc)
    
#     y_pred = pipeline.predict(X_test)
#     roc_auc = roc_auc_score(y_test, y_pred)
#     test_results.append(roc_auc)

# from matplotlib.legend_handler import HandlerLine2D
# line1, = plt.plot(max_features_l, train_results, 'b', label="Train AUC")
# line2, = plt.plot(max_features_l, test_results, 'r', label="Test AUC")
# plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
# plt.ylabel('AUC score')
# plt.xlabel(feature_name)
# plt.show()


##### Hyperparameter tuning of the best model

In [6]:
# predifine a set of hyperparameters for the models
models_params = {
    "LogisticRegression":  {
        'C': trial.suggest_float('C', 0.01, 0.1), # Regularization parameter, controlling the trade-off between maximizing the margin and minimizing classification error
        'solver': trial.suggest_categorical('solver', ["liblinear", "newton-cholesky"])
    },
    "DecisionTreeClassifier": {
        'splitter': trial.suggest_categorical('splitter', ["best", "random"]),
        'max_depth': trial.suggest_int('max_depth', 1, 4),
        'class_weight': trial.suggest_categorical('class_weight', [None, "balanced"])
    },
    "RandomForest": {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 4),
    },
    "GradientBoosting": {
        'learning_rate': trial.suggest_float('learning_rate', 0.08, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'random_state': 42
    },
    "SVC": {
        'C': trial.suggest_float('C', 0.1, 1.5),
        'gamma': trial.suggest_categorical('gamma', ["scale", "auto"]),
        'kernel':  trial.suggest_categorical('kernel', ["linear", "poly", "rbf"])
    },
    "KNN": {
        'n_neighbors': trial.suggest_int('n_neighbors', 2, 19),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['l2', 'manhattan', 'cosine'])
    }
}

NameError: name 'trial' is not defined

In [None]:
# hyperparemeter tuning with optuna
import optuna

def objective(trial):
     # Define hyperparameters to tune
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.08, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'random_state': 42
      }


    # Create the model with trial parameters
    model = GradientBoostingClassifier(**params)

    # Define the pipeline with SMOTE and the preprocessor
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE()),
        ('classifier', model)
    ])
    
   
    # Perform cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1)
    mean_cv_score = cv_scores.mean()
    
    # Report intermediate values for pruning
    trial.report(mean_cv_score, step=0)
    
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
    
    # Return the mean of the cross-validation scores
    return mean_cv_score

# Set up the Optuna study
n_trials=500
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///db.sqlite3",
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5))
study.optimize(objective, n_trials=n_trials)

# Print the best hyperparameters
print(f'Best hyperparameters: {study.best_params}')

# Train the final model with the best hyperparameters
best_params = study.best_params
best_model = GradientBoostingClassifier(
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42
)


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-06-03 16:39:10,583] A new study created in RDB with name: no-name-2680ea83-ac70-4759-9f58-9d3a066e5f66
[I 2024-06-03 16:39:25,415] Trial 0 finished with value: 0.31204701246073996 and parameters: {'learning_rate': 0.33362967435484786, 'n_estimators': 165}. Best is trial 0 with value: 0.31204701246073996.
[I 2024-06-03 16:39:47,637] Trial 1 finished with value: 0.3035797206166133 and parameters: {'learning_rate': 0.1462565578756947, 'n_estimators': 271}. Best is trial 0 with value: 0.31204701246073996.
[I 2024-06-03 16:40:24,550] Trial 2 finished with value: 0.30862796972569456 and parameters: {'learning_rate': 0.6166439999659143, 'n_estimators': 448}. Best is trial 0 with value: 0.31204701246073996.
[I 2024-06-03 16:40:33,779] Trial 3 finished with value: 0.30667817241357087 and parameters: {'learning_rate': 0.38532553764890204, 'n_estimators': 103}. Best is trial 0 with value: 0.31204701246073996.
[I 2024-06-03 16:40:45,676] Tr

In [None]:
# Define the pipeline with the best model
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE()),
    ('classifier', best_model)
])

# Train the model on the full training set
pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# # get the parameters for tuning based on best model
# params_to_tune = models_params[best_model_name]  # Change this to the parameter grid for the chosen model
# print(f"Following parameters of the '{best_model_name}' model will be tuned:\n{params_to_tune}")

# # hyperparameter tuning with GridSearchCV
# pipeline = ImbPipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('smote', SMOTE()),
#     ('classifier', models[best_model_name])
# ])

# grid_search = GridSearchCV(pipeline, params_to_tune, cv=skf, scoring='f1', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# print(f'Best parameters of {best_model_name} model: {grid_search.best_params_}')
# print(f'Best CV F1 Score: {grid_search.best_score_}')

# # Train the best model on the full training set with the best parameters
# best_model = grid_search.best_estimator_
# best_model.fit(X_train, y_train)

# # Evaluate the best model on the test set
# y_pred = best_model.predict(X_test)


# # Output the test predictions and evaluation
# print(classification_report(y_test, y_pred))

In [None]:
def compute_metrics_scores(y_target, y_predicted) -> Dict[str, Any]:
    metric_types = [accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score]
    metric_scores_dict = dict()
    for metric_class in metric_types:
        metric_name = metric_class.__name__
        metric = metric_class(y_target, y_predicted)  # initialize metric
        metric_scores_dict[f"{metric_name}"] = metric
    return metric_scores_dict

metric_scores = compute_metrics_scores(y_test, y_pred)
metric_scores

In [None]:
# Generate ROC curve and Precision-Recall curve
false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)
precision, recall, _ = precision_recall_curve(y_test, y_pred)

In [None]:
# Plot ROC curve
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(false_positive_rate, true_positive_rate, label=f'ROC Curve area (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')

In [None]:
# Plot Precision-Recall curve
plt.subplot(1, 2, 2)
plt.plot(recall, precision, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()


In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

#### Persist the model and preprocessor and make predictions

In [None]:
# Persist the trained model and preprocessor
joblib.dump(pipeline, 'trained_model_pipeline.pkl')

# Load the model and preprocessor for future use
loaded_pipeline = joblib.load('trained_model_pipeline.pkl')

In [None]:
# read the data   # TODO: function
path_to_test_data = "../data/test_file.xlsx"
df_test_data = pd.read_excel(path_to_test_data)
df_test_data

In [None]:
df_test_data = df_test_data.drop(features_to_remove, axis=1)
df_test_data.query('job != "unknown" & education != "unknown" & default != "unknown" & housing != "unknown"', inplace=True)
df_test_data.reset_index(inplace=True)
df_test_data["education"] = df_test_data["education"].replace(["basic.4y", "basic.6y", "basic.9y"], "education.basic")

In [None]:
bins_age = pd.cut(df_test_data["age"], bins=bins_nmb, labels=age_order)
df_test_data.insert(1, "bins_age", bins_age)
# remove age column from dataframe
df_test_data.drop("age", axis=1, inplace=True)
df_test_data

In [None]:
# encoding with LabelEncoder
label_encoder = LabelEncoder()
df_test_data["contact"] = label_encoder.fit_transform(df_test_data["contact"])

# encoding with binary values
binary_mapping = {"yes": 1, "no": 0}
columns_to_map = ["default", "loan", "housing"]  # no y
for column in columns_to_map:
    df_test_data[column] = df_test_data[column].map(binary_mapping)

In [None]:
df_test_data

In [None]:
# Load the model and preprocessor for future use
loaded_pipeline = joblib.load('trained_model_pipeline.pkl')

# Use the loaded model to make predictions on new data
new_predictions = loaded_pipeline.predict(df_test_data)
print(new_predictions)

In [None]:
for pred in y_pred:
    if pred == 1:
        print(f"{y_pred.index(pred)}: {pred}")