In [1]:
import sys
sys.path.append("../")  # add the parent directory to sys.path so the `src` package imported below resolves

from src.utils import read_xlsx, create_logging

from src.data_transformer import DataTransformer
from src.modeling import evaluate_models, train_best_model, save_best_model
from sklearn.model_selection import train_test_split

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from src.model_optimizer import Optimizer

In [2]:
# Set up logging through the project helper imported from src.utils.
create_logging()

In [3]:
# Load the training spreadsheet and preview its first five rows.
path_to_train_data = "../data/train_file.xlsx"
df = read_xlsx(path_to_train_data)
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,1,failure,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,0,nonexistent,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,0,nonexistent,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,0,nonexistent,no


#### Data preprocessing

**Removing some *features* from the dataset:**
* **duration**:
  * This feature is highly correlated with the target variable "y", indicating that longer contact times are associated with a higher probability of subscribing to a fixed-term deposit. However, the duration of a contact is only known after the contact has been completed and the customer has made their decision. For predictive inference in production, where predictions need to be made before the contact occurs, including "duration" as a feature is impractical. Therefore, this feature should be excluded from the training data to ensure the model can be effectively used for real-time prediction.

* **day_of_week**:
  * EDA has shown that this feature does not have a significant impact on the customer's decision. Given its minimal impact, including it as a feature would not significantly improve the predictive performance of the model. Removing this feature from the training data helps to simplify the model and focus on more important features.
  
**Dealing with unknown categories:** 
* Categories labeled as "unknown" in features such as "job", "education", "default", "housing", and "loan" will be removed. These categories do not provide significant predictive value and their removal helps in cleaning the dataset.

**Combining basic education categories:**
* To simplify the dataset and improve model performance, all basic education categories ("basic.4y", "basic.6y", "basic.9y") are combined into a single, more general category "education.basic". This reduces the complexity of the education feature and helps the model generalize better by treating all levels of basic education as equivalent.

**Binning age:** 
* Given the wide distribution of ages in the dataset, the age feature will be split into four quantile-based bins. This approach groups ages into four bins of roughly equal size, helping to normalize the distribution and potentially improve model performance by reducing the effect of outliers.

**Feature Encoding**:
* _**Ordinal features**_:
  * **age**: Encoded into quantile-based bins and then transformed using OrdinalEncoder.
  * **education**: Combined into "education.basic" and encoded using OrdinalEncoder.
  * **month**: Encoded using OrdinalEncoder to reflect the natural order of months.

* _**Categorical features**_:
    * **job**: Encoded using OneHotEncoder to create binary columns for each job category.
    * **default**, **loan**, **housing**, and for training data **y**: Encoded using binary mapping (e.g., "yes" = 1, "no" = 0).

In [4]:
# Wrap the loaded frame in the project's DataTransformer.
data_transformer = DataTransformer(df)

# Run the cleaning/transformation step (see the preprocessing notes above).
# NOTE: `df` is rebound to the transformed result here, so re-running this cell
# without first re-running the load cell passes already-transformed data to
# DataTransformer.
df = data_transformer.clean_and_transform()
df

Unnamed: 0,bins_age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
1,young_adult,entrepreneur,married,university.degree,0,0,0,1,nov,2,1,failure,0
2,old_age,retired,married,education.basic,0,0,0,0,jul,1,0,nonexistent,1
3,young_adult,admin.,married,university.degree,0,1,0,1,may,2,0,nonexistent,0
4,middle_aged,retired,divorced,university.degree,0,0,0,0,jun,2,0,nonexistent,0
5,young,admin.,single,university.degree,0,0,0,0,aug,2,0,nonexistent,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32903,middle_aged,technician,married,education.basic,0,0,0,1,may,5,0,nonexistent,0
32905,young,management,single,university.degree,0,0,0,0,nov,5,1,failure,0
32906,young_adult,blue-collar,married,high.school,0,1,0,1,jun,3,0,nonexistent,0
32907,young_adult,technician,married,professional.course,0,1,1,0,aug,1,0,nonexistent,0


### Model selection

In [5]:
# Candidate classifiers with default settings, keyed by the name that the
# evaluation and training helpers receive.
models_list = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    # "SVC": SVC(),
    "KNN": KNeighborsClassifier(),
}

In [6]:
# Separate the target column from the features, then hold out 20% for testing.
y = df["y"]
X = df.drop(columns="y")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible
)

#### Step 1: Transform data and evaluate models with cross-validation

In [None]:
# Evaluate every candidate in models_list on the training split (cross-validated,
# per the step heading) and display the per-model results.
eval_results = evaluate_models(X_train, y_train, data_transformer, models_list)
eval_results

In [None]:
# Pick the model whose mean F1 score ("f1_mean") in eval_results is highest.
best_model_name = max(eval_results, key=lambda model: eval_results[model]["f1_mean"])
print(f"Best performed model: '{best_model_name}'") 

#### Step 2: Train the model with the best score

In [None]:
%%time
# Train the selected model on the training split and keep the resulting pipeline.
model_pipeline = train_best_model(
    models_list,
    best_model_name,
    data_transformer,
    X_train,
    y_train,
)

# TODO: add visualisation

#### Step 3: Persist final model

In [None]:
# Persistence is currently switched off; uncomment the call below to save the trained pipeline.
# save_best_model(model_pipeline)

#### Optional Step: Hyperparameter tuning with Optuna

In [None]:
# Predefined hyperparameter candidates for each model, keyed by model name.
# How each list is interpreted (range bounds vs. discrete choices) is decided by
# the Optimizer that consumes this dict; `random_state` entries are fixed scalars.
models_params = {
    "LogisticRegression": {
        # Inverse of regularization strength: smaller values regularize more.
        "C": [0.01, 0.1],
        "solver": ["liblinear", "newton-cholesky"],
        "random_state": 42,
    },
    "DecisionTreeClassifier": {
        "splitter": ["best", "random"],
        "max_depth": [1, 4],
        "class_weight": [None, "balanced"],
        "random_state": 42,
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 500],
        "max_depth": [1, 4],
        "random_state": 42,
    },
    "GradientBoostingClassifier": {
        "learning_rate": [0.08, 1.0],
        "n_estimators": [100, 500],
        "random_state": 42,
    },
    "SVC": {
        "C": [0.1, 1.5],
        "gamma": ["scale", "auto"],
        "kernel": ["linear", "poly", "rbf"],
        "random_state": 42,
    },
    "KNN": {
        "n_neighbors": [2, 19],
        "weights": ["uniform", "distance"],
        "metric": ["l2", "manhattan", "cosine"],
    },
}

In [None]:
%%time
# Tune hyperparameters with the project's Optimizer (200 trials) on the training split.
# The estimator dict in this notebook is `models_list`; the earlier reference to
# an undefined `models` raised NameError on a fresh kernel.
# NOTE(review): "GradientBoosting" matches no key in models_list / models_params
# (the key there is "GradientBoostingClassifier") — confirm how Optimizer resolves
# the model name and adjust if it indexes those dicts directly.
optimizer = Optimizer("GradientBoosting", models_list, models_params, n_trials=200)
params = optimizer.hyperparameter_tuning(data_transformer, X_train, y_train)