In [1]:
import sys
sys.path.append("../")  # go to parent directory

from utils import read_xlsx_file

from preprocessor import DataTransformer
from modeling import evaluate_models
from sklearn.model_selection import train_test_split

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [2]:
# Load the raw training data from the Excel workbook and preview the first rows.
# NOTE(review): head() with no argument relies on the pandas default of 5 rows —
# assumes read_xlsx_file returns a pandas DataFrame (the rendered output suggests so).
path_to_train_data = "../data/train_file.xlsx"
df = read_xlsx_file(path_to_train_data)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,1,failure,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,0,nonexistent,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,0,nonexistent,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,0,nonexistent,no


In [3]:
# df.loc[:, 'contact']

#### Data preprocessing

**Removing some *features* from the dataset:**
* **duration**: This feature is highly correlated with the dependent variable "y". The data suggest that longer contact times are associated with a higher probability of subscribing to a fixed-term deposit. However, the duration of a contact is only known after the contact has been completed and the customer has made his decision. If we want to use this model for predictive inference in production, where predictions need to be made before the contact takes place, including "duration" as a feature is impractical. Therefore, this feature should be excluded from the training data to ensure that the model can be used effectively for real-time prediction.
* **day_of_week**: EDA has shown that this feature does not have a significant impact on the customer's decision. Given its minimal impact, including it as a feature would not significantly improve the predictive performance of the model. Removing this feature from the training data helps to simplify the model and focus on more important features.
  
**Dealing with unknown categories:** the *"unknown"* categories in features such as "job", "education", "default", "housing", and "loan" will be removed, as they don't provide significant predictive value.

**Combining basic education categories:** to simplify the dataset and improve model performance, all basic education categories ("basic.4y", "basic.6y", "basic.9y") are combined into a single, more general category "education.basic". This will reduce the complexity of the education feature and help the model to generalize better by treating all levels of basic education as equivalent.

**Binning age:** given the wide distribution of ages in the dataset, we will split this category into four quantile-based bins. This approach will group the ages into four equally sized bins, which will help to normalize the distribution and potentially improve the performance of the model by reducing the effect of outliers.

**Feature Encoding**: TODO

In [4]:
# --- Preprocessing configuration -------------------------------------------

# Columns dropped before modelling (rationale is given in the markdown above).
features_to_remove = ["duration", "day_of_week"]

# Binary mapping definitions: yes/no values and the columns they are meant for.
# NOTE(review): how DataTransformer applies these is not visible here — confirm.
binary_mapping = {"yes": 1, "no": 0}
columns_to_map = ["default", "loan", "housing", "y"]

# Hierarchical order for the ordinal features.
# Age intervals noted by the original author:
# (16.919, 33.2] < (33.2, 49.4] < (49.4, 65.6] < (65.6, 81.8] < (81.8, 98.0]
# NOTE(review): the markdown above describes four quantile-based age bins, while
# five labels / five intervals are listed here — confirm against DataTransformer.
age_order = [
    "young",
    "young_adult",
    "middle_aged",
    "late_middle_aged",
    "old_age",
]
education_order = [
    "illiterate",
    "education.basic",
    "high.school",
    "professional.course",
    "university.degree",
]
month_order = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]
poutcome_order = ["nonexistent", "failure", "success"]

# --- Run the preprocessing --------------------------------------------------

# Arguments are passed positionally, in the same order as before.
data_transformer = DataTransformer(
    df,
    features_to_remove,
    age_order,
    education_order,
    month_order,
    poutcome_order,
    binary_mapping,
    columns_to_map,
)

# `df` is rebound to the preprocessed result here, so re-run the data-loading
# cell before re-running this one (otherwise the already processed frame is
# fed back into DataTransformer).
df = data_transformer.make_preprocess()
df

Unnamed: 0,index,bins_age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
0,1,young_adult,entrepreneur,married,university.degree,0,0,0,1,nov,2,1,failure,0
1,2,old_age,retired,married,education.basic,0,0,0,0,jul,1,0,nonexistent,1
2,4,middle_aged,retired,divorced,university.degree,0,0,0,0,jun,2,0,nonexistent,0
3,6,young,student,single,education.basic,0,0,0,1,aug,1,0,nonexistent,1
4,9,young_adult,admin.,single,high.school,0,1,0,0,jul,5,0,nonexistent,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21175,32904,young_adult,technician,single,professional.course,0,1,0,0,aug,1,0,nonexistent,0
21176,32905,young,management,single,university.degree,0,0,0,0,nov,5,1,failure,0
21177,32906,young_adult,blue-collar,married,high.school,0,1,0,1,jun,3,0,nonexistent,0
21178,32907,young_adult,technician,married,professional.course,0,1,1,0,aug,1,0,nonexistent,0


### Model selection

In [13]:
# Split the data
# Separate the target from the features. The preprocessed frame also carries an
# "index" column (visible in the output above) that appears to hold the original
# row numbers; it is a leftover row identifier rather than a customer attribute,
# so it is kept out of the feature matrix. errors="ignore" keeps this cell
# working if that column is ever absent.
# NOTE(review): assumes data_transformer.transformer does not reference "index"
# by name or position — confirm.
X = df.drop(columns="y").drop(columns="index", errors="ignore")
y = df["y"]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Step 1: Transform data and evaluate models with cross-validation

In [6]:
# define the models
# Fixed seed for every estimator that accepts one, so that re-running this cell
# yields the same scores (same value as the seed used for the train/test split).
# NOTE(review): any randomness inside evaluate_models itself (e.g. shuffled CV
# folds) is not controlled from here — confirm in modeling.py.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name under which their results are reported.
models = {
    # liblinear shuffles the data internally, hence the seed
    "LogisticRegression": LogisticRegression(
        solver="liblinear", max_iter=1000, random_state=RANDOM_STATE
    ),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # "SVM": SVC(),
    "KNN": KNeighborsClassifier(),  # takes no random_state parameter
}

# Evaluate every candidate on the training split using the transformer exposed
# by the preprocessing object; the last line displays the results.
eval_results = evaluate_models(X_train, y_train, data_transformer.transformer, models)
eval_results

In [12]:
# Pick the model whose "accuracy_mean" entry in eval_results is the highest.
def _mean_accuracy(model_name):
    """Return the mean accuracy recorded for the given model name."""
    return eval_results[model_name]["accuracy_mean"]


best_model_name = max(eval_results, key=_mean_accuracy)
print(f"Best performed model: '{best_model_name}'")

Best performed model: 'GradientBoosting'


#### Step 2: Train the model with the best score

In [5]:
# NOTE(review): this whole cell is disabled (commented out). It refers to
# `hyperparameter_tuning`, `train_best_model`, `preprocessor` and `pipeline`,
# none of which is imported or defined in the cells visible in this notebook.
# TODO: import/define them before re-enabling this cell, or remove the cell.

# # Optional Step 2: Hyperparameter tuning with Optuna
# perform_hyperparameter_tuning = True
# if perform_hyperparameter_tuning:
#     pipeline = hyperparameter_tuning(preprocessor, X_train, y_train)

# # Step 3: Train the best model
# trained_pipeline = train_best_model(pipeline, X_train, y_train)

# # Save the test data for evaluation script
# X_test.to_csv('X_test.csv', index=False)
# y_test.to_csv('y_test.csv', index=False)
