In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

# TODO:
# import black
# import jupyter_black
# jupyter_black.load(
#     lab=True,
#     line_length=100,
#     verbosity="INFO",
#     target_version=black.TargetVersion.PY310,
# )

In [2]:
def read_train_data(path):
    """Load the training data from an Excel workbook into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the Excel file; handed to ``pd.read_excel`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel`` returns for ``path`` with default options.
    """
    return pd.read_excel(path)


# Path is relative to the notebook's working directory.
path_to_train_data = "../data/train_file.xlsx"
df = read_train_data(path_to_train_data)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,0,nonexistent,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,1,failure,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,0,nonexistent,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,0,nonexistent,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,0,nonexistent,no


In [3]:
# Discard fully duplicated rows; of each duplicate group the last occurrence is kept.
df = df.drop_duplicates(keep="last")

#### Remove some features or their categories

The following *features* will be removed:
* **duration**: This feature is highly correlated with the dependent variable "y". The data suggest that longer contact times are associated with a higher probability of subscribing to a fixed-term deposit. However, the duration of a contact is only known after the contact has been completed and the customer has made his decision. If we want to use this model for predictive inference in production, where predictions need to be made before the contact takes place, including "duration" as a feature is impractical. Therefore, this feature should be excluded from the training data to ensure that the model can be used effectively for real-time prediction.
* **day_of_week**: EDA has shown that this feature does not have a significant impact on the customer's decision. Given its minimal impact, including it as a feature would not significantly improve the predictive performance of the model. Removing this feature from the training data helps to simplify the model and focus on more important features.

In [4]:
# Columns excluded from the training data (see the rationale in the text above).
features_to_remove = ["duration", "day_of_week"]
# drop() returns a new frame, so the loaded `df` is left untouched.
df_adjusted = df.drop(columns=features_to_remove)

**Dealing with unknown categories:** rows containing the *"unknown"* category in any of the features "job", "education", "default", "housing" or "loan" will be removed, as this category doesn't provide significant predictive value.

In [5]:
# Remove every row that carries the "unknown" placeholder in one of these features.
# "loan" is included so the filter matches the list of features given in the text above.
features_with_unknown = ["job", "education", "default", "housing", "loan"]
has_unknown = df_adjusted[features_with_unknown].eq("unknown").any(axis=1)
# .copy() yields an independent frame, so later column assignments on
# df_adjusted do not raise SettingWithCopyWarning.
df_adjusted = df_adjusted.loc[~has_unknown].copy()
df_adjusted.sample(n=3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
19714,48,admin.,married,university.degree,no,no,no,cellular,apr,3,0,nonexistent,no
30658,47,admin.,married,high.school,no,no,yes,telephone,may,1,0,nonexistent,no
25338,36,technician,married,high.school,no,yes,no,telephone,may,1,0,nonexistent,no


**Combining basic education categories:** to simplify the dataset and improve model performance, all basic education categories ("basic.4y", "basic.6y", "basic.9y") are combined into a single, more general category "education.basic". This will reduce the complexity of the education feature and help the model to generalize better by treating all levels of basic education as equivalent.

In [6]:
# Map each of the three basic-education levels onto one shared category.
basic_education_mapping = {
    level: "education.basic" for level in ("basic.4y", "basic.6y", "basic.9y")
}
df_adjusted["education"] = df_adjusted["education"].replace(basic_education_mapping)
df_adjusted.sample(n=3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
2361,27,admin.,single,high.school,no,no,no,cellular,jul,1,0,nonexistent,no
25327,34,admin.,divorced,university.degree,no,no,no,cellular,nov,2,0,nonexistent,yes
22752,50,technician,married,professional.course,no,no,no,telephone,jun,2,0,nonexistent,no


**Binning age:** given the wide distribution of ages in the dataset, we will split this feature into four quantile-based bins. This approach groups the ages into four bins of approximately equal size, which will help to normalize the distribution and potentially improve the performance of the model by reducing the effect of outliers.

In [7]:
# Number of quantile-based age groups; drives both the split and the labels.
bins_nmb = 4
age_labels = [f"age.group_{i + 1}" for i in range(bins_nmb)]
# q is taken from bins_nmb (instead of a second hard-coded 4) so the number of
# bins and the number of labels cannot drift apart; qcut raises if they differ.
bins_age = pd.qcut(df_adjusted["age"], q=bins_nmb, labels=age_labels)
# Bin edges observed on this training data:
# (16.999, 31.0] < (31.0, 37.0] < (37.0, 45.0] < (45.0, 91.0]
df_adjusted.insert(1, "bins_age", bins_age)
# The raw age column is replaced by its binned version.
df_adjusted = df_adjusted.drop(columns="age")

In [8]:
# Sanity check: list the distinct age groups present after binning.
df_adjusted["bins_age"].unique()

['age.group_2', 'age.group_4', 'age.group_1', 'age.group_3']
Categories (4, object): ['age.group_1' < 'age.group_2' < 'age.group_3' < 'age.group_4']

##### Encoding categorical features

In [9]:
# Explicit category orders: each value is encoded as its position in the list.
age_order = ["age.group_1", "age.group_2", "age.group_3", "age.group_4"]
education_order = [
    "illiterate",
    "education.basic",
    "high.school",
    "professional.course",
    "university.degree",
]
month_order = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]
poutcome_order = ["nonexistent", "failure", "success"]

# One encoder per feature, kept under its own name so it stays available after fitting.
age_encoder = OrdinalEncoder(categories=[age_order])
education_encoder = OrdinalEncoder(categories=[education_order])
month_encoder = OrdinalEncoder(categories=[month_order])
poutcome_encoder = OrdinalEncoder(categories=[poutcome_order])

# Fit each encoder on its column and overwrite the column with the encoded values.
for column, encoder in (
    ("bins_age", age_encoder),
    ("education", education_encoder),
    ("month", month_encoder),
    ("poutcome", poutcome_encoder),
):
    df_adjusted[column] = encoder.fit_transform(df_adjusted[[column]])

In [10]:
# Encode the other categorical features and the target "y" as integer codes.
# A separate LabelEncoder is kept per column so that every mapping stays
# recoverable (e.g. through inverse_transform); a single shared encoder only
# remembers the classes of the column it was fitted on last.
# NOTE(review): "contact" is not in this list and remains a string column —
# confirm it is encoded elsewhere.
label_encoded_columns = ["job", "marital", "housing", "loan", "default", "y"]
label_encoders = {}
for column in label_encoded_columns:
    label_encoders[column] = LabelEncoder()
    df_adjusted[column] = label_encoders[column].fit_transform(df_adjusted[column])

# Backward-compatible name: as before, `label_encoder` ends up fitted on "y".
label_encoder = label_encoders["y"]

In [11]:
# Display the encoded frame (the rendered output below is truncated in the middle).
df_adjusted

Unnamed: 0,bins_age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
1,1.0,2,1,4.0,0,0,0,telephone,10.0,2,1,1.0,0
2,3.0,5,1,1.0,0,0,0,cellular,6.0,1,0,0.0,1
3,1.0,0,1,4.0,0,1,0,telephone,4.0,2,0,0.0,0
4,3.0,5,0,4.0,0,0,0,cellular,5.0,2,0,0.0,0
5,0.0,0,2,4.0,0,0,0,cellular,7.0,2,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32904,1.0,9,2,3.0,0,1,0,cellular,7.0,1,0,0.0,0
32905,0.0,4,2,4.0,0,0,0,cellular,10.0,5,1,1.0,0
32906,1.0,1,1,2.0,0,1,0,telephone,5.0,3,0,0.0,0
32907,1.0,9,1,3.0,0,1,1,cellular,7.0,1,0,0.0,0


In [12]:
# Spot-check ten randomly drawn rows of the encoded frame.
df_adjusted.sample(10)

Unnamed: 0,bins_age,job,marital,education,default,housing,loan,contact,month,campaign,previous,poutcome,y
9058,1.0,0,2,2.0,0,1,0,telephone,5.0,3,0,0.0,0
12305,3.0,1,1,1.0,0,1,1,cellular,7.0,3,0,0.0,0
599,0.0,9,1,3.0,0,0,1,cellular,6.0,1,0,0.0,1
21791,1.0,4,1,1.0,0,0,1,cellular,4.0,2,0,0.0,0
25439,0.0,1,1,1.0,0,1,0,telephone,4.0,2,0,0.0,0
31521,3.0,5,0,4.0,0,1,0,cellular,4.0,5,1,1.0,0
24764,1.0,0,1,4.0,0,0,0,cellular,4.0,4,0,0.0,1
19544,0.0,1,2,1.0,0,1,0,cellular,4.0,1,1,1.0,0
10969,1.0,0,1,4.0,0,0,0,telephone,5.0,2,0,0.0,0
20129,2.0,0,1,4.0,0,0,0,cellular,3.0,1,0,0.0,0
