# Getting the Data

In [1458]:
import pandas as pd
from pathlib import Path

file_name = "bank-additional-full.csv"
bank = pd.read_csv(Path(file_name), sep=";")

# Required Information

Bank client data:
- Age (numeric)
- Job : type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')
- Marital : marital status (categorical: 'divorced', 'married', 'single', 'unknown' ; note: 'divorced' means divorced or widowed)
- Education (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown')
- Default: has credit in default? (categorical: 'no', 'yes', 'unknown')
- Housing: has housing loan? (categorical: 'no', 'yes', 'unknown')
- Loan: has personal loan? (categorical: 'no', 'yes', 'unknown')

Related with the last contact of the current campaign:
- Contact: contact communication type (categorical: 'cellular','telephone')
- Month: last contact month of year (categorical: 'jan', 'feb', 'mar',…, 'nov', 'dec')
- Day_of_week: last contact day of the week (categorical:'mon','tue','wed','thu','fri')
- Duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

Other attributes:
- Campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- Pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
- Previous: number of contacts performed before this campaign and for this client (numeric)
- Poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

Social and economic context attributes
- Emp.var.rate: employment variation rate - quarterly indicator (numeric)
- Cons.price.idx: consumer price index - monthly indicator (numeric)
- Cons.conf.idx: consumer confidence index - monthly indicator (numeric)
- Euribor3m: euribor 3 month rate - daily indicator (numeric)
- Nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target): 
- y - has the client subscribed a term deposit? (binary: 'yes', 'no')

Analysis Steps:
- Atribute information Analysis.
- Machine Learning (Logistic Regression, KNN, SVM, Decision Tree, Random Forest, Naive Bayes)
- Deep Learning (ANN)


In [1459]:
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [1460]:
bank.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [1461]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [1462]:
bank["y"].value_counts() # More no's than yes'

y
no     36548
yes     4640
Name: count, dtype: int64

# Splitting Data and Preprocessing Pipeline

In [1463]:
from sklearn.model_selection import train_test_split

X = bank.drop(['y', 'duration'], axis=1)
y = bank["y"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [1464]:
X_train.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,40.01742,2.572959,962.745341,0.172989,0.083105,93.576835,-40.514574,3.622698,5167.094049
std,10.435842,2.778138,186.248245,0.492992,1.570011,0.579157,4.622683,1.733735,72.354625
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [1465]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [1466]:
def num_cat(X):
    if X.ndim == 1:
        X = pd.cut(
            X,
            bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, np.inf],
            labels=False,
            include_lowest=True
        ).values.reshape(-1, 1) 
    return X

def num_cat_name(function_transformer, feature_names_in):
    return ["cat"]

def num_cat_pipeline():
    return Pipeline([
        ('functiontransformer', FunctionTransformer(num_cat, feature_names_out=num_cat_name, validate=True)),
        ('ordinal', OrdinalEncoder()),
        ('simple', SimpleImputer(strategy="most_frequent")),
        ('onehot', OneHotEncoder(handle_unknown="ignore")),
    ])


In [1467]:
cat_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('simple', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown="ignore")),
])

num_pipeline = Pipeline([
    ('simple', SimpleImputer(strategy="median")),
    ('standard', StandardScaler()),
])

cat_columns = list(X.select_dtypes(include="object").columns)
num_columns = list(X.select_dtypes(include=["int64", "float64"]).columns)
num_columns_excluding_age = [col for col in num_columns if col != "age"]
num_cat_columns = ["age"]

In [1468]:

preprocessing = ColumnTransformer([
        ('cat', cat_pipeline, cat_columns),
        ('num', num_pipeline, num_columns_excluding_age),
        ('age', num_cat_pipeline(), num_cat_columns),
    ],
    remainder="passthrough"
)

In [1469]:
X_prepared = preprocessing.fit_transform(X_train)

In [1470]:
X_prepared.shape

(32950, 138)

In [1471]:
preprocessing.get_feature_names_out()

array(['cat__job_0.0', 'cat__job_1.0', 'cat__job_2.0', 'cat__job_3.0',
       'cat__job_4.0', 'cat__job_5.0', 'cat__job_6.0', 'cat__job_7.0',
       'cat__job_8.0', 'cat__job_9.0', 'cat__job_10.0', 'cat__job_11.0',
       'cat__marital_0.0', 'cat__marital_1.0', 'cat__marital_2.0',
       'cat__marital_3.0', 'cat__education_0.0', 'cat__education_1.0',
       'cat__education_2.0', 'cat__education_3.0', 'cat__education_4.0',
       'cat__education_5.0', 'cat__education_6.0', 'cat__education_7.0',
       'cat__default_0.0', 'cat__default_1.0', 'cat__default_2.0',
       'cat__housing_0.0', 'cat__housing_1.0', 'cat__housing_2.0',
       'cat__loan_0.0', 'cat__loan_1.0', 'cat__loan_2.0',
       'cat__contact_0.0', 'cat__contact_1.0', 'cat__month_0.0',
       'cat__month_1.0', 'cat__month_2.0', 'cat__month_3.0',
       'cat__month_4.0', 'cat__month_5.0', 'cat__month_6.0',
       'cat__month_7.0', 'cat__month_8.0', 'cat__month_9.0',
       'cat__day_of_week_0.0', 'cat__day_of_week_1.0',
      