Statistical & Machine Learning: 
Individual Assignment 2024

PERAN Mathieu

## Import libraries and csv

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
import numpy as np

In [2]:
df = pd.read_csv("bank_mkt_train.csv")
df.head()

Unnamed: 0,client_id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribe
0,29925,42,management,married,basic.9y,no,no,no,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,0
1,37529,35,unemployed,married,university.degree,no,yes,no,telephone,jun,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.96,5228.1,0
2,2757,44,technician,married,basic.9y,no,yes,yes,cellular,may,...,1,999,0,nonexistent,-1.8,92.893,-46.2,1.264,5099.1,0
3,9642,45,services,married,high.school,no,yes,no,cellular,apr,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.453,5099.1,0
4,14183,45,unknown,married,unknown,unknown,unknown,unknown,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,0


## Data Preprocessing

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   client_id       20000 non-null  int64  
 1   age             20000 non-null  int64  
 2   job             20000 non-null  object 
 3   marital         20000 non-null  object 
 4   education       20000 non-null  object 
 5   default         20000 non-null  object 
 6   housing         20000 non-null  object 
 7   loan            20000 non-null  object 
 8   contact         20000 non-null  object 
 9   month           20000 non-null  object 
 10  day_of_week     20000 non-null  object 
 11  campaign        20000 non-null  int64  
 12  pdays           20000 non-null  int64  
 13  previous        20000 non-null  int64  
 14  poutcome        20000 non-null  object 
 15  emp.var.rate    20000 non-null  float64
 16  cons.price.idx  20000 non-null  float64
 17  cons.conf.idx   20000 non-null 

In [4]:
# Separate features and target
X = df.drop('subscribe', axis=1)
y = df['subscribe']



In [13]:
# The dataset is imbalanced 
y.value_counts()

subscribe
0    17729
1     2271
Name: count, dtype: int64

In [5]:
# Identifying categorical columns (assuming 'object' and 'category' dtypes for simplicity)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Define the encoder for categorical features, with drop_first functionality
enc = OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)

# Fit and transform the categorical data
X_encoded_categorical = enc.fit_transform(X[categorical_cols])

# Create a DataFrame from the encoded attributes
encoded_cols = enc.get_feature_names_out(categorical_cols)
X_encoded_categorical_df = pd.DataFrame(X_encoded_categorical, columns=encoded_cols, index=X.index)

# Drop original categorical columns and concatenate the encoded DataFrame
X_dropped = X.drop(columns=categorical_cols)
X_encoded = pd.concat([X_dropped, X_encoded_categorical_df], axis=1)



In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [7]:
X_encoded

Unnamed: 0,client_id,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,29925,42,1,999,0,1.4,93.918,-42.7,4.968,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,37529,35,4,999,0,1.4,94.465,-41.8,4.960,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2757,44,1,999,0,-1.8,92.893,-46.2,1.264,5099.1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,9642,45,1,999,0,-1.8,93.075,-47.1,1.453,5099.1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,14183,45,1,999,0,1.1,93.994,-36.4,4.859,5191.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,13724,45,1,999,0,1.4,93.918,-42.7,4.957,5228.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
19996,29863,26,2,999,1,-1.8,92.893,-46.2,1.299,5099.1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19997,9841,33,1,999,1,-1.8,92.893,-46.2,1.299,5099.1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19998,36432,43,10,999,0,1.4,93.918,-42.7,4.960,5228.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [8]:
X_encoded.describe()

Unnamed: 0,client_id,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,...,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,20683.03165,40.052,2.5804,961.1644,0.1714,0.0881,93.577232,-40.472955,3.631806,5167.37764,...,0.3335,0.1009,0.0173,0.01455,0.2097,0.21255,0.1941,0.19735,0.86515,0.0348
std,11898.274235,10.412877,2.79065,190.115383,0.493897,1.573281,0.579869,4.621674,1.731216,72.226178,...,0.471475,0.301204,0.13039,0.119746,0.407105,0.409122,0.395516,0.398009,0.341572,0.183277
min,2.0,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10311.75,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,20761.5,38.0,2.0,999.0,0.0,1.1,93.798,-41.8,4.857,5191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,30993.25,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,41188.0,98.0,56.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Set up baseline for each model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [9]:


# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "GBM": GradientBoostingClassifier(random_state=42),
    "Gaussian Naive Bayes": GaussianNB()
}



In [11]:
# Dictionary to store the AUC scores
model_auc_scores = {}

for name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Predict probabilities for the test data
    y_probs = model.predict_proba(X_test)[:, 1]  # get the probabilities for the positive class
    
    # Calculate the AUC score
    auc = roc_auc_score(y_test, y_probs)
    model_auc_scores[name] = auc

print("Model AUC scores:")
for name, auc in model_auc_scores.items():
    print(f"{name}: {auc}")

Model AUC scores:
Logistic Regression: 0.7514308850214839
Decision Tree: 0.5995273084553258
Random Forest: 0.7431382158538534
GBM: 0.7721890014009158
Gaussian Naive Bayes: 0.7563996112223695


### Define a pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# Placeholder for the classifier
classifier_placeholder = 'classifier'

# Define the pipeline
pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('feature_selection', SelectKBest(f_classif)),
    (classifier_placeholder, RandomForestClassifier(random_state=42))  # Example with RandomForest
])


### 1. Logistic Regression

In [None]:
# Define the pipeline
pipeline_lr = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('feature_selection', SelectKBest(f_classif)),
    ('classifier', LogisticRegression(random_state=42))
])

# Parameter grid
param_grid_lr = {
    'feature_selection__k': [10, 20],
    'classifier__C': [0.01, 0.1, 1, 10]
}

# Setup and run the grid search
grid_search_lr = GridSearchCV(pipeline_lr, param_grid=param_grid_lr, cv=StratifiedKFold(5),
                              scoring='roc_auc', n_jobs=-1, verbose=1)


### 2. Decision Tree

In [None]:
# Adjust the pipeline classifier and parameter grid for Decision Tree
pipeline_dt = pipeline_lr.set_params(classifier=DecisionTreeClassifier(random_state=42))
param_grid_dt = {
    'feature_selection__k': [10, 20],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search_dt = GridSearchCV(pipeline_dt, param_grid=param_grid_dt, cv=StratifiedKFold(5),
                              scoring='roc_auc', n_jobs=-1, verbose=1)


### 3. Random Forest

In [None]:
param_grid = {
    'feature_selection__k': [10, 20],  # Number of features to select
    'classifier__n_estimators': [100, 200],  # Number of trees in RandomForest
    'classifier__max_depth': [None, 10],  # Max depth of trees
}

# Setup GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=StratifiedKFold(5),
                           scoring='roc_auc', n_jobs=-1, verbose=1)

### 4. Gradient Bossting Machine (GBM)

In [None]:
# Adjust for GBM
pipeline_gbm = pipeline_lr.set_params(classifier=GradientBoostingClassifier(random_state=42))
param_grid_gbm = {
    'feature_selection__k': [10, 20],
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7]
}

grid_search_gbm = GridSearchCV(pipeline_gbm, param_grid=param_grid_gbm, cv=StratifiedKFold(5),
                               scoring='roc_auc', n_jobs=-1, verbose=1)


### 5. Gaussian Naive Bayes

In [None]:
# Adjust for Gaussian Naive Bayes (note: Naive Bayes may not have many hyperparameters to tune)
pipeline_gnb = pipeline_lr.set_params(classifier=GaussianNB())
param_grid_gnb = {
    'feature_selection__k': [10, 20]
}

grid_search_gnb = GridSearchCV(pipeline_gnb, param_grid=param_grid_gnb, cv=StratifiedKFold(5),
                               scoring='roc_auc', n_jobs=-1, verbose=1)
