Statistical & Machine Learning: 
Individual Assignment 2024

PERAN Mathieu

## Import libraries and csv

In [58]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, KFold, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [59]:
df = pd.read_csv("bank_mkt_train.csv")
df.head()

Unnamed: 0,client_id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribe
0,29925,42,management,married,basic.9y,no,no,no,cellular,jul,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,0
1,37529,35,unemployed,married,university.degree,no,yes,no,telephone,jun,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.96,5228.1,0
2,2757,44,technician,married,basic.9y,no,yes,yes,cellular,may,...,1,999,0,nonexistent,-1.8,92.893,-46.2,1.264,5099.1,0
3,9642,45,services,married,high.school,no,yes,no,cellular,apr,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.453,5099.1,0
4,14183,45,unknown,married,unknown,unknown,unknown,unknown,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,0


## Data Preprocessing

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   client_id       20000 non-null  int64  
 1   age             20000 non-null  int64  
 2   job             20000 non-null  object 
 3   marital         20000 non-null  object 
 4   education       20000 non-null  object 
 5   default         20000 non-null  object 
 6   housing         20000 non-null  object 
 7   loan            20000 non-null  object 
 8   contact         20000 non-null  object 
 9   month           20000 non-null  object 
 10  day_of_week     20000 non-null  object 
 11  campaign        20000 non-null  int64  
 12  pdays           20000 non-null  int64  
 13  previous        20000 non-null  int64  
 14  poutcome        20000 non-null  object 
 15  emp.var.rate    20000 non-null  float64
 16  cons.price.idx  20000 non-null  float64
 17  cons.conf.idx   20000 non-null 

In [61]:
# Separate features and target
X = df.drop('subscribe', axis=1)
y = df['subscribe']



In [62]:
# The dataset is imbalanced (only 11% of positive class in the DV)
y.value_counts(normalize=True)

subscribe
0    0.88645
1    0.11355
Name: proportion, dtype: float64

In [63]:
# Select categorical and numerical columns excluding 'client_id'
categorical_cols = [col for col in X.select_dtypes(include=['object', 'category']).columns if col != 'client_id']
numerical_cols = [col for col in X.select_dtypes(exclude=['object', 'category']).columns if col != 'client_id']


In [64]:

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('selectkbest', SelectKBest(f_classif, k='all'))  # keep all as we have low number of numeric features
])

# Preprocessing for categorical data with feature selection
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Define the cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the model pipeline, including SMOTE
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    # Model placeholder
])

# Define the models to be evaluated
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "GBM": GradientBoostingClassifier(random_state=42),
    "Gaussian Naive Bayes": GaussianNB()
}




In [65]:
# Assuming X_train includes the 'client_id' column

# Fit the preprocessor on the training data excluding 'client_id'
X_train_preprocessed = preprocessor.fit_transform(X_train.drop('client_id', axis=1), y_train)

# Extract feature names from the numerical transformer
num_features_indices = preprocessor.named_transformers_['num']['selectkbest'].get_support(indices=True)
num_feature_names = [numerical_cols[i] for i in num_features_indices]

# Attempt to extract feature names for the categorical features after feature selection
cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out()
selected_cat_features_indices = preprocessor.named_transformers_['cat']['feature_selection'].get_support(indices=True)
selected_cat_feature_names = [cat_feature_names[i] for i in selected_cat_features_indices]

# Combine all feature names
all_feature_names = num_feature_names + selected_cat_feature_names

# Create a DataFrame with the correct feature names and include client_id from the original data
basetable = pd.DataFrame(X_train_preprocessed, columns=all_feature_names)
basetable['client_id'] = X_train['client_id'].reset_index(drop=True)
basetable['target'] = y_train.reset_index(drop=True)

# Set 'client_id' as the index of the DataFrame
basetable.set_index('client_id', inplace=True)

# Export the basetable to a CSV file
basetable.to_csv('final_basetable.csv')


In [66]:
X_train

Unnamed: 0,client_id,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
11717,21641,36,admin.,single,high.school,no,no,yes,telephone,may,fri,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
14410,35922,26,admin.,single,high.school,no,no,no,telephone,jun,wed,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1
15360,2950,35,technician,married,university.degree,no,no,no,cellular,jul,fri,1,999,0,nonexistent,1.4,93.918,-42.7,4.957,5228.1
12887,12347,24,admin.,single,high.school,no,no,no,cellular,jul,mon,1,14,1,success,-1.7,94.215,-40.3,0.827,4991.6
7591,22893,49,admin.,married,high.school,no,yes,no,cellular,nov,fri,2,999,0,nonexistent,-0.1,93.200,-42.0,4.021,5195.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963,38535,34,admin.,married,high.school,no,no,no,cellular,jun,mon,1,999,0,nonexistent,-2.9,92.963,-40.8,1.260,5076.2
17117,19167,30,services,married,high.school,no,no,no,telephone,jun,mon,2,999,0,nonexistent,1.4,94.465,-41.8,4.865,5228.1
1503,6236,36,technician,married,university.degree,no,yes,no,cellular,aug,tue,4,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1
6932,34630,24,student,single,unknown,no,yes,yes,cellular,jun,tue,1,999,1,failure,-2.9,92.963,-40.8,1.206,5076.2


In [67]:
basetable.head()

Unnamed: 0_level_0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,x0_admin.,...,x7_oct,x8_fri,x8_mon,x8_thu,x8_tue,x8_wed,x9_failure,x9_nonexistent,x9_success,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21641,-0.390559,-0.206035,0.198407,-0.345999,0.637031,0.715613,0.883734,0.701862,0.320708,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
35922,-1.350031,-0.562961,0.198407,-0.345999,0.828346,1.531383,-0.290523,0.762702,0.836134,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
2950,-0.486506,-0.562961,0.198407,-0.345999,0.828346,0.583981,-0.486232,0.759805,0.836134,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
12347,-1.541925,-0.562961,-4.997533,1.69304,-1.148576,1.098384,0.03566,-1.633241,-2.449532,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
22893,0.856754,-0.206035,0.198407,-0.345999,-0.128229,-0.659592,-0.334014,0.217458,0.387394,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [68]:

# Placeholder for results
results = {}

# Function to safely get probabilistic predictions
def safe_predict_proba(pipeline, X):
    if 'predict_proba' in dir(pipeline[-1]):
        return pipeline.predict_proba(X)[:, 1]
    return None  # For models that do not support probability estimates

# Define a base pipeline setup without the final model
base_pipeline_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42))
]

# Evaluate each model using cross-validation and collect their scores
for name, model in models.items():
    # Create a full pipeline for the current model
    full_pipeline = ImbPipeline(base_pipeline_steps + [('model', model)])

    # Now the pipeline steps are correctly set with unique names
    scores = cross_val_score(full_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_val)
    y_proba = safe_predict_proba(full_pipeline, X_val)

    # Store the performance metrics
    results[name] = {
        'CV Accuracy': np.mean(scores),
        'Validation Accuracy': accuracy_score(y_val, y_pred),
        'Validation AUC': roc_auc_score(y_val, y_proba) if y_proba is not None else 'N/A',
        'Validation Precision': precision_score(y_val, y_pred, zero_division=0),
        'Validation Recall': recall_score(y_val, y_pred, zero_division=0),
        'Validation F1': f1_score(y_val, y_pred, zero_division=0)
    }

# Printing the results
for model, metrics in results.items():
    print(f"{model}:")
    for metric, value in metrics.items():
        print(f" - {metric}: {value:.4f}")
    print("\n")

Logistic Regression:
 - CV Accuracy: 0.7464
 - Validation Accuracy: 0.8363
 - Validation AUC: 0.7528
 - Validation Precision: 0.3586
 - Validation Recall: 0.5617
 - Validation F1: 0.4378


Decision Tree:
 - CV Accuracy: 0.8357
 - Validation Accuracy: 0.8300
 - Validation AUC: 0.6266
 - Validation Precision: 0.2968
 - Validation Recall: 0.3634
 - Validation F1: 0.3267


Random Forest:
 - CV Accuracy: 0.8944
 - Validation Accuracy: 0.8935
 - Validation AUC: 0.7636
 - Validation Precision: 0.5598
 - Validation Recall: 0.2885
 - Validation F1: 0.3808


GBM:
 - CV Accuracy: 0.8988
 - Validation Accuracy: 0.8950
 - Validation AUC: 0.7870
 - Validation Precision: 0.5766
 - Validation Recall: 0.2819
 - Validation F1: 0.3787


Gaussian Naive Bayes:
 - CV Accuracy: 0.7351
 - Validation Accuracy: 0.7302
 - Validation AUC: 0.7606
 - Validation Precision: 0.2490
 - Validation Recall: 0.6828
 - Validation F1: 0.3649




In [69]:
metrics_df = pd.DataFrame.from_dict(results)

In [70]:
metrics_df

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,GBM,Gaussian Naive Bayes
CV Accuracy,0.746375,0.835687,0.894375,0.898813,0.735125
Validation Accuracy,0.83625,0.83,0.8935,0.895,0.73025
Validation AUC,0.752821,0.626586,0.763632,0.786968,0.76059
Validation Precision,0.35865,0.296763,0.559829,0.576577,0.248996
Validation Recall,0.561674,0.363436,0.288546,0.281938,0.682819
Validation F1,0.437768,0.326733,0.380814,0.378698,0.364921
