In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(
    'bank-full.csv',
    sep=';',
    quotechar='"',
    encoding='utf-8',
    engine='python'
)
print("--- columns ---")
print(df.columns)

print("--- Head of Dataset ---")
display(df.head())

print("\n--- Info of Dataset ---")
print(df.info())

print("\n--- Describe of Dataset ---")
print(df.describe())

--- columns ---
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')
--- Head of Dataset ---


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no



--- Info of Dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None

--- Describe of Dataset ---
                age        balance      

In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Copy dataset (safe practice)
df_clean = df.copy()

# Remove duplicate rows
print("Duplicate rows:", df_clean.duplicated().sum())
df_clean.drop_duplicates(inplace=True)

# Check missing values
print("Missing values per column:")
print(df_clean.isnull().sum())

# Handle outliers (SAFE strategy)
# 1. IQR-based removal ONLY for safe variables
safe_outlier_cols = ['age', 'day']

for col in safe_outlier_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

# 2. Winsorization (clipping) for financial / behavioral variables
winsor_cols = ['balance', 'duration', 'campaign', 'previous']

for col in winsor_cols:
    lower = df_clean[col].quantile(0.01)
    upper = df_clean[col].quantile(0.99)
    df_clean[col] = df_clean[col].clip(lower, upper)

print("Shape after outlier handling:", df_clean.shape)

# Handle special value pdays = -1
df_clean['pdays_contacted'] = df_clean['pdays'].apply(
    lambda x: 0 if x == -1 else 1
)
df_clean.drop(columns=['pdays'], inplace=True)

# Encode target variable
df_clean['y'] = df_clean['y'].map({'no': 0, 'yes': 1})

# One-Hot Encoding for categorical features
categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()

df_encoded = pd.get_dummies(
    df_clean,
    columns=categorical_cols,
    drop_first=True
)

# Convert boolean columns to int
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

# Feature Scaling (Standardization)
scaler = StandardScaler()
num_features = [
    'age', 'balance', 'day',
    'duration', 'campaign', 'previous'
]

df_encoded[num_features] = scaler.fit_transform(
    df_encoded[num_features]
)

# Final check
print("Final shape:", df_encoded.shape)
display(df_encoded.head())


Duplicate rows: 0
Missing values per column:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
Shape after outlier handling: (44724, 17)
Final shape: (44724, 43)


Unnamed: 0,age,balance,day,duration,campaign,previous,y,pdays_contacted,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,1.749275,0.392786,-1.298868,0.031546,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0.346205,-0.552943,-1.298868,-0.437198,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,-0.756207,-0.565022,-1.298868,-0.756795,-0.645438,-0.363878,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,0.646863,0.107814,-1.298868,-0.688615,-0.645438,-0.363878,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,-0.756207,-0.56547,-1.298868,-0.236916,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [5]:
from sklearn.model_selection import train_test_split


# Split features and target
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']

# Train / Test Split (80 / 20)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (35779, 42)
Test shape: (8945, 42)


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Random Forest model
rf = RandomForestClassifier(
    random_state=17,
    n_jobs=-1
)

# Hyperparameter Grid
param_grid = {
    'n_estimators': [50, 75, 100, 150],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2']
}

# Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

# Best Model
best_rf = grid_search.best_estimator_

print("Best Hyperparameters:")
print(grid_search.best_params_)

# Evaluation on Test Set
y_pred = best_rf.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save Best Model
joblib.dump(best_rf, "best_random_forest_model.pkl")

print("\nBest Random Forest model saved successfully.")


Train shape: (35779, 42)
Test shape: (8945, 42)
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
Best Hyperparameters:
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 100}

Test Accuracy: 0.9104527669088877

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      7931
           1       0.68      0.40      0.50      1014

    accuracy                           0.91      8945
   macro avg       0.80      0.69      0.73      8945
weighted avg       0.90      0.91      0.90      8945


Confusion Matrix:
[[7741  190]
 [ 611  403]]

Best Random Forest model saved successfully.


In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# SVM model
svm_model = SVC(
    probability=True,   # needed for ROC / probability analysis later
    random_state=17
)

# Hyperparameter Grid
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto']
}

# Grid Search
grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_search_svm.fit(X_train, y_train)

# Best Model
best_svm = grid_search_svm.best_estimator_

print("Best Hyperparameters (SVM):")
print(grid_search_svm.best_params_)

# Evaluation on Test Set
y_pred_svm = best_svm.predict(X_test)

print("\nTest Accuracy (SVM):", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report (SVM):")
print(classification_report(y_test, y_pred_svm))

print("\nConfusion Matrix (SVM):")
print(confusion_matrix(y_test, y_pred_svm))

# Save Best Model
joblib.dump(best_svm, "best_svm_model.pkl")

print("\nBest SVM model saved successfully.")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Hyperparameters (SVM):
{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

Test Accuracy (SVM): 0.9064281721632197

Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      7931
           1       0.66      0.35      0.46      1014

    accuracy                           0.91      8945
   macro avg       0.79      0.67      0.70      8945
weighted avg       0.89      0.91      0.89      8945


Confusion Matrix (SVM):
[[7750  181]
 [ 656  358]]

Best SVM model saved successfully.


In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

mlp_1layer = MLPClassifier(
    max_iter=200,
    random_state=17
)

# گرید سرچ
param_grid_1layer = {
    'hidden_layer_sizes': [(32,), (64,), (128,)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

grid_mlp_1layer = GridSearchCV(
    mlp_1layer,
    param_grid_1layer,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_mlp_1layer.fit(X_train, y_train)

best_mlp_1layer = grid_mlp_1layer.best_estimator_

print("Best Hyperparameters (MLP - 1 Layer):")
print(grid_mlp_1layer.best_params_)

y_pred = best_mlp_1layer.predict(X_test)

print("\nTest Accuracy (MLP - 1 Layer):", accuracy_score(y_test, y_pred))
print("\nClassification Report (MLP - 1 Layer):")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix (MLP - 1 Layer):")
print(confusion_matrix(y_test, y_pred))


joblib.dump(best_mlp_1layer, "best_mlp_1layer.pkl")
print("\nBest MLP (1 Layer) model saved successfully.")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Hyperparameters (MLP - 1 Layer):
{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (32,), 'learning_rate_init': 0.001}

Test Accuracy (MLP - 1 Layer): 0.9126886528787032

Classification Report (MLP - 1 Layer):
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7931
           1       0.66      0.46      0.55      1014

    accuracy                           0.91      8945
   macro avg       0.80      0.72      0.75      8945
weighted avg       0.90      0.91      0.91      8945


Confusion Matrix (MLP - 1 Layer):
[[7693  238]
 [ 543  471]]

Best MLP (1 Layer) model saved successfully.




In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

mlp_2layer = MLPClassifier(
    max_iter=300,
    random_state=17
)

param_grid_2layer = {
    'hidden_layer_sizes': [(64, 32), (128, 64), (128, 32)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

grid_mlp_2layer = GridSearchCV(
    mlp_2layer,
    param_grid_2layer,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

grid_mlp_2layer.fit(X_train, y_train)

best_mlp_2layer = grid_mlp_2layer.best_estimator_

print("Best Hyperparameters (MLP - 2 Layers):")
print(grid_mlp_2layer.best_params_)

y_pred = best_mlp_2layer.predict(X_test)

print("\nTest Accuracy (MLP - 2 Layers):", accuracy_score(y_test, y_pred))
print("\nClassification Report (MLP - 2 Layers):")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix (MLP - 2 Layers):")
print(confusion_matrix(y_test, y_pred))

joblib.dump(best_mlp_2layer, "best_mlp_2layer.pkl")
print("\nBest MLP (2 Layers) model saved successfully.")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Hyperparameters (MLP - 2 Layers):
{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (128, 32), 'learning_rate_init': 0.01}

Test Accuracy (MLP - 2 Layers): 0.8990497484628284

Classification Report (MLP - 2 Layers):
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      7931
           1       0.55      0.57      0.56      1014

    accuracy                           0.90      8945
   macro avg       0.75      0.76      0.75      8945
weighted avg       0.90      0.90      0.90      8945


Confusion Matrix (MLP - 2 Layers):
[[7464  467]
 [ 436  578]]

Best MLP (2 Layers) model saved successfully.


In [15]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# reshape data for CNN
X_train_cnn = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn  = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

model_cnn_1 = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(42, 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_cnn_1.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

history_1 = model_cnn_1.fit(
    X_train_cnn, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# evaluation
y_pred_prob = model_cnn_1.predict(X_test_cnn)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Test Accuracy (CNN - 1 Conv Layer):", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# save model
model_cnn_1.save("cnn_1layer_100v1.h5")
print("\nCNN (1 layer) model saved successfully.")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8874 - loss: 0.2816 - val_accuracy: 0.8985 - val_loss: 0.2244
Epoch 2/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9055 - loss: 0.2224 - val_accuracy: 0.8994 - val_loss: 0.2196
Epoch 3/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9044 - loss: 0.2166 - val_accuracy: 0.9011 - val_loss: 0.2155
Epoch 4/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9062 - loss: 0.2132 - val_accuracy: 0.9002 - val_loss: 0.2126
Epoch 5/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9073 - loss: 0.2135 - val_accuracy: 0.9027 - val_loss: 0.2114
Epoch 6/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9095 - loss: 0.2036 - val_accuracy: 0.9039 - val_loss: 0.2085
Epoch 7/100
[1m504/504[0m [32m━



              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7931
           1       0.66      0.43      0.52      1014

    accuracy                           0.91      8945
   macro avg       0.80      0.70      0.74      8945
weighted avg       0.90      0.91      0.90      8945


Confusion Matrix:
[[7708  223]
 [ 577  437]]

CNN (1 layer) model saved successfully.


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model_cnn_2 = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(42, 1)),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_cnn_2.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

history_2 = model_cnn_2.fit(
    X_train_cnn, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# evaluation
y_pred_prob = model_cnn_2.predict(X_test_cnn)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Test Accuracy (CNN - 2 Conv Layers):", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# save model
model_cnn_2.save("cnn_2layer_100v1.h5")
print("\nCNN (2 layers) model saved successfully.")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8832 - loss: 0.2922 - val_accuracy: 0.8971 - val_loss: 0.2267
Epoch 2/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9036 - loss: 0.2213 - val_accuracy: 0.9002 - val_loss: 0.2184
Epoch 3/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9039 - loss: 0.2131 - val_accuracy: 0.9025 - val_loss: 0.2137
Epoch 4/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9047 - loss: 0.2153 - val_accuracy: 0.9016 - val_loss: 0.2118
Epoch 5/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9057 - loss: 0.2156 - val_accuracy: 0.9016 - val_loss: 0.2099
Epoch 6/100
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9086 - loss: 0.2096 - val_accuracy: 0.8999 - val_loss: 0.2083
Epoch 7/100
[1m504/504[0m [32m━



              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7931
           1       0.63      0.44      0.52      1014

    accuracy                           0.91      8945
   macro avg       0.78      0.70      0.73      8945
weighted avg       0.90      0.91      0.90      8945


Confusion Matrix:
[[7668  263]
 [ 566  448]]

CNN (2 layers) model saved successfully.


In [17]:
# Save Test Set with Labels to CSV

# Combine X_test and y_test
test_with_labels = X_test.copy()
test_with_labels['y'] = y_test.values

# Reset index for clean CSV
test_with_labels.reset_index(drop=True, inplace=True)

# Save to CSV
test_with_labels.to_csv(
    'bank_marketing_test_set.csv',
    index=False,
    encoding='utf-8'
)

print("✅ Test dataset with labels saved successfully.")
print("Shape:", test_with_labels.shape)

display(test_with_labels.head())


✅ Test dataset with labels saved successfully.
Shape: (8945, 43)


Unnamed: 0,age,balance,day,duration,campaign,previous,pdays_contacted,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,y
0,2.45081,0.406207,-0.097627,1.233235,0.488541,0.350587,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.157084,-0.435734,-1.65924,-0.207087,0.110548,-0.363878,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
2,-0.35533,-0.556075,-0.938495,-0.398846,-0.645438,-0.363878,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,-1.457742,-0.824941,0.022497,0.282963,0.110548,-0.363878,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0.747082,-0.554286,-0.457999,0.244612,0.110548,-0.363878,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
