In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(
    'bank-full.csv',
    sep=';',
    quotechar='"',
    encoding='utf-8',
    engine='python'
)
print("--- columns ---")
print(df.columns)

print("--- Head of Dataset ---")
display(df.head())

print("\n--- Info of Dataset ---")
print(df.info())

print("\n--- Describe of Dataset ---")
print(df.describe())

--- columns ---
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')
--- Head of Dataset ---


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no



--- Info of Dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None

--- Describe of Dataset ---
                age        balance      

In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Copy dataset (safe practice)
df_clean = df.copy()

# Remove duplicate rows
print("Duplicate rows:", df_clean.duplicated().sum())
df_clean.drop_duplicates(inplace=True)

# Check missing values
print("Missing values per column:")
print(df_clean.isnull().sum())

# Handle outliers (SAFE strategy)
# 1. IQR-based removal ONLY for safe variables
safe_outlier_cols = ['age', 'day']

for col in safe_outlier_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

# 2. Winsorization (clipping) for financial / behavioral variables
winsor_cols = ['balance', 'duration', 'campaign', 'previous']

for col in winsor_cols:
    lower = df_clean[col].quantile(0.01)
    upper = df_clean[col].quantile(0.99)
    df_clean[col] = df_clean[col].clip(lower, upper)

print("Shape after outlier handling:", df_clean.shape)

# Handle special value pdays = -1
df_clean['pdays_contacted'] = df_clean['pdays'].apply(
    lambda x: 0 if x == -1 else 1
)
df_clean.drop(columns=['pdays'], inplace=True)

# Encode target variable
df_clean['y'] = df_clean['y'].map({'no': 0, 'yes': 1})

# One-Hot Encoding for categorical features
categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()

df_encoded = pd.get_dummies(
    df_clean,
    columns=categorical_cols,
    drop_first=True
)

# Convert boolean columns to int
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

# Feature Scaling (Standardization)
scaler = StandardScaler()
num_features = [
    'age', 'balance', 'day',
    'duration', 'campaign', 'previous'
]

df_encoded[num_features] = scaler.fit_transform(
    df_encoded[num_features]
)

# Final check
print("Final shape:", df_encoded.shape)
display(df_encoded.head())


Duplicate rows: 0
Missing values per column:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
Shape after outlier handling: (44724, 17)
Final shape: (44724, 43)


Unnamed: 0,age,balance,day,duration,campaign,previous,y,pdays_contacted,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,1.749275,0.392786,-1.298868,0.031546,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0.346205,-0.552943,-1.298868,-0.437198,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,-0.756207,-0.565022,-1.298868,-0.756795,-0.645438,-0.363878,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,0.646863,0.107814,-1.298868,-0.688615,-0.645438,-0.363878,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,-0.756207,-0.56547,-1.298868,-0.236916,-0.645438,-0.363878,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [5]:
from sklearn.model_selection import train_test_split


# Split features and target
X = df_encoded.drop(columns=['y'])
y = df_encoded['y']

# Train / Test Split (80 / 20)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (35779, 42)
Test shape: (8945, 42)


In [6]:
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=7,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Test Accuracy (RF):", accuracy_score(y_test, y_pred))
print("\nClassification Report (RF):\n", classification_report(y_test, y_pred))
print("Confusion Matrix (RF):\n", confusion_matrix(y_test, y_pred))

with open("rf_model_best.pkl", "wb") as f:
    pickle.dump(rf, f)


Test Accuracy (RF): 0.9065399664617104

Classification Report (RF):
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      7931
           1       0.59      0.58      0.59      1014

    accuracy                           0.91      8945
   macro avg       0.77      0.77      0.77      8945
weighted avg       0.91      0.91      0.91      8945

Confusion Matrix (RF):
 [[7518  413]
 [ 423  591]]


In [8]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_svm = scaler.fit_transform(X_train)
X_test_svm  = scaler.transform(X_test)

svm = SVC(
    C=1,
    gamma='scale',
    kernel='rbf',
    probability=True,
    class_weight='balanced',
    random_state=42
)

svm.fit(X_train_svm, y_train)
y_pred = svm.predict(X_test_svm)

print("Test Accuracy (SVM):", accuracy_score(y_test, y_pred))
print("\nClassification Report (SVM):\n", classification_report(y_test, y_pred))
print("Confusion Matrix (SVM):\n", confusion_matrix(y_test, y_pred))

pickle.dump(svm, open("svm_model.pkl", "wb"))
pickle.dump(scaler, open("svm_scaler.pkl", "wb"))


Test Accuracy (SVM): 0.8489659027389603

Classification Report (SVM):
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      7931
           1       0.42      0.86      0.56      1014

    accuracy                           0.85      8945
   macro avg       0.70      0.85      0.74      8945
weighted avg       0.92      0.85      0.87      8945

Confusion Matrix (SVM):
 [[6724 1207]
 [ 144  870]]


In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_mlp = scaler.fit_transform(X_train)
X_test_mlp  = scaler.transform(X_test)

mlp_1 = MLPClassifier(
    hidden_layer_sizes=(32,),
    activation='relu',
    alpha=0.01,
    learning_rate_init=0.001,
    max_iter=300,
    random_state=42
)

mlp_1.fit(X_train_mlp, y_train)
y_pred = mlp_1.predict(X_test_mlp)

print("Test Accuracy (MLP - 1 Layer):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

pickle.dump(mlp_1, open("mlp_1layer.pkl", "wb"))
pickle.dump(scaler, open("mlp_scaler.pkl", "wb"))


Test Accuracy (MLP - 1 Layer): 0.9081050866405813

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      7931
           1       0.62      0.48      0.54      1014

    accuracy                           0.91      8945
   macro avg       0.78      0.72      0.75      8945
weighted avg       0.90      0.91      0.90      8945

Confusion Matrix:
 [[7637  294]
 [ 528  486]]


In [10]:
mlp_2 = MLPClassifier(
    hidden_layer_sizes=(128, 32),
    activation='relu',
    alpha=0.01,
    learning_rate_init=0.01,
    max_iter=400,
    random_state=42
)

mlp_2.fit(X_train_mlp, y_train)
y_pred = mlp_2.predict(X_test_mlp)

print("Test Accuracy (MLP - 2 Layers):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

pickle.dump(mlp_2, open("mlp_2layer.pkl", "wb"))


Test Accuracy (MLP - 2 Layers): 0.9025153717160425

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      7931
           1       0.57      0.56      0.57      1014

    accuracy                           0.90      8945
   macro avg       0.76      0.75      0.76      8945
weighted avg       0.90      0.90      0.90      8945

Confusion Matrix:
 [[7503  428]
 [ 444  570]]


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

X_train_cnn = X_train_scaled.reshape(-1, 43, 1)
X_test_cnn  = X_test_scaled.reshape(-1, 43, 1)

cnn_2 = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(43, 1)),
    MaxPooling1D(2),

    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

cnn_2.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

cnn_2.fit(
    X_train_cnn, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

y_pred = (cnn_2.predict(X_test_cnn) > 0.5).astype(int)

print("Test Accuracy (CNN - 2 Layers):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

cnn_2.save("cnn_2layer.h5")
pickle.dump(scaler, open("cnn_scaler.pkl", "wb"))


ValueError: cannot reshape array of size 1502718 into shape (43,1)

In [13]:
cnn_1 = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(43, 1)),
    MaxPooling1D(2),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_1.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

cnn_1.fit(
    X_train_cnn, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

y_pred = (cnn_1.predict(X_test_cnn) > 0.5).astype(int)

print("Test Accuracy (CNN - 1 Layer):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

cnn_1.save("cnn_1layer.h5")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NameError: name 'X_train_cnn' is not defined