# Load the dataset and examine its structure

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the CSV from its local path into a DataFrame and preview the first rows
csv_path = "C:\\Users\\LabUser\\Downloads\\bank-full columns.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Separate features from the target
# Features: every column except 'contact' and the target 'y'
X = df.drop(['contact', 'y'], axis=1)
# Target: the 'y' column reshaped into a single-column 2-D array
y = df['y'].to_numpy().reshape(-1, 1)


# Handle missing data

In [None]:
# Handling missing values (if any)

# For numerical columns: fill with the column mean.
# numeric_only=True is required because df still contains text columns here;
# since pandas 2.0, DataFrame.mean() raises a TypeError on non-numeric columns
# instead of silently skipping them.
df.fillna(df.mean(numeric_only=True), inplace=True)

# For categorical columns: whatever is still missing gets an 'Unknown' placeholder
df.fillna('Unknown', inplace=True)

In [None]:
# Columns treated as numerical
numerical_columns = ['balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Columns treated as categorical
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]

# Impute missing numerical values with each column's mean
column_means = df[numerical_columns].mean()
df[numerical_columns] = df[numerical_columns].fillna(column_means)

# Impute missing categorical values with the 'Unknown' placeholder
df[categorical_columns] = df[categorical_columns].fillna('Unknown')


In [None]:
# One-hot encode the categorical columns: dense output, first level of each
# feature dropped
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_frame = df[categorical_columns]
encoded_categorical = onehot_encoder.fit(categorical_frame).transform(categorical_frame)


In [None]:
# Wrap the encoded array in a DataFrame labelled with the encoder's output
# feature names
encoded_feature_names = onehot_encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(data=encoded_categorical, columns=encoded_feature_names)

In [None]:
# Swap the original categorical columns for their encoded counterparts
df = pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

# Rebuild the feature matrix from the encoded frame (everything except 'y')
X = df.drop('y', axis=1)

print(df.head())

In [None]:
# Share of missing entries per column, expressed in percent
null_percentage = df.isnull().mean().mul(100)
print(null_percentage)

In [None]:
# Explore each column with missing values to determine the best fill strategy
# First the job column.
# By this point the raw 'job' column may already have been replaced by its
# one-hot 'job_*' indicator columns, in which case df['job'] raises KeyError.
if 'job' in df.columns:
    job_counts = df['job'].value_counts()
else:
    # Summing each 0/1 indicator column gives the per-category count. The
    # category removed by drop='first' has no indicator column, so it does
    # not appear in this fallback view.
    job_indicator_columns = [col for col in df.columns if str(col).startswith('job_')]
    job_counts = df[job_indicator_columns].sum().sort_values(ascending=False)
job_counts

In [None]:
# Scaling numerical features
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows influence the scaling statistics; consider fitting on
# the training split only.
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# X was built from df with drop(), which returns a new frame, before this
# scaling step. Without rebuilding it here, the split and every model below
# would keep using the unscaled values.
X = df.drop(columns=['y'])

print(df.head())

In [None]:
# Split features and target into training and testing partitions (seed 13)
split_parts = train_test_split(X, y, random_state=13)
X_train, X_test, y_train, y_test = split_parts
X_train.describe()

# Exploratory Data Analysis (EDA)

In [None]:
# Derive additional features where useful
# Example: 'balance_duration_ratio' = balance divided by (duration + 1)
df['balance_duration_ratio'] = df['balance'].div(df['duration'].add(1))

print(df.head())

# Select most relevant features using correlation analysis

In [None]:
# Correlation matrix
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() raises
# on non-numeric columns (for example a text-valued target) instead of
# silently dropping them. When every column is numeric this is a no-op.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='autumn')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Histograms of the numerical features (15 bins each)
numerical_columns = ['balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
df.hist(column=numerical_columns, bins=15, figsize=(15, 10))
plt.show()

In [None]:
# One violin plot per numerical feature, grouped by the 'y' column
for feature in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.violinplot(data=df, x='y', y=feature, color='purple', ax=ax)
    ax.set_title(f'Violin plot of {feature} by target variable')
    plt.show()

# Feature Engineering

In [None]:
# One-hot encode whichever categorical columns are still present in df.
# The categorical columns are already encoded and dropped earlier in the
# notebook, so encoding df[categorical_columns] again unconditionally raises
# KeyError. Working only on the columns that remain makes this step
# idempotent: it encodes them if they exist and is a no-op otherwise.
remaining_categorical = [col for col in categorical_columns if col in df.columns]

if remaining_categorical:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_categorical = onehot_encoder.fit_transform(df[remaining_categorical])

    # Drop original categorical columns and concatenate the encoded columns.
    # index=df.index keeps the encoded rows aligned with df during concat.
    encoded_df = pd.DataFrame(
        encoded_categorical,
        columns=onehot_encoder.get_feature_names_out(remaining_categorical),
        index=df.index,
    )
    df = pd.concat([df.drop(columns=remaining_categorical), encoded_df], axis=1)

print(df.head())

# Model Selection and Training

# Choose Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Initialize models
# The randomized estimators are seeded with the same seed as the train/test
# split, so repeated runs (and the clones GridSearchCV makes of these objects)
# give reproducible results.
# max_iter is raised for logistic regression because the default cap of 100
# solver iterations can stop before convergence (ConvergenceWarning).
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=13)
random_forest = RandomForestClassifier(random_state=13)
svm = SVC()
knn = KNeighborsClassifier()

# Train and Evaluate Models

In [None]:
# Fit every candidate model on the training split, in a fixed order.
# The target is flattened to 1-D for each fit.
for estimator in (log_reg, decision_tree, random_forest, svm, knn):
    estimator.fit(X_train, y_train.ravel())

In [None]:
# Predict on the test set with each fitted model (same order as training)
(
    y_pred_log_reg,
    y_pred_decision_tree,
    y_pred_random_forest,
    y_pred_svm,
    y_pred_knn,
) = [
    estimator.predict(X_test)
    for estimator in (log_reg, decision_tree, random_forest, svm, knn)
]

In [None]:
# Evaluate models
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'KNN']
predictions = [y_pred_log_reg, y_pred_decision_tree, y_pred_random_forest, y_pred_svm, y_pred_knn]

# y was built as a column vector; the metrics work on 1-D label arrays.
y_test_flat = y_test.ravel()

# With average='binary' the metrics need to know which label is the positive
# class. scikit-learn's default is pos_label=1, which raises ValueError when
# the labels are not 0/1 (for example strings such as 'yes'/'no' - the actual
# label values are not visible here, so confirm against the data).
# Keep 1 when it is a label; otherwise use the last label in sorted order.
observed_labels = sorted(set(y_test_flat.tolist()))
positive_label = 1 if 1 in observed_labels else observed_labels[-1]

for model, y_pred in zip(models, predictions):
    print(f"Model: {model}")
    print(f"Accuracy: {accuracy_score(y_test_flat, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test_flat, y_pred, average='binary', pos_label=positive_label):.4f}")
    print(f"Recall: {recall_score(y_test_flat, y_pred, average='binary', pos_label=positive_label):.4f}")
    print(f"F1 Score: {f1_score(y_test_flat, y_pred, average='binary', pos_label=positive_label):.4f}")
    print("\n")

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter search spaces, one dict per model
# (keys are estimator parameter names, values are the candidates to try)
param_grid_log_reg = dict(C=[0.1, 1, 10, 100])
param_grid_decision_tree = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
param_grid_random_forest = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
)
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
)
param_grid_knn = dict(n_neighbors=[3, 5, 7, 9])

In [None]:
# Build one 5-fold, accuracy-scored grid search per model
grid_log_reg = GridSearchCV(
    estimator=log_reg, param_grid=param_grid_log_reg, scoring='accuracy', cv=5
)
grid_decision_tree = GridSearchCV(
    estimator=decision_tree, param_grid=param_grid_decision_tree, scoring='accuracy', cv=5
)
grid_random_forest = GridSearchCV(
    estimator=random_forest, param_grid=param_grid_random_forest, scoring='accuracy', cv=5
)
grid_svm = GridSearchCV(
    estimator=svm, param_grid=param_grid_svm, scoring='accuracy', cv=5
)
grid_knn = GridSearchCV(
    estimator=knn, param_grid=param_grid_knn, scoring='accuracy', cv=5
)

In [None]:
# Run every grid search on the training split, in a fixed order.
# The target is flattened to 1-D for each fit.
for search in (grid_log_reg, grid_decision_tree, grid_random_forest, grid_svm, grid_knn):
    search.fit(X_train, y_train.ravel())

In [None]:
# Report the best parameter combination found by each grid search
fitted_searches = [
    ("Logistic Regression", grid_log_reg),
    ("Decision Tree", grid_decision_tree),
    ("Random Forest", grid_random_forest),
    ("SVM", grid_svm),
    ("KNN", grid_knn),
]
for label, search in fitted_searches:
    print(f"Best parameters for {label}:", search.best_params_)

# Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation scores
def _cv_accuracy(search):
    """Return 5-fold accuracy scores of a search's best estimator on the training split."""
    return cross_val_score(
        search.best_estimator_, X_train, y_train.ravel(), cv=5, scoring='accuracy'
    )


cv_log_reg = _cv_accuracy(grid_log_reg)
cv_decision_tree = _cv_accuracy(grid_decision_tree)
cv_random_forest = _cv_accuracy(grid_random_forest)
cv_svm = _cv_accuracy(grid_svm)
cv_knn = _cv_accuracy(grid_knn)

# Print cross-validation scores
cv_results = [
    ("Logistic Regression", cv_log_reg),
    ("Decision Tree", cv_decision_tree),
    ("Random Forest", cv_random_forest),
    ("SVM", cv_svm),
    ("KNN", cv_knn),
]
for label, scores in cv_results:
    print(f"Cross-validation scores for {label}:", scores)

# Select most relevant features using feature importance

In [None]:
# Pair the tuned random forest's feature importances with the training
# columns and order them from most to least important
importances = grid_random_forest.best_estimator_.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Bar chart of importance per feature
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance from Random Forest')
plt.show()