
**Dataset Selected**

UCI Bank Marketing Dataset

(Source: UCI Machine Learning Repository)

Why this dataset fits perfectly

Type: Binary Classification

Target Variable: y

Instances: 45,211

Features: 16 original features (42 after encoding)

Real-world marketing dataset

Large structured tabular data

Contains categorical + numerical features

Widely used in ML coursework

Suitable for evaluating multiple classification algorithms

No plagiarism risk if implemented independently

  - Target Variable

Class

1 → Client subscribed to term deposit

0 → Client did not subscribe

STEP 2: Models & Metrics
✔ Models Implemented (ALL 6)

Logistic Regression

Decision Tree Classifier

K-Nearest Neighbors

Naive Bayes (Gaussian)

Random Forest (Ensemble)

XGBoost (Ensemble)

✔ Evaluation Metrics

Accuracy

AUC Score

Precision

Recall

F1 Score

MCC Score

In [1]:
!pip install scikit-learn==1.8.0


Collecting scikit-learn==1.8.0
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.8.0


In [2]:
import sklearn
# Print the scikit-learn version the kernel actually imported.
print(sklearn.__version__)


1.8.0


In [3]:
pip install numpy pandas scikit-learn matplotlib seaborn xgboost streamlit

Collecting streamlit
  Downloading streamlit-1.54.0-py3-none-any.whl.metadata (9.8 kB)
Collecting cachetools<7,>=5.5 (from streamlit)
  Downloading cachetools-6.2.6-py3-none-any.whl.metadata (5.6 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.54.0-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cachetools-6.2.6-py3-none-any.whl (11 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cachetools, pydeck, streamlit
  Attempting uninstall: cachetools
    Found existing installation: cachetools 7.0.0
    Uninstalling cachetools-7.0.0:
      Successfully uninstalled cachetools-7.0.0
Successfully installed cachetools-6.2.6 pydeck-0

In [4]:
import pandas as pd
import zipfile
import io
import requests

url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"

# Download main zip.
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() turns an HTTP error into an immediate, readable exception
# instead of a confusing BadZipFile error further down.
response = requests.get(url, timeout=60)
response.raise_for_status()
main_zip = zipfile.ZipFile(io.BytesIO(response.content))

print("Main ZIP files:", main_zip.namelist())

# Extract inner bank.zip (the dataset is shipped as a zip inside a zip)
inner_zip_data = main_zip.read("bank.zip")
inner_zip = zipfile.ZipFile(io.BytesIO(inner_zip_data))

print("Inner ZIP files:", inner_zip.namelist())

# Now read bank-full.csv; the context manager closes the archive member handle.
with inner_zip.open("bank-full.csv") as csv_file:
    df = pd.read_csv(csv_file, sep=";")

print(df.shape)

# Encode the target as 0/1. Series.map() turns any label other than "no"/"yes"
# into NaN, so fail loudly here rather than training on a corrupted target.
df["y"] = df["y"].map({"no": 0, "yes": 1})
assert df["y"].notna().all(), "Unexpected label in target column 'y'"

# Last expression of the cell: class balance of the target.
df["y"].value_counts()



Main ZIP files: ['bank.zip', 'bank-additional.zip']
Inner ZIP files: ['bank-full.csv', 'bank-names.txt', 'bank.csv']
(45211, 17)


Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0,39922
1,5289


Separate Features & Target

In [5]:
# Target column on one side, every other column as predictors on the other.
y = df["y"]
X = df.drop(columns="y")

print("Initial feature count:", len(X.columns))

# One-hot encode the non-numeric columns. drop_first=True leaves out the first
# level of each encoded column, so that level is represented by all-zero dummies.
X = pd.get_dummies(X, drop_first=True)

print("Feature count after encoding:", len(X.columns))



Initial feature count: 16
Feature count after encoding: 42


Train–Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, split_features in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} shape: {split_features.shape}")



Train shape: (36168, 42)
Test shape: (9043, 42)


Feature Scaling required for Logistic Regression and KNN

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
# NOTE: X_train / X_test are overwritten with their scaled versions, so
# re-running this cell without re-running the split cell scales the data twice.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


Train ALL 6 Models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Each estimator is built and fitted in one expression; fit() returns the fitted
# estimator itself, so every name below is bound to a trained model.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

nb = GaussianNB().fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

xgb = XGBClassifier(eval_metric='logloss').fit(X_train, y_train)

# Last expression of the cell: display the fitted XGBoost estimator.
xgb


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


Calculate All Required Metrics

In [9]:
def evaluate_model(y_test, y_pred, y_prob):
    """Compute the six evaluation metrics from precomputed predictions.

    NOTE: a later cell in this notebook defines ``evaluate_model(model)`` with a
    different signature; once that cell runs it replaces this definition.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, passed to the label-based metrics.
    y_prob : predicted scores, passed to ``roc_auc_score``.

    Returns
    -------
    dict
        Metric values keyed by "Accuracy", "AUC", "Precision", "Recall",
        "F1" and "MCC".
    """
    # The notebook imports sklearn.metrics only in a later cell, so import the
    # metric functions here; otherwise calling this function before that cell
    # has run raises NameError.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        matthews_corrcoef,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }


Comparison Table (For README & PDF)

In [10]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

def evaluate_model(model):
    """Score a fitted classifier on the notebook's held-out test split.

    Reads the module-level ``X_test`` and ``y_test``. AUC is computed from the
    second column of ``predict_proba``; every other metric uses the labels
    returned by ``predict``.

    Returns a dict keyed by "Accuracy", "AUC", "Precision", "Recall",
    "F1 Score" and "MCC".
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Built step by step so the key order matches the comparison table columns.
    scores = {"Accuracy": accuracy_score(y_test, predicted_labels)}
    scores["AUC"] = roc_auc_score(y_test, positive_scores)

    label_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("MCC", matthews_corrcoef),
    )
    for metric_name, metric_fn in label_metrics:
        scores[metric_name] = metric_fn(y_test, predicted_labels)

    return scores

# Display name -> fitted estimator, in the order the rows should appear.
fitted_models = {
    "Logistic Regression": lr,
    "Decision Tree": dt,
    "KNN": knn,
    "Naive Bayes": nb,
    "Random Forest": rf,
    "XGBoost": xgb,
}

results = {
    display_name: evaluate_model(estimator)
    for display_name, estimator in fitted_models.items()
}

# One row per model, one column per metric, rounded to four decimals.
comparison_df = pd.DataFrame(results).transpose().round(4)

comparison_df


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.8988,0.9046,0.6544,0.3419,0.4491,0.425
Decision Tree,0.8708,0.7039,0.466,0.484,0.4748,0.4013
KNN,0.8933,0.8296,0.6043,0.3346,0.4307,0.3973
Naive Bayes,0.8569,0.8151,0.4181,0.4748,0.4446,0.3639
Random Forest,0.9037,0.924,0.6692,0.3987,0.4997,0.4687
XGBoost,0.9069,0.9297,0.647,0.5023,0.5655,0.5195


Observations

The ensemble models — Random Forest and XGBoost — achieved the highest Accuracy and AUC scores, indicating superior predictive performance and better generalization capability.

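Logistic Regression remained competitive (Accuracy 0.8988, AUC 0.9046), demonstrating that a simple linear model can still achieve strong results on structured tabular data. Naive Bayes, by contrast, had the lowest Accuracy (0.8569) and MCC (0.3639) of the six models, although its Recall (0.4748) was higher than that of Logistic Regression, KNN, and Random Forest.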

The Decision Tree model showed slightly lower generalization performance compared to ensemble models, which is expected due to its tendency to overfit.

Overall, ensemble methods provided better robustness and stability compared to single classifiers, making them more suitable for this dataset.

Save Models (For Streamlit)

In [11]:
import os
import joblib

# Create model folder
model_dir = "ml-assignment-2/model"
os.makedirs(model_dir, exist_ok=True)

# Save the six fitted models and the fitted scaler, one file per object.
artifacts = {
    "logistic_regression.pkl": lr,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb,
    "scaler.pkl": scaler,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, f"{model_dir}/{file_name}")

# VERY IMPORTANT: Save feature column names.
# Stored as a plain list of strings rather than a pandas Index, so loading the
# file does not depend on the pandas version used when it was written.
# DataFrame.reindex(columns=...) accepts a list just as well.
joblib.dump(list(X.columns), f"{model_dir}/feature_columns.pkl")

print("All models and preprocessing objects saved successfully!")




All models and preprocessing objects saved successfully!


Streamlit App (app.py)

In [12]:
import streamlit as st
import joblib
import pandas as pd

st.set_page_config(page_title="Bank Marketing ML App")

st.title("🏦 Bank Marketing Classification App")
st.write("Upload Bank Marketing CSV file to predict term deposit subscription.")

# Load preprocessing objects (COLAB PATH)
scaler = joblib.load("ml-assignment-2/model/scaler.pkl")
feature_columns = joblib.load("ml-assignment-2/model/feature_columns.pkl")

# Display name -> pickled model file.
model_files = {
    "Logistic Regression": "ml-assignment-2/model/logistic_regression.pkl",
    "Decision Tree": "ml-assignment-2/model/decision_tree.pkl",
    "KNN": "ml-assignment-2/model/knn.pkl",
    "Naive Bayes": "ml-assignment-2/model/naive_bayes.pkl",
    "Random Forest": "ml-assignment-2/model/random_forest.pkl",
    "XGBoost": "ml-assignment-2/model/xgboost.pkl"
}

# Model selection (options come straight from the mapping so they cannot drift)
model_choice = st.selectbox("Select Model", list(model_files))

uploaded_file = st.file_uploader("Upload CSV File", type=["csv"])

if uploaded_file is not None:

    # Separate names are used so the notebook-level training frame `df` is
    # not overwritten when this cell runs.
    uploaded_df = pd.read_csv(uploaded_file, sep=";")

    # The target column is not a model input; drop it if the file contains it.
    feature_df = uploaded_df.drop(columns=["y"], errors="ignore")

    # One-hot encoding.
    # Do NOT pass drop_first=True here: it would drop whichever category sorts
    # first *in the uploaded file*, which differs from training whenever the
    # upload lacks some categories (for example a single-row file). Encode
    # every level instead and let reindex() keep exactly the training columns,
    # which discards the training baseline levels and zero-fills absent ones.
    encoded_df = pd.get_dummies(feature_df)

    # Align columns
    encoded_df = encoded_df.reindex(columns=feature_columns, fill_value=0)

    # Scale features
    scaled_features = scaler.transform(encoded_df)

    # Load selected model
    model = joblib.load(model_files[model_choice])

    # Predict
    predictions = model.predict(scaled_features)

    encoded_df["Prediction"] = predictions

    st.subheader("Prediction Results")
    st.write(encoded_df.head())

    st.success("Prediction completed successfully!")



2026-02-15 08:07:17.480 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2026-02-15 08:07:17.501 Session state does not function when running a script without `streamlit run`


In [13]:
import os

# Root project folder
project_root = "ml-assignment-2"

# Make sure <project_root>/model exists; exist_ok=True makes this a no-op when
# the folder is already there.
model_folder = os.path.join(project_root, "model")
os.makedirs(model_folder, exist_ok=True)

print("Folder structure created!")



Folder structure created!


In [14]:
#Create requirements.txt
from importlib.metadata import PackageNotFoundError, version

# Packages the deployed app needs. The pickled models and scaler are tied to the
# library versions that produced them, so each requirement is pinned to the
# version installed in this kernel instead of being left open.
runtime_packages = [
    "streamlit",
    "numpy",
    "pandas",
    "scikit-learn",
    "xgboost",
    "matplotlib",
    "seaborn",
    "joblib",
]


def _pinned(package):
    """Return ``package==<installed version>``, or the bare name if not installed."""
    try:
        return f"{package}=={version(package)}"
    except PackageNotFoundError:
        return package


requirements_content = "\n".join(_pinned(pkg) for pkg in runtime_packages) + "\n"

with open("ml-assignment-2/requirements.txt", "w") as f:
    f.write(requirements_content)

print("requirements.txt created!")


requirements.txt created!


In [15]:
# Text of the project README (Markdown). It contains non-ASCII characters
# (an en dash and arrows), which is why the file is written as UTF-8 below.
readme_content = """# ML Assignment 2 – Classification Models & Streamlit Deployment

## Problem Statement
Implement multiple machine learning classification models, evaluate their performance,
and deploy the models using a Streamlit web application.

## Dataset Description
The dataset used is the UCI Bank Marketing Dataset from the UCI Machine Learning Repository.
It contains 45,211 instances and 16 original predictive features related to client demographic
and marketing campaign information.

After one-hot encoding of categorical variables, the total number of features increased to 42.

The target variable is binary:
1 → Client subscribed to term deposit
0 → Client did not subscribe

## Models Used
- Logistic Regression
- Decision Tree Classifier
- K-Nearest Neighbors
- Naive Bayes
- Random Forest (Ensemble)
- XGBoost (Ensemble)

## Evaluation Metrics
Accuracy, AUC, Precision, Recall, F1 Score, Matthews Correlation Coefficient (MCC)

## Observations
Ensemble models such as Random Forest and XGBoost achieved the highest Accuracy and AUC,
indicating better generalization performance. Logistic Regression and Naive Bayes also
performed strongly, while Decision Tree showed comparatively lower generalization.
Overall, ensemble methods provided better robustness compared to individual classifiers.

## Deployment
The application is deployed using Streamlit Community Cloud.
"""

# An explicit encoding keeps the write independent of the platform's default
# text encoding, which may not be able to represent the characters above.
with open("ml-assignment-2/README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("README.md updated successfully!")


README.md updated successfully!


In [16]:
# Create app.py
#
# Write the real Streamlit application rather than a title-only placeholder, so
# the packaged project can actually load the saved models and make predictions.
# Artifact paths are resolved relative to app.py itself, so the app works no
# matter which directory `streamlit run` is started from.

app_content = """from pathlib import Path

import joblib
import pandas as pd
import streamlit as st

# Folder holding the pickled models, scaler and feature names (next to app.py).
MODEL_DIR = Path(__file__).resolve().parent / "model"

# Display name -> pickled model file inside MODEL_DIR.
MODEL_FILES = {
    "Logistic Regression": "logistic_regression.pkl",
    "Decision Tree": "decision_tree.pkl",
    "KNN": "knn.pkl",
    "Naive Bayes": "naive_bayes.pkl",
    "Random Forest": "random_forest.pkl",
    "XGBoost": "xgboost.pkl",
}

st.set_page_config(page_title="Bank Marketing ML App")

st.title("🏦 Bank Marketing Classification App")
st.write("Upload Bank Marketing CSV file to predict term deposit subscription.")

# Preprocessing objects saved by the training notebook.
scaler = joblib.load(MODEL_DIR / "scaler.pkl")
feature_columns = list(joblib.load(MODEL_DIR / "feature_columns.pkl"))

model_choice = st.selectbox("Select Model", list(MODEL_FILES))

uploaded_file = st.file_uploader("Upload CSV File", type=["csv"])

if uploaded_file is not None:
    uploaded_df = pd.read_csv(uploaded_file, sep=";")

    # The target column is not a model input; drop it if the file contains it.
    feature_df = uploaded_df.drop(columns=["y"], errors="ignore")

    # Encode every category level, then keep exactly the training columns.
    # drop_first is deliberately not used: on a partial upload it would drop a
    # different level than it did during training.
    encoded_df = pd.get_dummies(feature_df).reindex(
        columns=feature_columns, fill_value=0
    )

    scaled_features = scaler.transform(encoded_df)

    model = joblib.load(MODEL_DIR / MODEL_FILES[model_choice])
    encoded_df["Prediction"] = model.predict(scaled_features)

    st.subheader("Prediction Results")
    st.write(encoded_df.head())

    st.success("Prediction completed successfully!")
"""

# UTF-8 is required because the page title contains an emoji.
with open("ml-assignment-2/app.py", "w", encoding="utf-8") as f:
    f.write(app_content)

print("app.py created!")


app.py created!


In [17]:
#Save Models Programmatically into model/

import joblib

# Fitted models and the fitted scaler, keyed by the file name they are saved
# under inside the project's model/ folder.
model_dir = "ml-assignment-2/model"
artifacts = {
    "logistic_regression.pkl": lr,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb,
    "scaler.pkl": scaler,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, f"{model_dir}/{file_name}")

# Save feature columns (VERY IMPORTANT for Streamlit).
# Stored as a plain list of strings rather than a pandas Index, so loading the
# file does not depend on the pandas version used when it was written.
# DataFrame.reindex(columns=...) accepts a list just as well.
joblib.dump(list(X.columns), f"{model_dir}/feature_columns.pkl")

print("All models and preprocessing objects saved successfully!")



All models and preprocessing objects saved successfully!


In [18]:
# Verify

import os

# List the project root and its model/ folder to confirm every file was written.
for listing_label, folder in (
    ("Root files:", "ml-assignment-2"),
    ("Model files:", "ml-assignment-2/model"),
):
    print(listing_label, os.listdir(folder))


Root files: ['README.md', 'app.py', 'requirements.txt', 'model']
Model files: ['knn.pkl', 'naive_bayes.pkl', 'xgboost.pkl', 'scaler.pkl', 'random_forest.pkl', 'feature_columns.pkl', 'logistic_regression.pkl', 'decision_tree.pkl']


In [19]:
# Bundle the project folder, including its subfolders (-r), into
# ml-assignment-2.zip in the current working directory.
!zip -r ml-assignment-2.zip ml-assignment-2


  adding: ml-assignment-2/ (stored 0%)
  adding: ml-assignment-2/README.md (deflated 47%)
  adding: ml-assignment-2/app.py (deflated 32%)
  adding: ml-assignment-2/requirements.txt (deflated 11%)
  adding: ml-assignment-2/model/ (stored 0%)
  adding: ml-assignment-2/model/knn.pkl (deflated 91%)
  adding: ml-assignment-2/model/naive_bayes.pkl (deflated 13%)
  adding: ml-assignment-2/model/xgboost.pkl (deflated 66%)
  adding: ml-assignment-2/model/scaler.pkl (deflated 25%)
  adding: ml-assignment-2/model/random_forest.pkl (deflated 84%)
  adding: ml-assignment-2/model/feature_columns.pkl (deflated 45%)
  adding: ml-assignment-2/model/logistic_regression.pkl (deflated 23%)
  adding: ml-assignment-2/model/decision_tree.pkl (deflated 84%)
