# Problem Statement :
# The objective of this project is to predict whether an individual's annual income exceeds $50K based on demographic and employment-related attributes such as age, education, occupation, and working hours.

# Complete model training : train_model.py

In [17]:
# ==============================
# Import required libraries
# ==============================

import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import random
import warnings
warnings.filterwarnings('ignore')

In [18]:
# Kaggle credentials
#
# Credentials are deliberately NOT written in this notebook: anything typed
# here ends up in version control and in every shared copy of the file.
# Provide KAGGLE_USERNAME and KAGGLE_KEY from outside instead (shell export,
# a secrets manager, the notebook host's secret store, ...).
# NOTE: any key that was ever committed in this cell must be treated as
# leaked and regenerated from the Kaggle account settings.
for _name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    # Strip stray whitespace (e.g. a trailing space pasted with the value),
    # which would otherwise be sent to the API as part of the credential.
    _value = os.environ.get(_name, "").strip()
    if not _value:
        # Raise instead of `assert`: assertions are skipped under `python -O`.
        raise RuntimeError(
            f"Environment variable {_name} is not set; export your Kaggle "
            "API credentials before running this notebook."
        )
    os.environ[_name] = _value
# Install Kaggle CLI
!pip -q install kaggle
!kaggle --version

Kaggle API 1.7.4.5


In [19]:
# Install Dependencies

!pip install pandas numpy scikit-learn xgboost joblib



In [20]:
# ==============================
# Download Dataset
# ==============================

!mkdir -p model
!kaggle datasets download -d uciml/adult-census-income -p model --force


Dataset URL: https://www.kaggle.com/datasets/uciml/adult-census-income
License(s): CC0-1.0
Downloading adult-census-income.zip to model
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 331MB/s]


In [21]:
# Unpack the downloaded Kaggle archive in place, inside the model folder
import os
import shutil
import zipfile
from pathlib import Path

zip_path = Path('model') / 'adult-census-income.zip'
with zipfile.ZipFile(zip_path, mode='r') as archive:
    # Every member of the archive is written under model/
    archive.extractall(path='model')

In [22]:
# Load the extracted CSV and print a quick overview of its contents

data = pd.read_csv("model/adult.csv")

print("✓ Adult Income - Dataset loaded successfully")
print(f"Dataset shape: {data.shape}")
print(f"Features: {data.columns.tolist()}")

# Preview of the leading records
print("\nFirst few rows:")
print(data.head())

# Summary statistics as produced by DataFrame.describe()
print("\nDataset Statistics:")
print(data.describe())

✓ Adult Income - Dataset loaded successfully
Dataset shape: (32561, 15)
Features: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']

First few rows:
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?   77053       HS-grad              9        Widowed   
1   82   Private  132870       HS-grad              9        Widowed   
2   66         ?  186061  Some-college             10        Widowed   
3   54   Private  140359       7th-8th              4       Divorced   
4   41   Private  264663  Some-college             10      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female             0   
1    Exec-managerial  Not-in-family  White  Female             0   
2                  ?      Unmarried  Black  Female            

In [23]:
# ==============================
# 2️⃣ Data Preprocessing
# ==============================
# Builds the modelling inputs from the raw `data` frame:
#   df - cleaned copy of `data` with the target encoded as 0/1
#   X  - one-hot encoded feature matrix
#   y  - binary target (1 means income >50K)

df = data.copy()  # work on a copy so the raw `data` frame stays untouched

# Strip whitespace from text columns FIRST, so that padded values such as
# " ?" or "? " collapse to the plain "?" marker before it is looked for.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.strip()

# "?" is the dataset's placeholder for a missing value
df = df.replace("?", np.nan)

# Drop every row that has at least one missing value
df = df.dropna()

# Encode target variable. Fail loudly on an unexpected label: Series.map
# would otherwise turn it into NaN silently and corrupt the target column.
income_codes = {"<=50K": 0, ">50K": 1}
unknown_labels = set(df["income"].unique()) - set(income_codes)
if unknown_labels:
    raise ValueError(f"Unexpected income labels: {sorted(unknown_labels)}")
df["income"] = df["income"].map(income_codes)

# Separate features and target
X = df.drop(columns="income")
y = df["income"]

# One-hot encode the categorical features; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

In [24]:
# ==============================
# 3️⃣ Train-Test Split
# ==============================
# 20% of the rows are held out for evaluation. Stratifying on y keeps the
# class ratio the same in both parts; the fixed seed makes the split
# repeatable.

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [25]:
# =========================================
# Create Test CSV for Streamlit Upload
# =========================================

# Unscaled test features with the true label appended as an "income" column
test_data = X_test.assign(income=y_test.values)

# A fixed-seed sample of 100 rows keeps the upload file small
test_sample = test_data.sample(n=100, random_state=42)

# Written without the index so the file holds only features + target
test_sample.to_csv("model/test_data.csv", index=False)

print("\nTest data file 'test_data.csv' created successfully!")


Test data file 'test_data.csv' created successfully!


In [26]:
# ==============================
# 4️⃣ Feature Scaling
# ==============================

# Fit the scaler on the training split only, then apply the same
# transformation to the test split, so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create model folder; exist_ok avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs("model", exist_ok=True)

# Save scaler so the same transformation can be reloaded later
joblib.dump(scaler, "model/scaler.pkl")

['model/scaler.pkl']

In [27]:
# ==============================
# 5️⃣ Model Dictionary
# ==============================
# Maps a display name to an unfitted estimator. The names matter: the
# training loop uses them to decide which models get scaled inputs and to
# build the file name each fitted model is saved under.

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost parameter that newer releases ignore with a warning, and the
    # target here is already encoded as 0/1.
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42,
    ),
}

In [28]:
# ==============================
# 6️⃣ Training & Evaluation
# ==============================
# Fits every estimator in `models`, scores it on the held-out split, collects
# one row of rounded metrics per model in `results`, and saves the fitted
# model under model/<name>.pkl.

results = []

# These models are trained and evaluated on the standardized matrices;
# all others use the unscaled frames.
scaled_model_names = ("Logistic Regression", "KNN", "Naive Bayes")

for name, model in models.items():

    print(f"\nTraining {name}...")

    # Pick the matching train/test inputs once, then run a single code path
    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    # Column 1 of predict_proba is the probability of the positive class
    y_prob = model.predict_proba(test_features)[:, 1]

    # Same order as the result columns: Accuracy, AUC, Precision, Recall, F1, MCC
    scores = (
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    )
    results.append([name, *(round(score, 4) for score in scores)])

    # Save model, e.g. "Random Forest" -> model/random_forest.pkl
    filename = f"model/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, filename, compress=3)


Training Logistic Regression...

Training Decision Tree...

Training KNN...

Training Naive Bayes...

Training Random Forest...

Training XGBoost...


In [29]:
# ==============================
# 7️⃣ Display Results
# ==============================

# One row per model, one column per metric collected during training
result_columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
results_df = pd.DataFrame(results, columns=result_columns)

# Banner followed by the comparison table
print(
    "\n==================================",
    "  Model Score Comparison Results",
    "==================================",
    sep="\n",
)
print(results_df)

# Persist the comparison table next to the saved models
results_df.to_csv("model/model_results.csv", index=False)


  Model Score Comparison Results
                 Model  Accuracy     AUC  Precision  Recall      F1     MCC
0  Logistic Regression    0.8548  0.9132     0.7504  0.6245  0.6817  0.5928
1        Decision Tree    0.8583  0.8990     0.7870  0.5905  0.6748  0.5964
2                  KNN    0.8258  0.8497     0.6667  0.6005  0.6319  0.5194
3          Naive Bayes    0.5488  0.7761     0.3514  0.9601  0.5144  0.3454
4        Random Forest    0.8550  0.9104     0.7467  0.6318  0.6845  0.5946
5              XGBoost    0.8722  0.9338     0.7903  0.6625  0.7208  0.6429
