
# Logistic Regression (From Scratch) — Supervised Learning

This notebook:
1. Loads a dataset from your repo's `datasets/` folder
2. Preprocesses features (numeric + one-hot for categoricals)
3. Trains **Logistic Regression from scratch** (gradient descent)
4. Compares against `sklearn.linear_model.LogisticRegression`
5. Evaluates with Accuracy, Precision, Recall, F1, ROC AUC
6. Plots ROC curves for both models and the training loss of the from-scratch model


In [None]:

# Config: set your dataset path and target column here
# DATA_PATH is the CSV file handed to pd.read_csv in the load cell; it is a relative path,
# so it is resolved against the directory the notebook kernel is running in.
# TARGET_COL must be a column of that CSV (a ValueError is raised later if it is missing)
# and is cast with .astype(int) before training, so its values must be integer-convertible.
DATA_PATH = 'datasets/census_dataset.csv'   # <- change if needed
TARGET_COL = 'target'                       # <- set to your binary target column name (0/1)


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression as SkLogReg

import sys
sys.path.append('src/supervised/logistic_regression')
from logistic_regression import LogisticRegressionGD


In [None]:

# 1) Load data
# Read the CSV configured in DATA_PATH into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Sanity check: print (rows, columns), then preview the first rows as the cell output.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()


In [None]:

# 2) Split into features/target
# Fail early with the list of available columns if the configured target is missing.
if TARGET_COL not in df.columns:
    raise ValueError(f'TARGET_COL={TARGET_COL!r} not found in columns: {df.columns.tolist()}')

# Target as integers; everything else in the frame is a candidate feature.
y = df[TARGET_COL].astype(int)
X_raw = df.drop(columns=[TARGET_COL])

# 3) Basic preprocessing
# Columns with a numeric dtype are used as-is; every remaining column is treated as categorical.
num_cols = X_raw.select_dtypes(include=['number']).columns.tolist()
cat_cols = [c for c in X_raw.columns if c not in num_cols]

# One-hot encode the categoricals, dropping the first level of each to avoid a redundant column.
# dtype=float is deliberate: pandas >= 2.0 produces boolean dummy columns by default, and a
# frame that mixes bool and numeric columns turns into an object-dtype array under `.values`,
# which NumPy-based model code cannot do arithmetic on.
X_cat = (
    pd.get_dummies(X_raw[cat_cols], drop_first=True, dtype=float)
    if cat_cols
    else pd.DataFrame(index=X_raw.index)
)
X_num = X_raw[num_cols].copy()

# Missing numeric values are replaced by the column median; any missing dummy entry becomes 0.
X_num = X_num.fillna(X_num.median())
if not X_cat.empty:
    X_cat = X_cat.fillna(0)

# Final design matrix: numeric columns first, then the expanded categoricals. The float cast
# also covers columns get_dummies passes through unencoded (e.g. bool), so `X.values` is a
# plain float array.
X = pd.concat([X_num, X_cat], axis=1).astype(float)
print(f'Final design matrix shape: {X.shape} (num={len(num_cols)}, cat_expanded={X.shape[1]-len(num_cols)})')
X.head()


In [None]:

# 4) Train/test split
# Stratified 80/20 split with a fixed seed. The design matrix is converted to a float array
# explicitly: a frame mixing bool and numeric columns would otherwise come out as object dtype.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(dtype=float), y.values, test_size=0.2, random_state=42, stratify=y
)

# Standardize features using statistics from the training rows only (no test-set leakage).
# Gradient descent with a fixed learning rate is sensitive to feature scale, so raw columns
# of very different magnitude can make it diverge or saturate the sigmoid.
# NOTE(review): LogisticRegressionGD's internals are not visible here; this assumes it does
# not already rescale its inputs — confirm. Later cells that reuse X_train / X_test will see
# the standardized arrays.
feat_mean = X_train.mean(axis=0)
feat_std = X_train.std(axis=0)
feat_std[feat_std == 0] = 1.0  # constant columns: avoid division by zero
X_train = (X_train - feat_mean) / feat_std
X_test = (X_test - feat_mean) / feat_std

# 5) Train from-scratch logistic regression
model = LogisticRegressionGD(lr=0.1, n_iter=3000, fit_intercept=True, random_state=0)
model.fit(X_train, y_train)

# Score the held-out rows and threshold at 0.5 to obtain hard class labels.
# NOTE(review): y_proba is used directly as a score vector below, so predict_proba is
# presumably returning one probability per row — confirm against the class.
y_proba = model.predict_proba(X_test)
y_pred = (y_proba >= 0.5).astype(int)

# Threshold-based metrics use y_pred; ROC AUC uses the continuous scores.
# zero_division=0 makes the score 0 when its denominator is empty.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)

print('From-scratch Logistic Regression:')
print({'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': auc})


In [None]:

# 6) Compare with scikit-learn
# Reference model: scikit-learn's LogisticRegression fitted on the same training split.
sk = SkLogReg(max_iter=2000).fit(X_train, y_train)

# Keep column 1 of predict_proba as the score, then threshold at 0.5 for hard labels.
sk_class_proba = sk.predict_proba(X_test)
sk_proba = sk_class_proba[:, 1]
sk_pred = (sk_proba >= 0.5).astype(int)

# Same metric set as the from-scratch model: label-based scores on sk_pred, AUC on sk_proba.
acc2 = accuracy_score(y_test, sk_pred)
prec2, rec2, f12 = (
    score_fn(y_test, sk_pred, zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)
auc2 = roc_auc_score(y_test, sk_proba)

sk_report = {'accuracy': acc2, 'precision': prec2, 'recall': rec2, 'f1': f12, 'roc_auc': auc2}
print('scikit-learn LogisticRegression:')
print(sk_report)


In [None]:

# 7) Plot ROC curves
# ROC points for each model, computed from the same held-out labels (thresholds are discarded).
fpr1, tpr1, _ = roc_curve(y_test, y_proba)
fpr2, tpr2, _ = roc_curve(y_test, sk_proba)

# Draw both curves on one Axes, plus the dashed diagonal from (0, 0) to (1, 1) as a reference.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr1, tpr1, label='From-scratch')
roc_ax.plot(fpr2, tpr2, label='sklearn')
roc_ax.plot([0, 1], [0, 1], linestyle='--')
roc_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve — Logistic Regression',
)
roc_ax.legend()
plt.show()


In [None]:

# 8) Inspect training loss
# Plot the values stored in model.loss_ against their index.
# NOTE(review): presumably one loss value per gradient-descent iteration — confirm in the class.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(model.loss_)
loss_ax.set(
    xlabel='Iteration',
    ylabel='Binary Cross-Entropy Loss',
    title='Training Loss — From-scratch Logistic Regression',
)
plt.show()
