 # Logistic Regression with Stochastic Gradient Descent (From Scratch)

 ## Introduction

 This notebook implements Logistic Regression with SGD from scratch on the breast cancer dataset.

 ## Data Loading

 Load the breast cancer data.

In [None]:
import os
import sys

import numpy as np

# Resolve the directory three levels above the notebook's working directory
# and put it on the import path so the `src.*` imports below can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
# Guard the append: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)


from src.scratch.utils.viz_utils import plot_scatter_for_classification

# Read the four pre-split breast cancer arrays from the processed-data folder.
processed_dir = "../../../data/processed"
X_train, X_test, y_train, y_test = (
    np.load(f"{processed_dir}/breast_cancer_{part}.npy")
    for part in ("X_train", "X_test", "y_train", "y_test")
)

# Report the shape of every split as a quick sanity check.
for label, array in (
    ("Training features", X_train),
    ("Test features", X_test),
    ("Training target", y_train),
    ("Test target", y_test),
):
    print(f"{label} shape:", array.shape)

 ## Exploratory Data Analysis

 Visualize two features colored by class (if possible).

In [None]:
# A two-feature scatter needs at least two columns; otherwise just report it.
if X_train.shape[1] < 2:
    print("Not enough features for scatter plot.")
else:
    # Plot the first two feature columns against each other, colored by label.
    plot_scatter_for_classification(
        X_train,
        y_train,
        filename="feature1_vs_feature2_sgd_scratch.png",
        title="Feature 1 vs Feature 2",
        feature_indices=(0, 1),
    )

In [None]:
import matplotlib.pyplot as plt

# Plot the first feature column against the training labels on explicit axes.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(X_train[:, 0], y_train, color="blue", alpha=0.7)
ax.set_xlabel("Feature")
ax.set_ylabel("Diagnosis")
ax.set_title("Sample Data Visualization (1D)")
ax.grid(True)
plt.show()

 ## Model Initialization

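 Create the logistic regression model configured to train with stochastic gradient descent (SGD), using learning-rate decay and early stopping.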

In [None]:
from src.scratch.models.logistic_regression import LogisticRegression

# Seed NumPy's global RNG before the model is built so that a
# Restart Kernel -> Run All can repeat the same stochastic run.
# NOTE(review): this assumes LogisticRegression draws its randomness from
# np.random's global state — confirm against the model implementation.
SEED = 42
np.random.seed(SEED)

# Logistic regression trained with the stochastic gradient descent solver,
# with learning-rate decay, early stopping and verbose output enabled.
# NOTE(review): n_iterations=2 looks very low for SGD — confirm what this
# parameter counts and whether the value is intentional.
model = LogisticRegression(
    method="stochastic_gd",
    learning_rate=0.01,
    n_iterations=2,
    verbose=True,
    lr_decay=0.005,
    early_stopping=True,
)

 ## Training

 Train and time the model.

In [None]:
import time

# Time the fit with perf_counter(): unlike time.time(), it is not affected by
# system clock adjustments, so the elapsed interval cannot jump or go negative.
start_time = time.perf_counter()
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time
print(f"Training Time: {training_time:.4f} seconds")

 ## Evaluation

 Calculate classification metrics.

In [None]:
from src.scratch.utils.metrics import (
    accuracy,
    precision_score,
    recall_score,
    f1_score,
    compute_roc_auc_score,
)
from src.scratch.utils.math_utils import sigmoid

# Predictions for the held-out split; computed once and reused below so the
# model does not run a second, redundant inference pass.
y_pred = model.predict(X_test)

# Scores for the ROC / precision-recall computations: the sigmoid of the
# predictions, used as a proxy since the model has no predict_proba.
# NOTE(review): if predict() returns hard class labels, these scores take only
# one distinct value per class, so ROC AUC and the curves reflect a single
# operating point rather than a ranking — confirm, and prefer the model's raw
# decision values if they can be exposed.
y_scores = sigmoid(y_pred)

# Threshold-based metrics use the predictions; ROC AUC uses the scores.
acc = accuracy(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = compute_roc_auc_score(y_test, y_scores)

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

 ## Visualizations

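 Plot the learning curve, the confusion matrix, and the ROC and precision-recall curves for the test set. The decision boundary is drawn only when the data has exactly two features.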

In [None]:
from src.scratch.utils.viz_utils import (
    plot_confusion_matrix,
    plot_decision_boundary,
    plot_learning_curve,
    plot_precision_recall_curve,
    plot_roc_curve,
)

# Suffix shared by the figure titles in this cell.
run_tag = "(SGD Scratch)"

# Loss history recorded by the model during fitting.
loss_history = model.get_loss_history()
plot_learning_curve(
    loss_history,
    filename="learning_curve_sgd_scratch_log.png",
    title=f"Learning Curve {run_tag}",
)

# The decision boundary is drawn only for exactly two input features.
n_features = X_test.shape[1]
if n_features == 2:
    plot_decision_boundary(
        model,
        X_test,
        y_test,
        filename="decision_boundary_sgd_scratch.png",
        title=f"Decision Boundary {run_tag}",
    )

# The confusion matrix uses the predictions; the curves use the scores.
plot_confusion_matrix(
    y_test,
    y_pred,
    filename="confusion_matrix_sgd_scratch.png",
    title=f"Confusion Matrix {run_tag}",
)
plot_roc_curve(
    y_test,
    y_scores,
    filename="roc_curve_sgd_scratch.png",
    title=f"ROC Curve {run_tag}",
)
plot_precision_recall_curve(y_test, y_scores)

 ## Conclusion

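 The accuracy, F1 score, and ROC AUC achieved by the SGD model are printed in the Evaluation section above; markdown cells do not interpolate Python variables, so the values are not repeated here. The visualizations show the model's classification performance on the test set.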