 # Comparison of SGD for Logistic Regression: From Scratch vs Scikit-learn

 ## Introduction

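 This notebook compares two stochastic gradient descent (SGD) implementations of logistic regression, one written from scratch and one built on scikit-learn, on the same train/test split. The comparison covers accuracy, F1, ROC AUC, and training time.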

 ## Data Loading

 Load the preprocessed breast cancer train/test splits from `data/processed/`.

In [None]:
import os
import sys

import numpy as np

# Resolve the project root (three directories above the current working
# directory) and make it importable so the `src.*` packages can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)


from src.scratch.utils.viz_utils import plot_scatter_for_classification

# Load the four pre-split arrays in one pass; the tuple order below matches
# the order of the names being assigned.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        "../../../data/processed/breast_cancer_X_train.npy",
        "../../../data/processed/breast_cancer_X_test.npy",
        "../../../data/processed/breast_cancer_y_train.npy",
        "../../../data/processed/breast_cancer_y_test.npy",
    )
)

# Report the shape of every array as a quick sanity check on the split.
for caption, loaded in (
    ("Training features shape:", X_train),
    ("Test features shape:", X_test),
    ("Training target shape:", y_train),
    ("Test target shape:", y_test),
):
    print(caption, loaded.shape)

 ## Exploratory Data Analysis

 Scatter-plot the first two training features against the class labels.

In [None]:
# A two-feature scatter needs at least two columns; skip the plot otherwise.
if X_train.shape[1] >= 2:
    plot_scatter_for_classification(
        X_train,
        y_train,
        feature_indices=(0, 1),
        title="Feature 1 vs Feature 2",
        filename="feature1_vs_feature2_sgd_comp.png",
    )


 ## Training Both Models

 Train both models with the same learning rate (0.01) and iteration count (1000), timing each fit.

In [None]:
from src.scratch.models.logistic_regression import LogisticRegression
from src.sklearn_impl.logistic_regression_sk import LogisticRegressionSK
import time

# Fit the from-scratch model and record how long the fit takes.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system clock is adjusted.
model_scratch = LogisticRegression(method='stochastic_gd', learning_rate=0.01, n_iterations=1000)
start_time = time.perf_counter()
model_scratch.fit(X_train, y_train)
time_scratch = time.perf_counter() - start_time

# Fit the wrapper from src.sklearn_impl with the same learning rate and
# iteration count, timed the same way so the two durations are comparable.
model_sk = LogisticRegressionSK(method='sgd', learning_rate=0.01, n_iterations=1000)
start_time = time.perf_counter()
model_sk.fit(X_train, y_train)
time_sk = time.perf_counter() - start_time


 ## Performance Metrics

 Compare accuracy, F1, ROC AUC, and training time for both models on the test set.

In [None]:
from src.scratch.utils.metrics import (
    accuracy,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from src.scratch.utils.math_utils import sigmoid

# From-scratch model: predictions for accuracy/F1, plus scores for ROC AUC.
# NOTE(review): the scores are sigmoid(predict(X_test)). If predict() already
# returns hard class labels, this ranks only two distinct values and the ROC
# AUC is not probability-based -- confirm against the model's API.
y_pred_scratch = model_scratch.predict(X_test)
y_scores_scratch = sigmoid(model_scratch.predict(X_test))

# Wrapper model: predictions plus column 1 of predict_proba as the score.
y_pred_sk = model_sk.predict(X_test)
y_scores_sk = model_sk.predict_proba(X_test)[:, 1]

# Same three metrics for each model, computed against the test targets.
acc_scratch, f1_scratch, roc_auc_scratch = (
    accuracy(y_test, y_pred_scratch),
    f1_score(y_test, y_pred_scratch),
    roc_auc_score(y_test, y_scores_scratch),
)
acc_sk, f1_sk, roc_auc_sk = (
    accuracy(y_test, y_pred_sk),
    f1_score(y_test, y_pred_sk),
    roc_auc_score(y_test, y_scores_sk),
)

# One summary line per model, including the fit time measured earlier.
print(f"From Scratch - Accuracy: {acc_scratch:.4f}, F1: {f1_scratch:.4f}, ROC AUC: {roc_auc_scratch:.4f}, Time: {time_scratch:.4f} seconds")
print(f"Scikit-learn - Accuracy: {acc_sk:.4f}, F1: {f1_sk:.4f}, ROC AUC: {roc_auc_sk:.4f}, Time: {time_sk:.4f} seconds")


 ## Visual Comparison

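 Plot each model's learning curve and confusion matrix. Decision boundaries are drawn only when the data has exactly two features.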

In [None]:
from src.scratch.utils.viz_utils import (
    plot_learning_curve,
    plot_two_decision_boundaries,
    plot_confusion_matrix,
)

# Learning curves: one figure per model, built from its recorded loss history.
plot_learning_curve(
    model_scratch.get_loss_history(),
    title="Learning Curve (SGD Scratch)",
    filename="learning_curve_sgd_scratch_comp_log.png",
)
plot_learning_curve(
    model_sk.get_loss_history(),
    title="Learning Curve (SGD SK)",
    filename="learning_curve_sgd_sk_comp_log.png",
)

# Decision boundaries are only drawn when the data has exactly two features.
if X_test.shape[1] == 2:
    plot_two_decision_boundaries(
        model_scratch,
        model_sk,
        X_test,
        y_test,
        labels=["Scratch", "SK"],
        filename="decision_boundaries_sgd_comp.png",
    )

# Confusion matrices on the test set, one figure per model.
plot_confusion_matrix(
    y_test,
    y_pred_scratch,
    title="Confusion Matrix (SGD Scratch)",
    filename="confusion_matrix_sgd_scratch_comp.png",
)
plot_confusion_matrix(
    y_test,
    y_pred_sk,
    title="Confusion Matrix (SGD SK)",
    filename="confusion_matrix_sgd_sk_comp.png",
)


 ## Insights

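 Scikit-learn's SGD typically trains faster because its core loop runs in optimized compiled code, and it often reaches comparable or better accuracy. Check the metrics and timings printed above to confirm this for the current run.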