# Logistic Regression Modeling Pipeline
This notebook performs logistic regression with hyperparameter tuning, evaluation, and logging to Excel.

In [13]:
# --- Standard Libraries ---
import os
import time
import json
from datetime import datetime

# --- Data Handling ---
import pandas as pd
import numpy as np

# --- Scikit-learn ---
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    classification_report,
    make_scorer,
    f1_score,
)
from sklearn.preprocessing import StandardScaler

# --- Model Persistence ---
import joblib

# --- Excel Handling ---
from openpyxl import load_workbook, Workbook

# --- Configuration ---
# Directory layout, expressed relative to the notebook's working directory.
PROCESSED_DATA_DIR = "../data/processed/"
REPORTS_DIR = "../reports/"
MODELS_DIR = "../models/"

# Artifacts written by this notebook.
EXCEL_LOG_PATH = os.path.join(REPORTS_DIR, "model_performance_summary.xlsx")
MODEL_SAVE_PATH = os.path.join(MODELS_DIR, "logistic_regression_model.joblib")

# Output folders must exist before anything is written into them.
for _output_dir in (REPORTS_DIR, MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# --- Reproducibility ---
# One seed shared by NumPy's global RNG and every `random_state=` argument below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## Helper Function: Log Results to Excel
This function logs hyperparameter tuning and model results to an Excel sheet.

In [None]:
def log_to_excel(
    excel_path,
    sheet_name="Model_Log",
    log_entry=None,
    trial_entries=None,
):
    """Append model-run records to a sheet of an Excel workbook.

    Parameters
    ----------
    excel_path : str
        Path of the ``.xlsx`` workbook. It is created if it does not exist.
    sheet_name : str, default "Model_Log"
        Worksheet that receives the rows. It is created (with a header row)
        if the workbook exists but the sheet does not.
    log_entry : dict, optional
        A single record (column name -> value), written after the trials.
    trial_entries : list of dict, optional
        Several records, written first and in the given order.

    Returns
    -------
    None
        Prints a one-line summary; nothing is written when both
        ``log_entry`` and ``trial_entries`` are empty.

    Notes
    -----
    When the sheet already exists, new rows are aligned to its existing
    header row: keys that are not in the header are dropped, and header
    columns missing from a record are left blank.
    """
    entries_to_add = []
    if trial_entries:
        entries_to_add.extend(trial_entries)
    if log_entry:
        entries_to_add.append(log_entry)

    if not entries_to_add:
        print("No entries to log.")
        return

    df_new_logs = pd.DataFrame(entries_to_add)

    try:
        book = load_workbook(excel_path)
    except FileNotFoundError:
        # First run: no workbook yet, so let pandas create it with a header row.
        with pd.ExcelWriter(excel_path, engine="openpyxl", mode="w") as writer:
            df_new_logs.to_excel(writer, sheet_name=sheet_name, index=False)
    else:
        if sheet_name in book.sheetnames:
            sheet = book[sheet_name]
            # Align the new rows to whatever column order the sheet already uses.
            header = [cell.value for cell in sheet[1]]
            df_new_logs = df_new_logs.reindex(columns=header)
        else:
            sheet = book.create_sheet(sheet_name)
            sheet.append(df_new_logs.columns.tolist())

        for record in df_new_logs.itertuples(index=False, name=None):
            # Missing values (NaN introduced by reindex or by ragged records)
            # become empty cells, matching what DataFrame.to_excel writes.
            sheet.append(
                [
                    None if pd.api.types.is_scalar(value) and pd.isna(value) else value
                    for value in record
                ]
            )
        book.save(excel_path)

    print(f"Logged {len(entries_to_add)} entries to {excel_path}, sheet '{sheet_name}'")

## Load or Simulate Data
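This section simulates an imbalanced binary classification dataset with `make_classification` (a stand-in for the preprocessed data), splits it into stratified train/test sets, and standardizes the features.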

In [16]:
from sklearn.datasets import make_classification

# Synthetic binary problem with class weights of 0.8 / 0.2.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    weights=[0.8, 0.2],
    random_state=RANDOM_STATE,
)

# Hold out 20% for testing; stratify so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Scaling statistics come from the training split only and are reused on the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Loaded and Scaled:")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"Class distribution in y_train: {np.bincount(y_train)}")

Data Loaded and Scaled:
X_train_scaled shape: (800, 20)
y_train shape: (800,)
Class distribution in y_train: [639 161]


## Hyperparameter Tuning via GridSearchCV
Tune logistic regression over solvers, penalties (L1/L2), and regularization strengths `C`, using 5-fold cross-validation scored by weighted F1.

In [17]:
# Inverse regularization strengths tried for every solver/penalty combination.
regularization_strengths = [0.01, 0.1, 1, 10, 100]

# Three sub-grids, one per group of solver/penalty pairings searched.
param_grid = [
    {
        "solver": ["liblinear"],
        "C": regularization_strengths,
        "penalty": ["l1", "l2"],
        "multi_class": ["ovr"],
    },
    {
        "solver": ["lbfgs", "saga"],
        "C": regularization_strengths,
        "penalty": ["l2"],
        "multi_class": ["auto"],
    },
    {
        "solver": ["saga"],
        "C": regularization_strengths,
        "penalty": ["l1"],
        "multi_class": ["auto"],
    },
]

# Settings shared by every candidate; the grid supplies the rest.
log_reg_base = LogisticRegression(
    class_weight="balanced",
    max_iter=5000,
    random_state=RANDOM_STATE,
)

grid_search = GridSearchCV(
    log_reg_base,
    param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Best Score: {grid_search.best_score_:.4f}")
print(f"Best Params: {grid_search.best_params_}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits




Best Score: 0.8335
Best Params: {'C': 0.1, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'saga'}




## Log All Grid Search Trials to Excel

In [18]:
# Build one log record per grid-search candidate and append them all to the workbook.
trial_log_entries = []
cv_results = grid_search.cv_results_

trial_columns = zip(
    cv_results["params"],
    cv_results["mean_test_score"],
    cv_results["mean_fit_time"],
    cv_results["std_test_score"],
)

for trial_params, mean_score, mean_fit_time, std_score in trial_columns:
    # Work on a copy: the dicts in cv_results_ belong to the fitted search
    # object and must not be modified by logging.
    params = dict(trial_params)
    if params.get("solver") == "liblinear" and "multi_class" not in params:
        params["multi_class"] = "ovr"

    trial_log_entries.append({
        "Model Name": "Logistic Regression - Trial",
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Hyperparameters": json.dumps(params),
        "CV Score (Mean Test f1_weighted)": mean_score,
        "Mean Fit Time (s)": mean_fit_time,
        "Std Dev CV Score": std_score,
        "Notes/Observations": "GridSearchCV trial run."
    })

log_to_excel(EXCEL_LOG_PATH, trial_entries=trial_log_entries)

Logged 25 entries to ../reports/model_performance_summary.xlsx, sheet 'Model_Log'


## Train Final Model Using Best Hyperparameters

In [21]:
# Copy the winning hyperparameters so that adjusting them here never
# modifies grid_search.best_params_ itself.
final_model_params = dict(grid_search.best_params_)

# Fallback for a liblinear winner that carries no multi_class setting.
# The key must be a real LogisticRegression argument ("multi_class"),
# otherwise unpacking the dict below raises TypeError.
if final_model_params.get("solver") == "liblinear" and "multi_class" not in final_model_params:
    final_model_params["multi_class"] = "ovr"

# Same fixed settings as the grid-search base estimator.
final_log_reg = LogisticRegression(
    **final_model_params,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    max_iter=5000
)

final_log_reg.fit(X_train_scaled, y_train)



## Evaluate Final Model on Train and Test Sets

In [20]:
# --- Predictions ---
# Training split: hard labels and per-class probabilities.
y_train_pred = final_log_reg.predict(X_train_scaled)
y_train_prob = final_log_reg.predict_proba(X_train_scaled)

# Test split: hard labels and per-class probabilities.
y_test_pred = final_log_reg.predict(X_test_scaled)
y_test_prob = final_log_reg.predict_proba(X_test_scaled)

# --- Metrics ---
def evaluate(y_true, y_pred, y_prob):
    """Compute weighted precision/recall/F1 and, for two-class targets, ROC AUC.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_prob : 2-D array
        Predicted probabilities; column index 1 is used as the score for ROC AUC.

    Returns
    -------
    tuple
        ``(precision, recall, f1, roc_auc)``. ``roc_auc`` is ``None`` unless
        ``y_true`` contains exactly two distinct values.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    has_two_classes = len(np.unique(y_true)) == 2
    if not has_two_classes:
        return precision, recall, f1, None

    return precision, recall, f1, roc_auc_score(y_true, y_prob[:, 1])

train_metrics = evaluate(y_train, y_train_pred, y_train_prob)
test_metrics = evaluate(y_test, y_test_pred, y_test_prob)


def _format_metrics(metrics):
    """Render a (precision, recall, f1, roc_auc) tuple as one report line.

    ``roc_auc`` may be ``None`` (evaluate returns that when the target is not
    two-class); it is shown as "n/a" instead of failing in the float format.
    """
    precision, recall, f1, roc_auc = metrics
    roc_auc_text = "n/a" if roc_auc is None else f"{roc_auc:.4f}"
    return f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC AUC: {roc_auc_text}"


print("Training Metrics:")
print(_format_metrics(train_metrics))
print("\nTesting Metrics:")
print(_format_metrics(test_metrics))

Training Metrics:
Precision: 0.8687, Recall: 0.8275, F1: 0.8391, ROC AUC: 0.9048

Testing Metrics:
Precision: 0.7989, Recall: 0.7250, F1: 0.7479, ROC AUC: 0.7992
