# Logistic Regression Model Implementation

**Objective:** Implement, train, evaluate, and tune a Logistic Regression model for predicting severe traffic accidents.

**Prerequisites:**
- Preprocessed data file (`data/processed/preprocessed_data.csv`).
- Utility functions from `src/modeling_utils.py` and `src/preprocessing_utils.py`.

**Key Libraries Imported:**
- Pandas
- NumPy
- Scikit-learn
- Matplotlib
- Seaborn

In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import time
import json
from modeling_utils import init_performance_excel, append_performance_record, init_performance_excel
from preprocessing_utils import load_raw_data, drop_outcome_columns, drop_identifiers, parse_datetime, extract_temporal_features, impute_missing_categorical, encode_categorical, parse_desc_features, save_preprocessed

# Seed NumPy's global RNG so NumPy-driven randomness in this notebook is repeatable
np.random.seed(42)

# Choose the dataset location for the current runtime.
# The presence of the COLAB_GPU environment variable is used as the Colab signal.
if 'COLAB_GPU' in os.environ:
    # Running in Google Colab: the CSV is read from a mounted Google Drive
    try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
    except ImportError:
        print("Google Colab module not available. Ensure this code is running in Google Colab.")
    data_path = '/content/drive/MyDrive/Colab_Notebooks/TrafficAccidentSeverity/data/processed/preprocessed_data.csv'
else:
    # Running locally (Jupyter Lab or other)
    data_path = '../data/processed/preprocessed_data.csv'

# Keep only the call that can raise FileNotFoundError inside the try body
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: Data file not found at {data_path}")
    if 'COLAB_GPU' in os.environ:
        print("Please ensure the file exists in your Google Drive at the specified path and that Drive is mounted.")
    else:
        print("Please ensure the file exists at the specified relative path for your local environment.")
    # Fail fast: without `df` every later cell would stop with an unrelated NameError
    raise
else:
    print(f"Successfully loaded data from: {data_path}")

Successfully loaded data from: ../data/processed/preprocessed_data.csv


## Data Preparation

In [16]:
# Split the frame into the feature matrix and the target column
X = df.drop(['SEVERITY'], axis=1)  # Drop only target variable as datetime has already been processed
y = df['SEVERITY']

# Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# One-hot encode the training split; the first level of each categorical is the baseline
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Encode the test split WITHOUT drop_first. Dropping the "first" level independently
# on the test split can remove a different level than the training baseline (whenever
# the training baseline is absent from the test split); the reindex below would then
# zero-fill that column and silently mis-encode those rows as the baseline.
X_test_encoded = pd.get_dummies(X_test)

# Keep exactly the training columns, in training order: this discards the baseline and
# any test-only levels, and zero-fills levels that never occur in the test split
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Standardise features; scaling statistics are learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

## Model Training & Evaluation

In [17]:
# Baseline classifier: liblinear solver, one-vs-rest, fixed seed
log_reg = LogisticRegression(solver='liblinear', multi_class='ovr', random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Hard labels and per-class probabilities for the held-out split
y_pred = log_reg.predict(X_test_scaled)
y_prob_all_classes = log_reg.predict_proba(X_test_scaled)

# Support-weighted precision / recall / F1 (the fourth return value is discarded)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    y_pred,
    average='weighted',
)

# ROC AUC: the probability matrix has one column per class, so a column count
# other than two takes the multiclass (one-vs-rest, weighted) path
if y_prob_all_classes.shape[1] != 2:
    roc_auc = roc_auc_score(y_test, y_prob_all_classes, multi_class='ovr', average='weighted')
else:
    # Binary case: score with the probability of the second (positive) class
    roc_auc = roc_auc_score(y_test, y_prob_all_classes[:, 1])

print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc:.4f}")



              precision    recall  f1-score   support

           0       0.93      1.00      0.96      4114
           1       0.00      0.00      0.00       301

    accuracy                           0.93      4415
   macro avg       0.47      0.50      0.48      4415
weighted avg       0.87      0.93      0.90      4415

ROC AUC Score: 0.6779


## Hyperparameter Tuning

In [19]:
# Full search space, one sub-grid per solver family. GridSearchCV expands each
# dict independently, so `max_iter` is only varied for sag/saga.
param_grid = [
    # liblinear: only the regularisation strength C is varied
    dict(solver=['liblinear'], C=[0.01, 0.1, 1, 10, 100]),
    # lbfgs / newton-cg: only C is varied; max_iter keeps its default
    dict(solver=['lbfgs', 'newton-cg'], C=[0.01, 0.1, 1, 10, 100]),
    # sag / saga: also try larger iteration budgets
    # (optional extra axis if convergence is still a problem: 'tol': [1e-4, 1e-3])
    dict(solver=['sag', 'saga'], C=[0.01, 0.1, 1, 10, 100], max_iter=[1000, 5000, 10000]),
]

# Reduced search space for quick smoke tests (not passed to the search below):
# fewer C values and a single small iteration budget for sag/saga
param_grid_fast = [
    dict(solver=['liblinear'], C=[0.01, 0.1, 1]),
    dict(solver=['lbfgs', 'newton-cg'], C=[0.01, 0.1, 1]),
    dict(solver=['sag', 'saga'], C=[0.01, 0.1, 1], max_iter=[100]),
]

# 5-fold cross-validated grid search over the full grid, scored by one-vs-rest
# ROC AUC and run on all available cores
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=5,
    n_jobs=-1,
)

# Wall-clock timing of the whole search
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
end_time = time.time()
training_time = end_time - start_time

# Winning configuration and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best CV ROC AUC Score (ovr): {best_score:.4f}")
print(f"Training Time: {training_time:.2f} seconds")




Best Parameters: {'C': 0.01, 'solver': 'liblinear'}
Best CV ROC AUC Score (ovr): 0.6527
Training Time: 296.38 seconds


## Logging Hyperparameter Trials & Results
> Results are appended to an Excel (`.xlsx`) summary file.

In [20]:
# Location of the shared performance workbook
excel_filepath = '../reports/model_performance_summary.xlsx'

# Initialize the Excel file if it doesn't exist
init_performance_excel(excel_filepath)

# The record pairs the tuned hyperparameters with `Train_*` metrics, so compute those
# metrics from the tuned estimator on the training split. Previously the test-set
# metrics of the untuned baseline model were stored under these keys.
best_model = grid_search.best_estimator_
train_pred = best_model.predict(X_train_scaled)
train_prob = best_model.predict_proba(X_train_scaled)

train_precision, train_recall, train_f1 = precision_recall_fscore_support(
    y_train, train_pred, average='weighted'
)[:3]

# ROC AUC for binary or multiclass targets (one probability column per class)
if train_prob.shape[1] == 2:
    train_roc_auc = roc_auc_score(y_train, train_prob[:, 1])
else:
    train_roc_auc = roc_auc_score(y_train, train_prob, multi_class='ovr', average='weighted')

# Prepare the record to append; grids and parameters are stored as JSON strings
record = {
    'Model_Name': 'Logistic Regression',
    'Hyperparameter_Set_Tried': json.dumps(param_grid),
    'CV_Score_for_Set': best_score,
    'Selected_Final_Hyperparameters': json.dumps(best_params),
    'Training_Time_Seconds': training_time,
    'Train_Precision': train_precision,
    'Train_Recall': train_recall,
    'Train_F1': train_f1,
    'Train_ROC_AUC': train_roc_auc
}

# Append the record to the Excel file
append_performance_record(excel_filepath, record)

  df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)


## Saving the Best Model

In [12]:
import joblib

# Define the model save path
model_save_path = '../models/logistic_regression_best_model.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there)
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# Persist the refitted best estimator found by the grid search
joblib.dump(grid_search.best_estimator_, model_save_path)

print(f"Best Logistic Regression model saved to: {model_save_path}")

Best Logistic Regression model saved to: ../models/logistic_regression_best_model.pkl
