# 4. Logistic Regression

## Loading Image dataset

In [None]:
import numpy as np

# Open the .npz archive and show the names of the arrays stored in it
data = np.load('dataset_features.npz')
print(data.files)

# Pull out the feature and label arrays of each split, one split per line
X_train, y_train = data['trainset_features'], data['trainset_labels']
X_val, y_val = data['validset_features'], data['validset_labels']
X_test, y_test = data['testset_features'], data['testset_labels']

# Class label array; used further down to name rows in the plots and tables
class_labels = data['class_labels']

In [None]:
def decode_class(y):
    """Return, for each row of ``y``, the column index of its largest value.

    Used in this notebook to turn the label arrays into integer class ids
    (assumes ``y`` is a 2-D one-hot / score array of shape
    (n_samples, n_classes) — TODO confirm).
    """
    class_ids = np.argmax(y, axis=1)
    return class_ids

## Model training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create the baseline logistic-regression pipeline: standardize the features,
# then fit a multinomial model without any regularization penalty.
# random_state pins the data shuffling done by the stochastic 'saga' solver so
# that re-running this cell reproduces the same coefficients and scores.
# NOTE(review): penalty='none' is deprecated in recent scikit-learn releases in
# favour of penalty=None — confirm against the installed version.
logreg_pipe = Pipeline(
    [("scaler", StandardScaler()),
     ("logreg", LogisticRegression(multi_class='multinomial', solver='saga', penalty='none', random_state=0))]
)

# Fit it to train data
logreg_pipe.fit(X_train, decode_class(y_train))

## Model scores

In [None]:
# Report the accuracy of the baseline pipeline on every data split
print('Model Accuracy:')
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('valid', X_val, y_val),
    ('test ', X_test, y_test),  # trailing space keeps the printed lines aligned
):
    split_accuracy = logreg_pipe.score(X_split, decode_class(y_split))
    print(f'On {split_name} set: {split_accuracy:.3f}')

## Visualization of model coefficients

In [None]:
# Take the fitted classifier step out of the pipeline and inspect the shape
# of its coefficient array
logreg_step = logreg_pipe.named_steps['logreg']
coefficients = logreg_step.coef_
coefficients.shape

Visualize the coefficients as a heatmap

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the coefficient magnitudes, drawn on an explicitly created axes;
# rows are labelled with the class labels, columns are feature indices
fig, ax = plt.subplots(figsize=(12, 6))
coefficient_magnitudes = np.abs(coefficients)
sns.heatmap(coefficient_magnitudes, annot=False, cbar=True, ax=ax)
ax.set_yticklabels(class_labels, rotation=0)
ax.set_xlabel("Feature Index")
ax.set_title("Logistic Regression Coefficients Heatmap")

Find the indices of the top 5 largest coefficients (absolute values) for each class

In [None]:
import pandas as pd

# For every row of the coefficient matrix, rank the features by absolute
# coefficient value and keep the indices of the five largest
top_features = {}
for class_index, class_coefficients in enumerate(coefficients):
    magnitudes = np.abs(class_coefficients)
    descending_order = np.argsort(-magnitudes)  # negate so the largest magnitude comes first
    top_features[f"{class_labels[class_index]}"] = descending_order[:5]

# One row per class label, one column per rank
rank_columns = [f"Feature {rank}" for rank in range(1, 6)]
top_features_df = pd.DataFrame.from_dict(top_features, orient='index', columns=rank_columns)
print("Top 5 Largest Coefficients for Each Class (Feature Indices):")
print(top_features_df)

## Model regularization

In [None]:
# Stack the training and validation splits row-wise; the grid search below
# creates its own folds from the pooled data
X_crossval = np.concatenate([X_train, X_val], axis=0)
y_crossval_encoded = np.concatenate([y_train, y_val], axis=0)
y_crossval = decode_class(y_crossval_encoded)

In [None]:
from sklearn.model_selection import GridSearchCV

# L2-regularized multinomial logistic regression; its strength C is tuned below
log_reg = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    multi_class='multinomial',
)

# Candidate values of C: 20 points, log-spaced from 1e-6 to 1e3
c_candidates = np.logspace(-6, 3, 20)
param_grid = {'C': c_candidates}

# 5-fold cross-validated search scored by accuracy, using all available
# processors; training scores are kept so they can be compared with the
# validation scores afterwards
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# Run the search on the pooled train + validation data
grid_search.fit(X_crossval, y_crossval)

In [None]:
# Tabulate everything the grid search recorded
results = pd.DataFrame(grid_search.cv_results_)

# Keep the regularization strength together with the mean and standard
# deviation (across folds) of the training and validation scores
summary_columns = [
    'param_C',
    'mean_train_score',
    'std_train_score',
    'mean_test_score',
    'std_test_score',
]

# Show the best validation score first
results_df = results[summary_columns].sort_values(by='mean_test_score', ascending=False)
results_df

- **mean_train_score** and **std_train_score** focus on the model's fit to the training data.
- **mean_test_score** and **std_test_score** assess the model's ability to generalize to unseen data.
    
- **Consistency and stability:**
    - Low standard deviations (**std_train_score** and **std_test_score**) indicate consistent performance across the cross-validation folds.
    - High mean validation scores with low standard deviations are desirable for a robust, well-generalizing model.

These metrics help diagnose overfitting, underfitting, or data-related issues during hyperparameter tuning.

In [None]:
# Order the grid-search results by C so the curves run left to right
results_df = results_df.sort_values(by='param_C')

# x values and the per-C mean / standard deviation of the fold scores
param_c = results_df['param_C'].tolist()
train_scores_mean, train_scores_std = (
    results_df[column].to_numpy() for column in ('mean_train_score', 'std_train_score')
)
val_scores_mean, val_scores_std = (
    results_df[column].to_numpy() for column in ('mean_test_score', 'std_test_score')
)

# Mean accuracy curves for training and validation
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
acc_ax.plot(param_c, train_scores_mean, label="Training Accuracy", marker="o")
acc_ax.plot(param_c, val_scores_mean, label="Validation Accuracy", marker="o")

# Shaded bands: mean plus/minus one standard deviation across folds
acc_ax.fill_between(param_c, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2)
acc_ax.fill_between(param_c, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.2)

acc_ax.set_title("Training and Validation Accuracy")
acc_ax.set_xlabel("C Parameter")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_xscale('log')  # C was sampled on a logarithmic grid
acc_ax.legend()
acc_ax.grid()

We observe that for values of C > 0.001 the model starts to overfit and no longer gains any capability to generalize. This is indicated by the gap that opens between the model accuracy on the training and validation sets.

In [None]:
# Create the tuned logistic-regression pipeline: standardize the features,
# then fit an L2-regularized multinomial model with the chosen strength C.
# penalty='l2' is required here: with penalty='none' the C value is ignored
# and the model would be identical to the unregularized baseline.
# random_state pins the data shuffling done by the stochastic 'saga' solver.
# NOTE(review): the grid search above was run with solver='lbfgs' and without
# the StandardScaler step, so the C found there may not transfer exactly to
# this scaled pipeline — consider searching over the pipeline itself.
# NOTE(review): the accompanying text cites C > 0.001 as the start of
# overfitting while C=0.005 is used here — confirm the intended value.
logreg_pipe_tuned = Pipeline(
    [("scaler", StandardScaler()),
     ("logreg", LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', C=0.005, random_state=0))]
)

# Fit the tuned pipeline (not the baseline `logreg_pipe`) to train data, so
# that the scores reported afterwards come from a fitted model
logreg_pipe_tuned.fit(X_train, decode_class(y_train))

## Tuned model scores

In [None]:
# Report the accuracy of the tuned pipeline on every data split
print('Model Accuracy:')
for split_name, X_split, y_split in (
    ('train', X_train, y_train),
    ('valid', X_val, y_val),
    ('test ', X_test, y_test),  # trailing space keeps the printed lines aligned
):
    split_accuracy = logreg_pipe_tuned.score(X_split, decode_class(y_split))
    print(f'On {split_name} set: {split_accuracy:.3f}')