In [4]:
import pandas as pd
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Full dataset description:
# https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset/data

# Name of the label column; it is split off from the features further down.
target_col = 'default.payment.next.month'

# Read the credit-card CSV (path is resolved against the current working directory).
data = pd.read_csv('data/UCI_Credit_Card_600_samples.csv')

In [38]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors; `data` itself is left untouched.
# NOTE(review): every non-label column is kept as a feature. If the CSV still
# carries a row-identifier column (e.g. `ID`), it ends up in X -- confirm.
y = data[target_col]
X = data.drop(target_col, axis="columns")

# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# Standardize the features: statistics are learned on the training split only
# and then reused, unchanged, for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with regularization switched off (penalty=None).
model = LogisticRegression(penalty=None).fit(X_train_scaled, y_train)

# Hard class predictions for both splits.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Print each metric for train, then test.
# Note: support-weighted recall is arithmetically the same number as accuracy.
weighted = {"average": "weighted"}
for label, metric, truth, predicted, options in [
    ("Train Accuracy", accuracy_score, y_train, y_train_pred, {}),
    ("Test Accuracy", accuracy_score, y_test, y_test_pred, {}),
    ("Train Precision", precision_score, y_train, y_train_pred, weighted),
    ("Test Precision", precision_score, y_test, y_test_pred, weighted),
    ("Train Recall", recall_score, y_train, y_train_pred, weighted),
    ("Test Recall", recall_score, y_test, y_test_pred, weighted),
]:
    print(f"{label}: {metric(truth, predicted, **options):.4f}")


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the raw features with all polynomial and interaction terms up to
# degree 3 (no constant column); the expansion is fitted on the training split.
poly = PolynomialFeatures(degree=3, include_bias=False).fit(X_train)
X_train_poly, X_test_poly = poly.transform(X_train), poly.transform(X_test)

# Standardize the expanded features using training-split statistics only.
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Unregularized logistic regression (penalty=None) on the expanded features.
model = LogisticRegression(penalty=None).fit(X_train_scaled, y_train)

# Hard class predictions for both splits.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Print each metric for train, then test.
weighted = {"average": "weighted"}
for label, metric, truth, predicted, options in [
    ("Train Accuracy", accuracy_score, y_train, y_train_pred, {}),
    ("Test Accuracy", accuracy_score, y_test, y_test_pred, {}),
    ("Train Precision", precision_score, y_train, y_train_pred, weighted),
    ("Test Precision", precision_score, y_test, y_test_pred, weighted),
    ("Train Recall", recall_score, y_train, y_train_pred, weighted),
    ("Test Recall", recall_score, y_test, y_test_pred, weighted),
]:
    print(f"{label}: {metric(truth, predicted, **options):.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Generate polynomial features (degree = 3), fitted on the training split only
poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scale the polynomial features.
# NOTE: the scaler is fitted on the whole training split *before*
# cross-validation, so every validation fold has already contributed to the
# scaling statistics. Putting these steps inside a Pipeline avoids that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Define parameter grid for logistic regression
param_grid = {
    "penalty": ["l1", "l2"],  # Regularization type
    "C": [0.001, 0.005, 0.01, 0.1, 1],  # Inverse regularization strength (smaller = stronger)
}

# Perform Grid Search.
# The default "lbfgs" solver rejects penalty="l1": every L1 candidate would
# fail to fit and be scored as NaN, silently discarding half of the grid.
# "liblinear" supports both penalties searched here; random_state pins its
# internal data shuffling so repeated runs give the same result.
grid_search = GridSearchCV(
    LogisticRegression(solver="liblinear", random_state=1),
    param_grid,
    scoring="accuracy",
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

# Get best model and parameters (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate on training and test sets
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)

# Print evaluation metrics
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Train Precision: {precision_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Test Precision: {precision_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Train Recall: {recall_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Test Recall: {recall_score(y_test, y_test_pred, average='weighted'):.4f}")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define pipeline: Polynomial transformation -> Scaling -> Logistic Regression.
# Because the preprocessing lives inside the pipeline, it is re-fitted on the
# training part of every CV fold, so validation folds never leak into it.
# solver="liblinear" is required for the L1 candidates below: the default
# "lbfgs" solver rejects penalty="l1" and those fits would all fail.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(solver="liblinear", random_state=1))
])

# Define parameter grid
param_grid = {
    "poly__degree": [2, 3],  # Tune polynomial degree
    "logreg__penalty": ["l1", "l2"],  # L1/L2 regularization
    "logreg__C": [0.001, 0.01, 0.1, 1],  # Inverse regularization strength (smaller = stronger)
}

# Run Grid Search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="accuracy",  # Only optimize accuracy
    verbose=1,
    n_jobs=-1
)

# Fit Grid Search on the training split only. Fitting on the full (X, y)
# would let the held-out test rows influence both hyper-parameter selection
# and the final refit, making the test metrics below meaningless.
grid_search.fit(X_train, y_train)

# Print best parameters and best (cross-validated) accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.4f}")

# Evaluate the tuned pipeline itself. It takes the raw feature frames and
# applies the polynomial expansion and scaling internally, so no pre-scaled
# arrays or model objects from earlier cells are reused here.
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Print evaluation metrics
print(f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Train Precision: {precision_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Test Precision: {precision_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Train Recall: {recall_score(y_train, y_train_pred, average='weighted'):.4f}")
print(f"Test Recall: {recall_score(y_test, y_test_pred, average='weighted'):.4f}")
