# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures

# Load dataset and keep only classes 1 and 2, recoded to a binary 0/1 target.
# .copy() detaches the filtered rows so the column assignment and the in-place
# drop below act on an independent frame instead of a slice of the raw read
# (avoids pandas' chained-assignment warning).
df = pd.read_csv("class_all_with_chronic_names.csv")
df = df[df['class'].isin([1, 2])].copy()
df['class'] = df['class'].map({1: 0, 2: 1})
y = df['class']

# Drop ID columns (only those actually present in the file)
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY', 'DIAG_TYPE']
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

X = df.drop(columns=['class'])

# Label encode object columns. This mapping never looks at y, and fitting it on
# the whole frame guarantees every category has a code in every split.
# astype(str) turns missing values into the literal category 'nan'.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Treat infinities as missing so the imputer inside the pipeline fills them.
X = X.replace([np.inf, -np.inf], np.nan)

# Train/Validation/Test split (60/20/20). The split happens BEFORE any fitted
# preprocessing so that no statistic (imputation means, scaling parameters,
# feature-selection scores) is ever computed from validation or test rows.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42
)

# All fitted steps live in one Pipeline. GridSearchCV clones and refits the
# whole pipeline on the training part of each CV fold, so imputation, scaling,
# interaction features and the supervised feature selection are learned from
# training data only.
pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('select', SelectKBest(score_func=f_classif, k=100)),  # keep top 100 features
    ('clf', LogisticRegression(max_iter=5000)),
])

# Grid Search (the clf__ prefix routes each parameter to the classifier step)
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['elasticnet'],
    'clf__l1_ratio': [0.1, 0.5, 0.9],
    'clf__solver': ['saga'],  # saga supports elasticnet
    'clf__class_weight': [None, 'balanced']
}

grid_lr = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2
)

# Fit on the training split only: the validation and test splits stay unseen,
# so metrics reported on them are genuine out-of-sample estimates.
grid_lr.fit(X_train, y_train)

# best_lr is the full pipeline refit on X_train with the best parameters; it
# accepts the raw (label-encoded) feature frames directly.
best_lr = grid_lr.best_estimator_
print("🔍 Best Parameters:", grid_lr.best_params_)

# Keep the fitted preprocessing steps reachable under their previous names.
scaler = best_lr.named_steps['scale']
poly = best_lr.named_steps['poly']
selector = best_lr.named_steps['select']

# Evaluate helper
def evaluate(model, X, y, label):
    """Print classification metrics for one data split.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix of the split to score.
        y: True labels for ``X``.
        label: Split name used in the printed headings.

    Returns:
        A ``(y, scores)`` tuple where ``scores`` is the second column of
        ``model.predict_proba(X)``.
    """
    hard_labels = model.predict(X)
    # Column 1 of predict_proba is used as the score for the ROC-AUC.
    positive_scores = model.predict_proba(X)[:, 1]

    print(f"\n📊 {label} Set Classification Report:")
    print(classification_report(y, hard_labels))

    accuracy = accuracy_score(y, hard_labels)
    print(f"✅ Accuracy: {accuracy:.4f}")

    auc = roc_auc_score(y, positive_scores)
    print(f"🎯 ROC-AUC: {auc:.4f}")

    return y, positive_scores

# Score the tuned model on each split. evaluate() prints the metrics and
# returns the labels together with the positive-class scores.
y_train_true, y_train_prob = evaluate(best_lr, X_train, y_train, "Train")
y_val_true, y_val_prob = evaluate(best_lr, X_val, y_val, "Validation")
y_test_true, y_test_prob = evaluate(best_lr, X_test, y_test, "Test")

# Single figure holding the ROC curves of all three splits
plt.figure(figsize=(8, 6))

# ROC points per split (the thresholds are not used)
fpr_train, tpr_train, _ = roc_curve(y_train_true, y_train_prob)
fpr_val, tpr_val, _ = roc_curve(y_val_true, y_val_prob)
fpr_test, tpr_test, _ = roc_curve(y_test_true, y_test_prob)

# Area under each curve, shown in the legend
auc_train = roc_auc_score(y_train_true, y_train_prob)
auc_val = roc_auc_score(y_val_true, y_val_prob)
auc_test = roc_auc_score(y_test_true, y_test_prob)

# Draw the curves in Train, Validation, Test order, each with its own line style
roc_series = (
    (fpr_train, tpr_train, f"Train AUC = {auc_train:.2f}", '-'),
    (fpr_val, tpr_val, f"Validation AUC = {auc_val:.2f}", '--'),
    (fpr_test, tpr_test, f"Test AUC = {auc_test:.2f}", '-.'),
)
for fpr, tpr, legend_text, line_style in roc_series:
    plt.plot(fpr, tpr, label=legend_text, linestyle=line_style)

# Diagonal reference line plus axis labels, title and legend
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression (Train, Val, Test)")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

