In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, accuracy_score, roc_auc_score,
    precision_recall_curve, confusion_matrix, roc_curve
)
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt
import seaborn as sns

# === Load and Prepare Dataset ===
df = pd.read_csv("class_all_with_chronic_names.csv")
# Keep only rows labelled 1, 2 or 3. The explicit .copy() makes the filtered frame
# own its data, so the column assignment below does not go through pandas'
# chained-assignment (SettingWithCopy) path.
df = df[df['class'].isin([1, 2, 3])].copy()
# Binarise the target: class 1 -> 0, classes 2 and 3 -> 1.
df['class'] = df['class'].replace({1: 0, 2: 1, 3: 1})
y = df['class']
# Columns excluded from the feature matrix; errors='ignore' tolerates any of them
# being absent from the CSV.
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY', 'DIAG_TYPE']
X = df.drop(columns=drop_cols + ['class'], errors='ignore')

# === Encode Categorical ===
# Each object-dtype column is cast to str and replaced by integer codes from its own
# LabelEncoder. Missing entries are stringified by the cast too, so they receive a
# code of their own instead of reaching the numeric imputer as NaN.
cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    as_text = X[col].astype(str)
    le = LabelEncoder().fit(as_text)
    X[col] = le.transform(as_text)

# === Train-Validation-Test Split ===
# Split BEFORE fitting any data-dependent transform, so that imputation statistics
# and the feature-importance ranking never see validation or test rows.
# 20% is held out for test, then 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/validation/test split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42
)

# === Impute & Select Features (fitted on the training rows only) ===
# SimpleImputer discards columns that have no observed value at fit time, which would
# break the column-name alignment; keep only columns with at least one non-missing
# training value.
usable_cols = X_train.columns[X_train.notna().any().to_numpy()].tolist()
imputer = SimpleImputer(strategy="mean").fit(X_train[usable_cols])


def _impute(frame):
    """Return `frame` limited to `usable_cols`, with NaNs filled by training-set means."""
    return pd.DataFrame(
        imputer.transform(frame[usable_cols]), columns=usable_cols, index=frame.index
    )


X_train, X_val, X_test = _impute(X_train), _impute(X_val), _impute(X_test)
X_train_val, X = _impute(X_train_val), _impute(X)

# Rank features with a random forest trained on the training split only and keep the
# 40 most important ones (all of them if fewer than 40 exist).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
top_features = pd.Series(rf.feature_importances_, index=usable_cols).nlargest(40).index.tolist()
X_train, X_val, X_test = X_train[top_features], X_val[top_features], X_test[top_features]
X_train_val, X = X_train_val[top_features], X[top_features]

# === Compute Class Weights and Scale_Pos_Weight ===
# 'balanced' class weights are expanded into one weight per training row; the
# negative/positive count ratio is kept separately as scale_pos_weight.
train_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_labels, y=y_train)
weight_class0, weight_class1 = class_weights[0], class_weights[1]
is_class0 = (y_train == 0)
sample_weights = np.where(is_class0, weight_class0, weight_class1)
# int(...) keeps these as plain Python ints, so the ratio below is ordinary
# Python division.
num_class0 = int(is_class0.sum())
num_class1 = int((y_train == 1).sum())
scale_pos_weight = num_class0 / num_class1

# === Train XGBoost Model ===
# Hyper-parameters are collected in one mapping so they can be inspected or logged
# before the estimator is built.
# NOTE(review): the positive class is re-weighted twice here - once through
# scale_pos_weight and once through the balanced sample_weight passed to fit().
# Presumably only one of the two is intended; confirm before relying on the raw
# (uncalibrated) scores.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.03,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'reg_lambda': 10,
    'reg_alpha': 5,
    'scale_pos_weight': scale_pos_weight,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'eval_metric': 'auc',
    'random_state': 42,
}
base_model = xgb.XGBClassifier(**xgb_params)
base_model.fit(X_train, y_train, sample_weight=sample_weights)

# === Calibrate Model ===
# cv='prefit' reuses the already-trained base_model as is; only the sigmoid mapping
# is learned here, from the validation split.
calibrated_model = CalibratedClassifierCV(
    base_model,
    cv='prefit',
    method='sigmoid',
).fit(X_val, y_val)

# === Evaluation Function ===
def evaluate(model, X, y, label, threshold=0.636):
    """Print a thresholded classification summary for one data split.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its output
            is used as the positive-class score.
        X: Feature matrix passed to ``model.predict_proba``.
        y: True labels for ``X``.
        label: Name of the split, used only in the printed header.
        threshold: Scores greater than or equal to this value are predicted as 1.

    Returns:
        Tuple ``(y, y_prob, y_pred)``: the labels unchanged, the positive-class
        scores, and the thresholded 0/1 predictions.
    """
    y_prob = model.predict_proba(X)[:, 1]
    meets_threshold = y_prob >= threshold
    y_pred = meets_threshold.astype(int)

    print(f"\n📊 {label} Report (Threshold={threshold:.3f}):")
    print(classification_report(y, y_pred))

    # Accuracy depends on the threshold; ROC AUC is computed from the raw scores.
    accuracy = accuracy_score(y, y_pred)
    print(f"✅ Accuracy: {accuracy:.4f}")
    auc_value = roc_auc_score(y, y_prob)
    print(f"🎯 ROC AUC: {auc_value:.4f}")

    return y, y_prob, y_pred

# === Evaluate with Threshold = 0.636 ===
# The same decision threshold is applied to every split; the hard predictions are
# only kept for the test split.
THRESH = 0.636
y_train_true, y_train_prob, _ = evaluate(
    calibrated_model, X_train, y_train, label="Train", threshold=THRESH
)
y_val_true, y_val_prob, _ = evaluate(
    calibrated_model, X_val, y_val, label="Validation", threshold=THRESH
)
y_test_true, y_test_prob, y_test_pred = evaluate(
    calibrated_model, X_test, y_test, label="Test", threshold=THRESH
)

# === Confusion Matrix ===
# labels=[0, 1] pins the matrix to 2x2 in the same order as the tick labels below,
# even if one class is absent from both the test labels and the predictions.
cm = confusion_matrix(y_test_true, y_test_pred, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Worsening', 'Worsening'],
            yticklabels=['Non-Worsening', 'Worsening'])
# Show the decision threshold that produced y_test_pred in the title.
plt.title(f"Confusion Matrix - XGBoost (Threshold={THRESH:.3f})")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


# === ROC Curve Helper ===
def plot_roc(y_true, y_prob, label):
    """Add one ROC curve to the current matplotlib figure.

    Args:
        y_true: True binary labels.
        y_prob: Positive-class scores for the same rows as ``y_true``.
        label: Name shown in the legend, followed by the curve's AUC.

    The function only draws the line; creating the figure, adding the legend and
    calling ``plt.show()`` are left to the caller.
    """
    # The thresholds returned by roc_curve are not needed for plotting.
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc_score = roc_auc_score(y_true, y_prob)
    plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}")

# === Plot ROC for All Sets ===
# One curve per split on a shared figure, plus the diagonal of a random classifier.
plt.figure(figsize=(8, 6))
roc_inputs = (
    ("Train", y_train_true, y_train_prob),
    ("Validation", y_val_true, y_val_prob),
    ("Test", y_test_true, y_test_prob),
)
for split_name, split_truth, split_scores in roc_inputs:
    plot_roc(split_truth, split_scores, split_name)
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')

plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()