# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve
)

from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier

# === Load and Prepare Dataset ===
# Raw string literal: the Windows path contains backslashes (\P, \O, \c) that
# would otherwise be parsed as (invalid) escape sequences.
df = pd.read_csv(r"E:\PXA252_BH\OlderFiles20250512\class_all_with_chronic_names.csv")

# Keep only rows labelled 1, 2 or 3 and collapse them into a binary target:
# original class 1 -> 0, original classes 2 and 3 -> 1.
# .copy() makes the filtered frame independent, so the assignment below does
# not write into a slice of the raw data.
df = df[df['class'].isin([1, 2, 3])].copy()
df['class'] = df['class'].replace({1: 0, 2: 1, 3: 1})
y = df['class']

# Identifier / diagnosis columns that are removed before modelling
# (columns missing from the CSV are skipped silently).
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY', 'DIAG_TYPE']
df = df.drop(columns=drop_cols, errors='ignore')
X = df.drop(columns=['class'])

# NOTE(review): the label encoders, the imputer and the random-forest feature
# ranking below are all fitted on the full dataset *before* the
# train/validation/test split, so validation and test rows influence the
# preprocessing. Consider fitting them on the training split only.

# Encode categorical columns as integer codes (missing values become the
# literal category "nan" because of the astype(str)).
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Impute missing values (infinities are treated as missing).
# index=X.index keeps X's row labels aligned with y, which still carries the
# index of the filtered frame.
X = X.replace([np.inf, -np.inf], np.nan)
X = pd.DataFrame(
    SimpleImputer(strategy="mean").fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Feature selection using Random Forest: keep the 30 most important features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
top_features = pd.Series(rf.feature_importances_, index=X.columns).nlargest(30).index
X = X[top_features]
# Only the categorical columns that survived feature selection are relevant.
cat_cols = [col for col in cat_cols if col in X.columns]

# === Stratified Train-Val-Test Split (60% / 20% / 20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42
)
# Independent copies: these frames get columns reassigned later, which would
# otherwise be an assignment into a slice of X.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# === Undersample the majority class in Training to a 1:1 ratio ===
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

# Convert categorical columns to string so they can be passed to CatBoost as
# categorical features.
for col in cat_cols:
    X_train_bal[col] = X_train_bal[col].astype(str)
    X_val[col] = X_val[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# CatBoost takes categorical features as positional column indices.
cat_features = [X_train_bal.columns.get_loc(col) for col in cat_cols]

# Compute class weights.
# NOTE(review): these balanced weights are not passed to the model below,
# which uses the hard-coded {0: 1.1, 1: 1} instead. Kept for reference.
class_weights_values = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train_bal), y=y_train_bal
)
class_weights = dict(enumerate(class_weights_values))

# === Train CatBoost Model with Regularization ===
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.03,
    depth=5,
    l2_leaf_reg=8,
    eval_metric='F1',
    random_seed=42,
    cat_features=cat_features,
    early_stopping_rounds=50,
    class_weights={0: 1.1, 1: 1},
    verbose=100
)


# The validation split drives early stopping and best-iteration selection.
model.fit(X_train_bal, y_train_bal, eval_set=(X_val, y_val), use_best_model=True)

# === Evaluation Function ===
def evaluate(model, X, y, label):
    """Print a classification summary for one dataset split.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix to score.
        y: True labels for ``X``.
        label: Name of the split, used in the printed report header.

    Returns:
        A tuple ``(y, scores)`` where ``scores`` is the second column of
        ``model.predict_proba(X)`` (presumably the class-1 probability).
    """
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    print(f"\n📊 {label} Classification Report:")
    report = classification_report(y, predicted_labels)
    print(report)

    accuracy = accuracy_score(y, predicted_labels)
    print(f"✅ Accuracy: {accuracy:.4f}")

    auc_value = roc_auc_score(y, positive_scores)
    print(f"🎯 ROC AUC: {auc_value:.4f}")

    return y, positive_scores

# Convert categorical features in all datasets to string before evaluation.
# DataFrame.astype returns new frames, so nothing is assigned into a slice of
# the original feature matrix; the conversion is a no-op for columns that are
# already strings.
str_dtypes = {col: str for col in cat_cols}
X_train = X_train.astype(str_dtypes)
X_val = X_val.astype(str_dtypes)
X_test = X_test.astype(str_dtypes)

# Evaluate at the model's default decision threshold
y_train_true, y_train_prob = evaluate(model, X_train, y_train, "Train")
y_val_true, y_val_prob = evaluate(model, X_val, y_val, "Validation")
y_test_true, y_test_prob = evaluate(model, X_test, y_test, "Test")

# === Threshold Optimization (Class 0 Recall-Based) ===
# Treat class 0 as the "positive" class by scoring with 1 - P(class 1).
y_val_prob_inv = 1 - y_val_prob
prec, rec, thresh = precision_recall_curve(y_val_true == 0, y_val_prob_inv)

# Select the threshold that maximizes class 0 recall while keeping class 0
# precision > 0.5. The curve's final (precision=1, recall=0) point has no
# associated threshold and can never win, so only the real thresholds are
# scanned. Falls back to 0.5 when no threshold qualifies.
best_inv_thresh = 0.5
max_recall = 0
for p, r, t in zip(prec, rec, thresh):
    if p > 0.5 and r > max_recall:
        max_recall = r
        best_inv_thresh = t

# Equivalent cut-off expressed on P(class 1), for reporting.
best_thresh = 1 - best_inv_thresh
print(f"🔧 Class 0 Recall-Optimized Threshold: {best_thresh:.3f}")

# === Evaluate on Test Set using this threshold
# The curve predicts class 0 when the inverted score is >= the threshold, so
# class 1 is predicted only when it is strictly below. Comparing in the
# inverted space avoids the floating-point round trip of 1 - (1 - p), which
# could flip samples sitting exactly on the boundary.
# (A second, duplicated report used an undefined `optimal_thresh`; both
# reports are merged into this one.)
y_test_pred_custom = ((1 - y_test_prob) < best_inv_thresh).astype(int)

print("\n📊 Test Classification Report (Class 0 Recall-Based Threshold):")
print(classification_report(y_test_true, y_test_pred_custom))

# === Confusion Matrix
cm = confusion_matrix(y_test_true, y_test_pred_custom)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.title("Confusion Matrix - Test (Custom Threshold)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# === ROC Curve (one curve per split, threshold-independent)
plt.figure(figsize=(8, 6))
for y_true, y_prob, name in zip(
    [y_train_true, y_val_true, y_test_true],
    [y_train_prob, y_val_prob, y_test_prob],
    ['Train', 'Validation', 'Test']
):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    plt.plot(fpr, tpr, label=f"{name} AUC = {auc:.2f}")
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves - CatBoost")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
