In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess data
df = pd.read_csv("class_all_with_chronic_names.csv")
# Keep only classes 1 and 2. `.copy()` detaches the filtered rows so the
# recoding below does not assign into a slice of the original frame, and the
# index is reset so X and y share one consistent RangeIndex.
df = df[df['class'].isin([1, 2])].copy().reset_index(drop=True)
df['class'] = df['class'].map({1: 0, 2: 1})  # binary target: 1 -> 0, 2 -> 1
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY', 'DIAG_TYPE']
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)
X = df.drop(columns=['class'])
y = df['class']

# Integer-encode text columns (missing values become the string "nan" and
# receive their own code).
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Treat infinities as missing so the imputer can fill them.
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Split into train, val, test BEFORE fitting the imputer or ranking features,
# so statistics learned from the data never include validation/test rows.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# A column with no observed value in the training split has no mean, and
# SimpleImputer would silently drop it (breaking the column labels), so such
# columns are excluded explicitly.
usable_cols = X_train.columns[X_train.notna().any()].tolist()
imputer = SimpleImputer(strategy='mean').fit(X_train[usable_cols])


def _impute(frame):
    """Return `frame` limited to `usable_cols`, NaNs filled with training-split means."""
    filled = imputer.transform(frame[usable_cols])
    return pd.DataFrame(filled, columns=usable_cols, index=frame.index)


# Feature selection: top 30 by random-forest importance, ranked on the
# training split only.
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(_impute(X_train), y_train)
top_30_features = pd.Series(rf_temp.feature_importances_, index=usable_cols).sort_values(ascending=False).head(30).index.tolist()

# Apply the fitted imputer and the selected columns to every frame so all of
# the names defined by this section keep the same meaning as before.
X, X_train_val, X_train, X_val, X_test = (
    _impute(frame)[top_30_features]
    for frame in (X, X_train_val, X_train, X_val, X_test)
)

# Grid search
# NOTE: 'auto' is no longer an accepted value for
# RandomForestClassifier.max_features (removed in scikit-learn 1.3); for
# classifiers it was an alias of 'sqrt', so listing both only duplicated
# candidates. 'log2' keeps the grid the same size with a distinct setting.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
# Fit on the training split only: 5-fold CV inside X_train picks the
# hyperparameters, and X_val stays unseen by the refitted model so that
# metrics computed on it are genuinely out-of-sample.
grid_search.fit(X_train, y_train)

# best_estimator_ is the model refitted on the data passed to fit() using
# the best parameter combination found.
best_rf = grid_search.best_estimator_
print("✅ Best Hyperparameters:", grid_search.best_params_)

# Evaluation helper
def evaluate(model, X, y, label):
    """Print a classification summary for one data split and return its ROC inputs.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X: Feature matrix of the split to score.
        y: True labels of the split.
        label: Split name inserted into the printed report header.

    Returns:
        A tuple ``(y, scores)`` where ``scores`` is column 1 of
        ``model.predict_proba(X)``.
    """
    predicted_labels = model.predict(X)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    positive_scores = model.predict_proba(X)[:, 1]

    print(f"\n📊 {label} Set Classification Report:")
    report_text = classification_report(y, predicted_labels)
    print(report_text)

    accuracy = accuracy_score(y, predicted_labels)
    print(f"✅ Accuracy: {accuracy:.4f}")

    auc_value = roc_auc_score(y, positive_scores)
    print(f"🎯 ROC-AUC: {auc_value:.4f}")

    return y, positive_scores

# Score the tuned model on each split; evaluate() prints the report and hands
# back the true labels together with the column-1 probabilities.
y_train_true, y_train_prob = evaluate(best_rf, X_train, y_train, "Train")
y_val_true, y_val_prob = evaluate(best_rf, X_val, y_val, "Validation")
y_test_true, y_test_prob = evaluate(best_rf, X_test, y_test, "Test")

# One figure holding the ROC curve of every split.
plt.figure(figsize=(8, 6))

# ROC points and AUC per split (thresholds returned by roc_curve are unused).
fpr_train, tpr_train, _ = roc_curve(y_train_true, y_train_prob)
auc_train = roc_auc_score(y_train_true, y_train_prob)

fpr_val, tpr_val, _ = roc_curve(y_val_true, y_val_prob)
auc_val = roc_auc_score(y_val_true, y_val_prob)

fpr_test, tpr_test, _ = roc_curve(y_test_true, y_test_prob)
auc_test = roc_auc_score(y_test_true, y_test_prob)

# Each entry: (false-positive rates, true-positive rates, legend text, line style).
roc_series = [
    (fpr_train, tpr_train, f"Train AUC = {auc_train:.2f}", '-'),
    (fpr_val, tpr_val, f"Validation AUC = {auc_val:.2f}", '--'),
    (fpr_test, tpr_test, f"Test AUC = {auc_test:.2f}", '-.'),
]
for fpr, tpr, legend_text, line_style in roc_series:
    plt.plot(fpr, tpr, label=legend_text, linestyle=line_style)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)

# Axis labels, title, legend and layout.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest on Train, Validation, Test")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()