# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Load dataset and keep only source classes 1 and 2, relabelled to 0 and 1.
df = pd.read_csv("class_all_with_chronic_names.csv")
# reset_index returns a new, positionally indexed frame: the column assignment
# below no longer writes into a slice of the raw data, and y stays aligned with
# X, which is rebuilt from NumPy arrays (index 0..n-1) further down.
df = df[df['class'].isin([1, 2])].reset_index(drop=True)
df['class'] = df['class'].map({1: 0, 2: 1})
y = df['class']

# Drop irrelevant columns (only those actually present in the file)
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY', 'DIAG_TYPE']
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)
X = df.drop(columns=['class'])

# Encode categorical features as integer codes. Values are cast to str first,
# so a missing value becomes its own "nan" category. This mapping uses no
# labels and no statistics, so it is applied to all rows.
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Treat +/-inf as missing so the imputer below handles them
X = X.replace([np.inf, -np.inf], np.nan)

# Train/validation/test split (60/20/20, stratified), decided BEFORE any
# fitted preprocessing. Row positions are split instead of the frame itself so
# that every transformer below can be fitted on training rows only; fitting the
# imputer, feature selector or scaler on all rows would leak validation/test
# information into training and inflate the reported metrics.
row_ids = np.arange(len(X))
train_val_idx, test_idx = train_test_split(
    row_ids, stratify=y, test_size=0.2, random_state=42
)
train_idx, val_idx = train_test_split(
    train_val_idx, stratify=y.iloc[train_val_idx], test_size=0.25, random_state=42
)

# A column with no observed value in the training rows has no mean to impute;
# SimpleImputer would drop it and break the column labels, so remove it first.
all_missing = X.iloc[train_idx].isna().all()
X = X.loc[:, ~all_missing]

# Impute missing values with training-set column means
imputer = SimpleImputer(strategy='mean')
imputer.fit(X.iloc[train_idx])
X = pd.DataFrame(imputer.transform(X), columns=X.columns)

# Select top 30 features using Random Forest importances (training rows only,
# because this step looks at the labels)
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X.iloc[train_idx], y.iloc[train_idx])
top_30_features = (
    pd.Series(rf_temp.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .head(30)
    .index.tolist()
)
X = X[top_30_features]

# Scale features with training-set mean/std
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Materialise the splits from the row positions chosen above
X_train_val, y_train_val = X.iloc[train_val_idx], y.iloc[train_val_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

# Compute class weights. Each weight is keyed by its actual class label rather
# than by its position, so the mapping stays right if labels are not 0..k-1.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {int(cls): float(weight) for cls, weight in zip(classes, weights)}

# Define improved model
def build_stronger_model(input_dim):
    """Build and compile the dense binary classifier.

    The network has four hidden stages of 1024, 512, 256 and 128 units. Each
    stage is Dense -> BatchNormalization -> LeakyReLU(alpha=0.1) -> Dropout,
    with dropout rates 0.5, 0.4, 0.3 and 0.2. A single sigmoid unit produces
    the output. The model is compiled with Adam (learning rate 0.0001),
    binary cross-entropy loss and the accuracy metric.

    Args:
        input_dim: Number of input features, passed to the first Dense layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # (units, dropout rate) for each hidden stage, in order.
    hidden_stages = [(1024, 0.5), (512, 0.4), (256, 0.3), (128, 0.2)]

    net = Sequential()
    for stage, (units, drop_rate) in enumerate(hidden_stages):
        if stage == 0:
            # Only the first layer declares the input width.
            net.add(Dense(units, input_dim=input_dim))
        else:
            net.add(Dense(units))
        net.add(BatchNormalization())
        net.add(LeakyReLU(alpha=0.1))
        net.add(Dropout(drop_rate))

    net.add(Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Build the network for the number of feature columns in X_train, then fit it.
# Both callbacks watch the validation loss: the learning rate is multiplied by
# 0.3 after 10 epochs without improvement, and training stops after 20 such
# epochs, restoring the best weights seen.
model = build_stronger_model(X_train.shape[1])
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=10,
    verbose=1,
)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)

# Evaluation
def evaluate_dnn(model, X, y, label, *, threshold=0.5):
    """Print classification metrics for ``model`` on one data split.

    Prints the classification report, accuracy and ROC-AUC for the split.

    Args:
        model: Fitted model whose ``predict(X)`` output flattens to one
            positive-class score per row.
        X: Feature matrix for the split.
        y: True binary labels for the split.
        label: Split name used in the printed headings (e.g. "Train").
        threshold: Scores strictly greater than this are predicted as class 1.
            Defaults to 0.5, the previously hard-coded cut-off.

    Returns:
        Tuple ``(y, prob)``: the labels exactly as passed in, and the
        flattened predicted scores.
    """
    prob = model.predict(X).flatten()
    # Hard labels are used for the report and accuracy only; ROC-AUC is
    # computed from the raw scores and does not depend on the threshold.
    pred = (prob > threshold).astype(int)
    print(f"\n📊 {label} Classification Report:")
    print(classification_report(y, pred))
    print(f"✅ Accuracy: {accuracy_score(y, pred):.4f}")
    print(f"🎯 ROC-AUC: {roc_auc_score(y, prob):.4f}")
    return y, prob

# Score each split in turn (train, validation, test), keeping the returned
# (labels, probabilities) pair per split for the plots.
split_inputs = {
    "Train": (X_train, y_train),
    "Validation": (X_val, y_val),
    "Test": (X_test, y_test),
}
split_scores = {}
for split_name, (split_X, split_y) in split_inputs.items():
    split_scores[split_name] = evaluate_dnn(model, split_X, split_y, split_name)

y_train_true, y_train_prob = split_scores["Train"]
y_val_true, y_val_prob = split_scores["Validation"]
y_test_true, y_test_prob = split_scores["Test"]

# Plot ROC curves: one line per split, plus the chance diagonal
roc_inputs = [
    ('Train', y_train_true, y_train_prob),
    ('Validation', y_val_true, y_val_prob),
    ('Test', y_test_true, y_test_prob),
]
plt.figure(figsize=(8, 6))
for label, y_true, y_prob in roc_inputs:
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    plt.plot(fpr, tpr, label=f"{label} AUC = {auc:.2f}")

plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title("ROC Curves - Improved DNN")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()

# Confusion matrix (test set): threshold the predicted probabilities at 0.5
# and draw the true/predicted counts as an annotated heatmap.
y_test_pred = (y_test_prob > 0.5).astype(int)
cm = confusion_matrix(y_test_true, y_test_pred)

class_names = ["Class 0", "Class 1"]
plt.figure(figsize=(6, 5))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix - Test Set (Improved DNN)")
plt.tight_layout()
plt.show()
