In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic MIMIC data ===
df_synthetic = pd.read_csv("generated_data_Our_prompts_MIMIC.csv")  # Your binarized MIMIC file

# === Sensitive attribute ===
sensitive_attr = "race"

# === Decode byte strings if needed ===
for col in df_synthetic.columns:
    if df_synthetic[col].dtype == object:
        df_synthetic[col] = df_synthetic[col].apply(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Train-test split (70% train, 30% test) ===
X = df_synthetic.drop(columns=["los_seconds"])
y = df_synthetic["los_seconds"].astype(int)
race = df_synthetic[sensitive_attr]

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Convert categorical columns to numeric ===
for df_ in [X_train, X_test]:
    for col in df_.select_dtypes(include=["object", "bool"]).columns:
        df_[col] = pd.factorize(df_[col])[0]

# === Define models ===
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions
    out_df = X_test.copy()
    out_df["predicted_label"] = preds
    out_df["race"] = race_test.values
    out_df["true_label"] = y_test.values
    out_df.to_csv(f"Our_TS_MIMIC_predictions_{name}.csv", index=False)

print("\n✅ Saved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.8322  Prec=0.9131  Rec=0.8972  AUROC=0.5972
Logistic_Regression  Acc=0.8946  Prec=0.8943  Rec=1.0000  AUROC=0.6867
Random_Forest        Acc=0.8985  Prec=0.9034  Rec=0.9923  AUROC=0.7057
SVM                  Acc=0.8917  Prec=0.8940  Rec=0.9967  AUROC=0.4712
XGBoost              Acc=0.8966  Prec=0.9114  Rec=0.9792  AUROC=0.7313

✅ Saved prediction CSVs for all models.


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic MIMIC data ===
df_synthetic = pd.read_csv("generated_data_CLLM_prompt_Mimic.csv")  # Mimic synthetic data

# === Sensitive attribute ===
sensitive_attr = "race"

# === Decode byte strings if needed ===
for col in df_synthetic.columns:
    if df_synthetic[col].dtype == object:
        df_synthetic[col] = df_synthetic[col].apply(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Train-test split (70% train, 30% test) ===
X = df_synthetic.drop(columns=["los_seconds"])
y = df_synthetic["los_seconds"].astype(int)
race = df_synthetic[sensitive_attr]

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Convert categorical columns to numeric ===
for df_ in [X_train, X_test]:
    for col in df_.select_dtypes(include=["object", "bool"]).columns:
        df_[col] = pd.factorize(df_[col])[0]

# === Define models ===
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions
    out_df = X_test.copy()
    out_df["predicted_label"] = preds
    out_df["race"] = race_test.values
    out_df["true_label"] = y_test.values
    out_df.to_csv(f"CLLM_TS_MIMIC_predictions_{name}.csv", index=False)

print("\n✅ Saved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.5576  Prec=0.6369  Rec=0.6400  AUROC=0.5343
Logistic_Regression  Acc=0.6104  Prec=0.6104  Rec=1.0000  AUROC=0.5949
Random_Forest        Acc=0.6543  Prec=0.6717  Rec=0.8480  AUROC=0.6487
SVM                  Acc=0.6113  Prec=0.6109  Rec=1.0000  AUROC=0.4804
XGBoost              Acc=0.6143  Prec=0.6701  Rec=0.7248  AUROC=0.6371

✅ Saved prediction CSVs for all models.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic MIMIC data ===
df_synthetic = pd.read_csv("mimic_synthetic_data_3400_samples_DECAF.csv")  # Decaf synthetic data

# === Sensitive attribute ===
sensitive_attr = "race"

# === Decode byte strings if needed ===
for col in df_synthetic.columns:
    if df_synthetic[col].dtype == object:
        df_synthetic[col] = df_synthetic[col].apply(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

# === Binarize los_seconds as label (>= 345600 seconds => 1, else 0)
df_synthetic["los_seconds"] = pd.to_numeric(df_synthetic["los_seconds"], errors="coerce")
df_synthetic = df_synthetic.dropna(subset=["los_seconds"])
df_synthetic["label"] = (df_synthetic["los_seconds"] >= 345600).astype(int)

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Train-test split (70% train, 30% test) ===
X = df_synthetic.drop(columns=["los_seconds", "label"])
y = df_synthetic["label"]
race = df_synthetic[sensitive_attr]

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Convert categorical columns to numeric ===
for df_ in [X_train, X_test]:
    for col in df_.select_dtypes(include=["object", "bool"]).columns:
        df_[col] = pd.factorize(df_[col])[0]

# === Define models ===
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions
    out_df = X_test.copy()
    out_df["predicted_label"] = preds
    out_df["race"] = race_test.values
    out_df["true_label"] = y_test.values
    out_df.to_csv(f"DECAF_TS_MIMIC_predictions_{name}.csv", index=False)

print("\n✅ Saved prediction CSVs for all models.")


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.6422  Prec=0.5166  Rec=0.4160  AUROC=0.5948
Logistic_Regression  Acc=0.6971  Prec=0.6542  Rec=0.3733  AUROC=0.7317
Random_Forest        Acc=0.7176  Prec=0.7458  Rec=0.3520  AUROC=0.7723
SVM                  Acc=0.6324  Prec=0.0000  Rec=0.0000  AUROC=0.5449
XGBoost              Acc=0.6971  Prec=0.6755  Rec=0.3387  AUROC=0.7358

✅ Saved prediction CSVs for all models.


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic MIMIC data ===
df_synthetic = pd.read_csv("Real_MIMIC.csv")  # Real MIMIC data

# === Sensitive attribute ===
sensitive_attr = "race"

# === Decode byte strings if needed ===
for col in df_synthetic.columns:
    if df_synthetic[col].dtype == object:
        df_synthetic[col] = df_synthetic[col].apply(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

# === Binarize los_seconds as label (>= 345600 seconds => 1, else 0)
df_synthetic["los_seconds"] = pd.to_numeric(df_synthetic["los_seconds"], errors="coerce")
df_synthetic = df_synthetic.dropna(subset=["los_seconds"])
df_synthetic["label"] = (df_synthetic["los_seconds"] >= 345600).astype(int)

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Train-test split (70% train, 30% test) ===
X = df_synthetic.drop(columns=["los_seconds", "label"])
y = df_synthetic["label"]
race = df_synthetic[sensitive_attr]

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Convert categorical columns to numeric ===
for df_ in [X_train, X_test]:
    for col in df_.select_dtypes(include=["object", "bool"]).columns:
        df_[col] = pd.factorize(df_[col])[0]

# === Define models ===
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions
    out_df = X_test.copy()
    out_df["predicted_label"] = preds
    out_df["race"] = race_test.values
    out_df["true_label"] = y_test.values
    out_df.to_csv(f"Real_MIMIC_predictions_{name}.csv", index=False)

print("\n✅ Saved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.7187  Prec=0.8227  Rec=0.8166  AUROC=0.5911
Logistic_Regression  Acc=0.7896  Prec=0.7974  Rec=0.9802  AUROC=0.7310
Random_Forest        Acc=0.7972  Prec=0.8103  Rec=0.9674  AUROC=0.7563
SVM                  Acc=0.7828  Prec=0.7828  Rec=1.0000  AUROC=0.5057
XGBoost              Acc=0.7789  Prec=0.8135  Rec=0.9311  AUROC=0.7291

✅ Saved prediction CSVs for all models.
