In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic COMPAS data ===
df_synthetic = pd.read_csv("generated_data_Our_prompt_COMPAS.csv")  # Our Synthetic Data for COMPAS

# === Coerce the one-hot / binary columns to numeric dtypes ===
# errors="coerce" turns any unparseable value into NaN; those rows are then
# removed by the dropna() below instead of crashing the models later.
categorical_columns = [
    "sex",
    "age_cat_25-45", "age_cat_Greaterthan45", "age_cat_Lessthan25",
    "race_African-American", "race_Caucasian",
    "c_charge_degree_F", "c_charge_degree_M"
]
for col in categorical_columns:
    if col in df_synthetic.columns:  # tolerate files that lack some of the columns
        df_synthetic[col] = pd.to_numeric(df_synthetic[col], errors="coerce")

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Define label and sensitive attribute ===
label_col = "two_year_recid"
# NOTE(review): the original comments described value 1 of this column as the
# "privileged" group. COMPAS fairness studies commonly treat African-American
# as the unprivileged group -- confirm which convention the downstream
# fairness metrics expect.
sensitive_attr = "race_African-American"

# === Train-test split (70% train, 30% test) ===
# Only the label is dropped, so the sensitive attribute remains a model feature.
X = df_synthetic.drop(columns=[label_col])
y = df_synthetic[label_col]
race = df_synthetic[sensitive_attr]  # split alongside X/y so rows stay aligned

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Define models ===
# Every stochastic estimator is seeded. SVC needs a seed as well: with
# probability=True its probability estimates come from an internal
# cross-validation that shuffles the data, so an unseeded SVC yields a
# different AUROC on every run.
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # AUROC needs a continuous score: prefer the positive-class probability,
    # fall back to the decision margin, and only then to the hard labels.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions: the test features, with the label column holding the
    # model's PREDICTED label (not the ground truth) and the sensitive
    # attribute written explicitly from the split-aligned series.
    out_df = X_test.copy()
    out_df[label_col] = preds
    out_df[sensitive_attr] = race_test.values
    out_df.to_csv(f"Our_TS_COMPAS_predictions_{name}.csv", index=False)

print("\nSaved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.7367  Prec=0.7810  Rec=0.6859  AUROC=0.7452
Logistic_Regression  Acc=0.8300  Prec=0.8302  Rec=0.8462  AUROC=0.9192
Random_Forest        Acc=0.8300  Prec=0.8182  Rec=0.8654  AUROC=0.9198
SVM                  Acc=0.8533  Prec=0.8333  Rec=0.8974  AUROC=0.9184
XGBoost              Acc=0.8167  Prec=0.8258  Rec=0.8205  AUROC=0.9159

Saved prediction CSVs for all models.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic COMPAS data ===
df_synthetic = pd.read_csv("generated_data_CLLM_prompt_COMPAS.csv")  # CLLM Synthetic Data for COMPAS

# === Coerce the one-hot / binary columns to numeric dtypes ===
# errors="coerce" turns any unparseable value into NaN; those rows are then
# removed by the dropna() below instead of crashing the models later.
categorical_columns = [
    "sex",
    "age_cat_25-45", "age_cat_Greaterthan45", "age_cat_Lessthan25",
    "race_African-American", "race_Caucasian",
    "c_charge_degree_F", "c_charge_degree_M"
]
for col in categorical_columns:
    if col in df_synthetic.columns:  # tolerate files that lack some of the columns
        df_synthetic[col] = pd.to_numeric(df_synthetic[col], errors="coerce")

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Define label and sensitive attribute ===
label_col = "two_year_recid"
# NOTE(review): the original comments described value 1 of this column as the
# "privileged" group. COMPAS fairness studies commonly treat African-American
# as the unprivileged group -- confirm which convention the downstream
# fairness metrics expect.
sensitive_attr = "race_African-American"

# === Train-test split (70% train, 30% test) ===
# Only the label is dropped, so the sensitive attribute remains a model feature.
X = df_synthetic.drop(columns=[label_col])
y = df_synthetic[label_col]
race = df_synthetic[sensitive_attr]  # split alongside X/y so rows stay aligned

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Define models ===
# Every stochastic estimator is seeded. SVC needs a seed as well: with
# probability=True its probability estimates come from an internal
# cross-validation that shuffles the data, so an unseeded SVC yields a
# different AUROC on every run.
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # AUROC needs a continuous score: prefer the positive-class probability,
    # fall back to the decision margin, and only then to the hard labels.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions: the test features, with the label column holding the
    # model's PREDICTED label (not the ground truth) and the sensitive
    # attribute written explicitly from the split-aligned series.
    out_df = X_test.copy()
    out_df[label_col] = preds
    out_df[sensitive_attr] = race_test.values
    out_df.to_csv(f"CLLM_TS_COMPAS_predictions_{name}.csv", index=False)

print("\nSaved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.8933  Prec=0.9051  Rec=0.8938  AUROC=0.9005
Logistic_Regression  Acc=0.8733  Prec=0.8588  Rec=0.9125  AUROC=0.9395
Random_Forest        Acc=0.8600  Prec=0.8430  Rec=0.9062  AUROC=0.9535
SVM                  Acc=0.7967  Prec=0.7861  Rec=0.8500  AUROC=0.9085
XGBoost              Acc=0.8867  Prec=0.8889  Rec=0.9000  AUROC=0.9624

Saved prediction CSVs for all models.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load synthetic COMPAS data ===
df_synthetic = pd.read_csv("compas_synthetic_data_1000_200_epochs.csv")  # DECAF Synthetic Data for COMPAS

# === Coerce the one-hot / binary columns to numeric dtypes ===
# errors="coerce" turns any unparseable value into NaN; those rows are then
# removed by the dropna() below instead of crashing the models later.
categorical_columns = [
    "sex",
    "age_cat_25-45", "age_cat_Greaterthan45", "age_cat_Lessthan25",
    "race_African-American", "race_Caucasian",
    "c_charge_degree_F", "c_charge_degree_M"
]
for col in categorical_columns:
    if col in df_synthetic.columns:  # tolerate files that lack some of the columns
        df_synthetic[col] = pd.to_numeric(df_synthetic[col], errors="coerce")

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Define label and sensitive attribute ===
label_col = "two_year_recid"
# NOTE(review): the original comments described value 1 of this column as the
# "privileged" group. COMPAS fairness studies commonly treat African-American
# as the unprivileged group -- confirm which convention the downstream
# fairness metrics expect.
sensitive_attr = "race_African-American"

# === Train-test split (70% train, 30% test) ===
# Only the label is dropped, so the sensitive attribute remains a model feature.
X = df_synthetic.drop(columns=[label_col])
y = df_synthetic[label_col]
race = df_synthetic[sensitive_attr]  # split alongside X/y so rows stay aligned

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Define models ===
# Every stochastic estimator is seeded. SVC needs a seed as well: with
# probability=True its probability estimates come from an internal
# cross-validation that shuffles the data, so an unseeded SVC yields a
# different AUROC on every run.
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # AUROC needs a continuous score: prefer the positive-class probability,
    # fall back to the decision margin, and only then to the hard labels.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions: the test features, with the label column holding the
    # model's PREDICTED label (not the ground truth) and the sensitive
    # attribute written explicitly from the split-aligned series.
    out_df = X_test.copy()
    out_df[label_col] = preds
    out_df[sensitive_attr] = race_test.values
    out_df.to_csv(f"DECAF_TS_COMPAS_predictions_{name}.csv", index=False)

print("\nSaved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.5333  Prec=0.5714  Rec=0.5432  AUROC=0.5325
Logistic_Regression  Acc=0.6267  Prec=0.6645  Rec=0.6235  AUROC=0.6354
Random_Forest        Acc=0.5533  Prec=0.5897  Rec=0.5679  AUROC=0.5622
SVM                  Acc=0.5600  Prec=0.6500  Rec=0.4012  AUROC=0.6351
XGBoost              Acc=0.5733  Prec=0.6181  Rec=0.5494  AUROC=0.5826

Saved prediction CSVs for all models.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# === Load real COMPAS data ===
# The variable keeps the name `df_synthetic` used by the original cell even
# though this file is the real dataset (see the trailing comment).
df_synthetic = pd.read_csv("compas_cleaned.csv")  # Real Data of COMPAS

# === Coerce the one-hot / binary columns to numeric dtypes ===
# errors="coerce" turns any unparseable value into NaN; those rows are then
# removed by the dropna() below instead of crashing the models later.
categorical_columns = [
    "sex",
    "age_cat_25-45", "age_cat_Greaterthan45", "age_cat_Lessthan25",
    "race_African-American", "race_Caucasian",
    "c_charge_degree_F", "c_charge_degree_M"
]
for col in categorical_columns:
    if col in df_synthetic.columns:  # tolerate files that lack some of the columns
        df_synthetic[col] = pd.to_numeric(df_synthetic[col], errors="coerce")

# === Drop missing values ===
df_synthetic = df_synthetic.dropna()

# === Define label and sensitive attribute ===
label_col = "two_year_recid"
# NOTE(review): the original comments described value 1 of this column as the
# "privileged" group. COMPAS fairness studies commonly treat African-American
# as the unprivileged group -- confirm which convention the downstream
# fairness metrics expect.
sensitive_attr = "race_African-American"

# === Train-test split (70% train, 30% test) ===
# Only the label is dropped, so the sensitive attribute remains a model feature.
X = df_synthetic.drop(columns=[label_col])
y = df_synthetic[label_col]
race = df_synthetic[sensitive_attr]  # split alongside X/y so rows stay aligned

X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X, y, race, test_size=0.3, random_state=42, stratify=y
)

# === Define models ===
# Every stochastic estimator is seeded. SVC needs a seed as well: with
# probability=True its probability estimates come from an internal
# cross-validation that shuffles the data, so an unseeded SVC yields a
# different AUROC on every run.
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# === Train, predict, evaluate ===
print("\nModel Performance (Accuracy, Precision, Recall, AUROC):")
print("---------------------------------------------------------")
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # AUROC needs a continuous score: prefer the positive-class probability,
    # fall back to the decision margin, and only then to the hard labels.
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        proba = model.decision_function(X_test)
    else:
        proba = preds

    acc = accuracy_score(y_test, preds)
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    auroc = roc_auc_score(y_test, proba)

    print(f"{name:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  AUROC={auroc:.4f}")

    # Save predictions: the test features, with the label column holding the
    # model's PREDICTED label (not the ground truth) and the sensitive
    # attribute written explicitly from the split-aligned series.
    out_df = X_test.copy()
    out_df[label_col] = preds
    out_df[sensitive_attr] = race_test.values
    out_df.to_csv(f"COMPAS_predictions_{name}.csv", index=False)

print("\nSaved prediction CSVs for all models.")



Model Performance (Accuracy, Precision, Recall, AUROC):
---------------------------------------------------------
Decision_Tree        Acc=0.6149  Prec=0.6066  Rec=0.5154  AUROC=0.6256
Logistic_Regression  Acc=0.6648  Prec=0.6631  Rec=0.5839  AUROC=0.7172
Random_Forest        Acc=0.6218  Prec=0.6046  Rec=0.5664  AUROC=0.6612
SVM                  Acc=0.6673  Prec=0.6677  Rec=0.5826  AUROC=0.7176
XGBoost              Acc=0.6553  Prec=0.6519  Rec=0.5732  AUROC=0.6997

Saved prediction CSVs for all models.
