In [3]:
import pandas as pd
from sdmetrics.visualization import get_column_plot, get_column_pair_plot

# === Load data ===
real_data = pd.read_csv("Real_MIMIC.csv")
synthetic_data = pd.read_csv("generated_data_Our_prompts_MIMIC.csv")

# === Binarize the label in real data
real_data = real_data.copy()
real_data["label"] = (real_data["los_seconds"] >= 345600).astype(int)

# === Optional: convert sensitive features to string (for bar plots)
for col in ['race', 'gender']:
    if col in real_data.columns:
        real_data[col] = real_data[col].astype(str)
    if col in synthetic_data.columns:
        synthetic_data[col] = synthetic_data[col].astype(str)

# === Print bar values for race ===
print("📊 Race distribution (%):")
print("Real Data:")
print(real_data["race"].value_counts(normalize=True).mul(100).round(2))
print("\nSynthetic Data:")
print(synthetic_data["race"].value_counts(normalize=True).mul(100).round(2))


📊 Race distribution (%):
Real Data:
race
WHITE                     81.77
BLACK/AFRICAN AMERICAN    18.23
Name: proportion, dtype: float64

Synthetic Data:
race
WHITE                     51.02
BLACK/AFRICAN AMERICAN    48.98
Name: proportion, dtype: float64


In [4]:
import pandas as pd
from sdmetrics.visualization import get_column_plot, get_column_pair_plot

# === Load data ===
real_data = pd.read_csv("Real_MIMIC.csv")
synthetic_data = pd.read_csv("generated_data_Our_prompts_MIMIC.csv")

# === Binarize real los_seconds into label
real_data = real_data.copy()
real_data["label"] = (real_data["los_seconds"] >= 345600).astype(int)

# === Ensure 'gender' column is string for bar plots
real_data["gender"] = real_data["gender"].astype(str)
synthetic_data["gender"] = synthetic_data["gender"].astype(str)

# === Print gender distribution (%)
print("📊 Gender distribution (%):")
print("Real Data:")
print(real_data["gender"].value_counts(normalize=True).mul(100).round(2))
print("\nSynthetic Data:")
print(synthetic_data["gender"].value_counts(normalize=True).mul(100).round(2))


📊 Gender distribution (%):
Real Data:
gender
F    51.67
M    48.33
Name: proportion, dtype: float64

Synthetic Data:
gender
F    50.38
M    49.62
Name: proportion, dtype: float64


In [5]:
import pandas as pd
from sdmetrics.visualization import get_column_plot

# === Load datasets
real_data = pd.read_csv("compas_cleaned.csv")
synthetic_data = pd.read_csv("generated_data_Our_prompt_COMPAS.csv")

# === Create unified 'race' column
for df in [real_data, synthetic_data]:
    df["race"] = df["race_African-American"].apply(lambda x: "African-American" if x == 1 else "Caucasian")
    df["race"] = df["race"].astype(str)

# === Print bar values
def print_bar_values(df, label):
    counts = df["race"].value_counts().sort_index()
    percentages = round(df["race"].value_counts(normalize=True).sort_index() * 100, 2)
    print(f"\n🔍 {label} Race Distribution:")
    for race in counts.index:
        print(f"  {race}: {counts[race]} samples ({percentages[race]}%)")

print_bar_values(real_data, "Real Data")
print_bar_values(synthetic_data, "Synthetic Data")



🔍 Real Data Race Distribution:
  African-American: 3175 samples (60.16%)
  Caucasian: 2103 samples (39.84%)

🔍 Synthetic Data Race Distribution:
  African-American: 539 samples (53.9%)
  Caucasian: 461 samples (46.1%)


In [6]:
import pandas as pd
from sdmetrics.visualization import get_column_plot

# === Load datasets
real_data = pd.read_csv("compas_cleaned.csv")
synthetic_data = pd.read_csv("generated_data_Our_prompt_COMPAS.csv")

# === Convert 'sex' to readable string (if numeric)
real_data["sex"] = real_data["sex"].map({1: "Male", 2: "Female"}).astype(str)
synthetic_data["sex"] = synthetic_data["sex"].map({1: "Male", 2: "Female"}).astype(str)

# === Print bar values
def print_bar_values(df, label, column):
    counts = df[column].value_counts().sort_index()
    percentages = round(df[column].value_counts(normalize=True).sort_index() * 100, 2)
    print(f"\n🔍 {label} {column.capitalize()} Distribution:")
    for cat in counts.index:
        print(f"  {cat}: {counts[cat]} samples ({percentages[cat]}%)")

print_bar_values(real_data, "Real Data", "sex")
print_bar_values(synthetic_data, "Synthetic Data", "sex")



🔍 Real Data Sex Distribution:
  Male: 4247 samples (80.47%)
  nan: 1031 samples (19.53%)

🔍 Synthetic Data Sex Distribution:
  Male: 623 samples (62.3%)
  nan: 377 samples (37.7%)


In [7]:
import pandas as pd
from sdmetrics.visualization import get_column_plot

# === Load datasets
real_data = pd.read_csv("bar_pass_prediction (processed version).csv")
synthetic_data = pd.read_csv("generated_data_Our_prompts_Law.csv")

# === Mapping race codes to labels
race_labels = {
    1: "American Indian / Alaska Native",
    2: "Asian",
    3: "Black / African American",
    4: "Hispanic / Latino",
    5: "Native Hawaiian / Pacific Islander",
    6: "Other",
    7: "White",
    8: "Two or more races"
}

# === Map race codes to string labels
real_data["race"] = real_data["race"].map(race_labels).fillna("Unknown")
synthetic_data["race"] = synthetic_data["race"].map(race_labels).fillna("Unknown")

# === Print value counts and percentages
def print_bar_values(df, label, column):
    print(f"\n🔍 {label} — {column.capitalize()} Distribution:")
    counts = df[column].value_counts().sort_index()
    percentages = df[column].value_counts(normalize=True).sort_index() * 100
    for value in counts.index:
        print(f" {value}: {counts[value]} samples ({percentages[value]:.2f}%)")

print_bar_values(real_data, "Real Data", "race")
print_bar_values(synthetic_data, "Synthetic Data", "race")





🔍 Real Data — Race Distribution:
 American Indian / Alaska Native: 98 samples (0.46%)
 Asian: 839 samples (3.94%)
 Black / African American: 1278 samples (6.00%)
 Hispanic / Latino: 387 samples (1.82%)
 Native Hawaiian / Pacific Islander: 109 samples (0.51%)
 Other: 483 samples (2.27%)
 Two or more races: 287 samples (1.35%)
 White: 17830 samples (83.67%)

🔍 Synthetic Data — Race Distribution:
 American Indian / Alaska Native: 253 samples (12.46%)
 Asian: 256 samples (12.61%)
 Black / African American: 257 samples (12.66%)
 Hispanic / Latino: 255 samples (12.56%)
 Native Hawaiian / Pacific Islander: 261 samples (12.86%)
 Other: 253 samples (12.46%)
 Two or more races: 250 samples (12.32%)
 White: 245 samples (12.07%)


In [10]:
import pandas as pd
from sdmetrics.visualization import get_column_plot

# === Load data ===
real_data = pd.read_csv("bar_pass_prediction (processed version).csv")
synthetic_data = pd.read_csv("generated_data_Our_prompts_Law.csv")

# === Convert 'sex' column to string for plotting
real_data["sex"] = real_data["sex"].astype(str)
synthetic_data["sex"] = synthetic_data["sex"].astype(str)

# === Show bar values
real_counts = real_data["sex"].value_counts(normalize=True).sort_index() * 100
synth_counts = synthetic_data["sex"].value_counts(normalize=True).sort_index() * 100

print("📊 Sex Distribution (%):")
for val in sorted(set(real_data["sex"].unique()) | set(synthetic_data["sex"].unique())):
    r = real_counts.get(val, 0)
    s = synth_counts.get(val, 0)
    print(f"Sex = {val} → Real: {r:.2f}%, Synthetic: {s:.2f}%")



📊 Sex Distribution (%):
Sex = 1 → Real: 43.63%, Synthetic: 50.39%
Sex = 2 → Real: 56.37%, Synthetic: 49.61%
