# Avaliação de Dados Sintéticos vs Reais com AUROC (C2ST)
Conjunto de dados: `Palmer Penguins` (Pinguins de Palmer) <br>

- Autoras: <br>
Ana Luíza Gomes Vieira (analuizagv2000@gmail.com) <br>
Sarah Vitória Moreira de Aquino (sarahvitoriaaquino@gmail.com)<br>

- Data: <br>
Agosto de 2025

- Feito no VSCode

---

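Este notebook aplica um *Classifier Two-Sample Test* (C2ST): um classificador é treinado para distinguir registros reais de registros sintéticos, e uma AUROC próxima de 0,5 indica que os dois conjuntos são difíceis de separar. A metodologia aplicada segue a abordagem descrita no artigo da Inflammatix: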

> **Lessons learned for generative AI for tabular data**  
> Kirindi Choi, Ljubomir Buturovic, Roland Luethy — Inflammatix, Inc.  
> Disponível em: https://inflammatix.com/lessons-learned-for-generative-ai-for-tabular-data-blog/

---

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import optuna
import tkinter as tk
from tkinter import filedialog
import os

# Seed reused by the row shuffle and by the final cross-validation split and classifier.
RANDOM_STATE = 42

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
 # Caminho manual, sem seletor
# The location of the synthetic table can be overridden with the
# PENGUINS_SINTETICO_PATH environment variable, so the notebook can run on
# machines other than the author's. The original local path stays the default.
caminho_arquivo = os.environ.get(
    "PENGUINS_SINTETICO_PATH",
    r"C:\Users\analu\Documents\TESTES\versao_final\dados_sinteticos.json",
)
penguins_sintetico = pd.read_json(caminho_arquivo)

In [3]:
# Real Palmer Penguins table, read as CSV straight from the OCEAN-datasets repository on GitHub.
penguins_real = pd.read_csv('https://raw.githubusercontent.com/ftorresd/OCEAN-datasets/main/penguins/penguins.csv')

In [4]:
# Keep only the features both tables share, leaving out any target-like column.
forbidden = {"class", "target", "label"}
shared_cols = set(penguins_real.columns).intersection(penguins_sintetico.columns)
common_cols = sorted(shared_cols.difference(forbidden))

X_real = penguins_real.loc[:, common_cols].copy()
X_syn = penguins_sintetico.loc[:, common_cols].copy()

# Discriminator labels: real rows are the negative class (0) and synthetic
# rows the positive class (1), as in the referenced article.
y_real = np.zeros(len(X_real), dtype=int)
y_syn = np.ones(len(X_syn), dtype=int)

# Stack both sources, then shuffle rows and labels with one shared permutation
# so that features and labels stay aligned.
X = pd.concat([X_real, X_syn], ignore_index=True)
y = np.concatenate((y_real, y_syn))

rng = np.random.RandomState(RANDOM_STATE)
idx = rng.permutation(len(X))
X = X.take(idx).reset_index(drop=True)
y = y[idx]


In [5]:
def split_cols_by_types(X, metadados=None):
    """Split the columns of ``X`` into numeric and categorical name lists.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table whose columns are classified.
    metadados : pandas.DataFrame, optional
        Column metadata with ``name`` and ``sdtype`` columns (SDV sdtypes).
        ``numerical`` maps to numeric; ``categorical`` and ``boolean`` map to
        categorical. Entries naming columns absent from ``X`` are ignored.
        When omitted, or when either column is missing, only dtypes are used.

    Returns
    -------
    tuple of (list, list)
        ``(num_cols, cat_cols)``. Columns not classified by the metadata are
        numeric when ``select_dtypes(include=np.number)`` selects them and
        categorical otherwise.
    """
    num_types = {"numerical"}
    cat_types = {"categorical", "boolean"}

    num_cols, cat_cols = [], []
    if metadados is not None and {"name", "sdtype"}.issubset(metadados.columns):
        # Single pass over the metadata, keeping its row order.
        for name, sdtype in zip(metadados["name"], metadados["sdtype"]):
            if name not in X.columns:
                continue
            if sdtype in cat_types:
                cat_cols.append(name)
            elif sdtype in num_types:
                num_cols.append(name)

    # Everything the metadata did not classify falls back to its dtype. The
    # same select_dtypes rule is used with and without metadata, and it does
    # not raise on pandas extension dtypes (categorical, string), unlike
    # np.issubdtype.
    assigned = set(num_cols) | set(cat_cols)
    remaining = [c for c in X.columns if c not in assigned]
    numeric_remaining = X[remaining].select_dtypes(include=np.number).columns.tolist()
    num_cols += numeric_remaining
    cat_cols += [c for c in remaining if c not in numeric_remaining]
    return num_cols, cat_cols

# Column roles come from a `metadados` global when one exists (SDV-style
# metadata), otherwise purely from the dtypes of X.
num_cols, cat_cols = split_cols_by_types(X, globals().get("metadados"))

# Numeric branch: median imputation, then scaling without centering
# (with_mean=False).
numeric = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler(with_mean=False)),
])

# Dense one-hot encoder: newer scikit-learn accepts `sparse_output`, older
# releases only accept `sparse`.
try:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit.
categorical = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("oh", one_hot),
])

# Any column assigned to neither branch is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ],
    remainder="drop",
)


In [6]:
def objective(trial):
    """Optuna objective: mean cross-validated AUROC of the real-vs-synthetic classifier.

    Samples a logistic-regression configuration (penalty in {"l1", "l2"},
    ``C`` on a log scale in [1e-3, 1e3], ``class_weight`` in {None,
    "balanced"}), chains it after the notebook-level ``preprocess``
    transformer and scores it on the notebook-level ``X`` / ``y`` with
    5-fold stratified, shuffled cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Mean ``roc_auc`` across the five folds.
    """
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # RANDOM_STATE (rather than a literal 42) keeps the classifier seed and the
    # folds below in sync with the final evaluation cell, which also reads it.
    clf = LogisticRegression(
        penalty=penalty, C=C, class_weight=class_weight,
        solver="liblinear", random_state=RANDOM_STATE, max_iter=2000
    )

    pipe = Pipeline([("prep", preprocess), ("clf", clf)])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    scores = cross_val_score(pipe, X, y, cv=cv, scoring="roc_auc", n_jobs=1)
    return float(scores.mean())

In [7]:
# Seed the sampler so the sequence of trials, and therefore the selected
# hyperparameters, is the same on every Restart & Run All.
study = optuna.create_study(
    direction="maximize",
    study_name="c2st_auroc_penguins",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=50, show_progress_bar=False)

# NOTE: best_value is the highest mean CV score among all trials, selected on
# the same folds it was measured on, so read it as an optimistic estimate.
print("== RESULTADO ==")
print("Melhor AUROC (CV):", round(study.best_value, 4))
print("Melhores hiperparâmetros:", study.best_params)

[I 2025-08-21 12:15:21,116] A new study created in memory with name: c2st_auroc_penguins
[I 2025-08-21 12:15:21,635] Trial 0 finished with value: 0.5499682011935209 and parameters: {'penalty': 'l2', 'C': 0.0074239343445994055, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5499682011935209.
[I 2025-08-21 12:15:25,345] Trial 1 finished with value: 0.5352636615515772 and parameters: {'penalty': 'l1', 'C': 742.2250787020959, 'class_weight': None}. Best is trial 0 with value: 0.5499682011935209.
[I 2025-08-21 12:15:27,822] Trial 2 finished with value: 0.545517156862745 and parameters: {'penalty': 'l1', 'C': 0.232232905275884, 'class_weight': None}. Best is trial 0 with value: 0.5499682011935209.
[I 2025-08-21 12:17:17,750] Trial 3 finished with value: 0.5475939258312021 and parameters: {'penalty': 'l1', 'C': 161.3861533419216, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5499682011935209.
[I 2025-08-21 12:17:23,563] Trial 4 finished with value: 0.5356034953111679 

== RESULTADO ==
Melhor AUROC (CV): 0.5511
Melhores hiperparâmetros: {'penalty': 'l2', 'C': 0.026166839847692115, 'class_weight': 'balanced'}


In [8]:
# Rebuild the classifier from the hyperparameters chosen by the study.
best = study.best_params
tuned_params = {key: best[key] for key in ("penalty", "C", "class_weight")}
best_clf = LogisticRegression(
    solver="liblinear",
    random_state=RANDOM_STATE,
    max_iter=2000,
    **tuned_params,
)

# Score it with 5-fold stratified, shuffled cross-validation and report the
# per-fold AUROC plus its mean and standard deviation.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", best_clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
folds = cross_val_score(pipe, X, y, scoring="roc_auc", cv=cv, n_jobs=1)
print("AUROC por dobra:", np.round(folds, 4))
print("Média ± DP:", round(folds.mean(),4), "±", round(folds.std(),4))


AUROC por dobra: [0.536  0.5098 0.5479 0.5582 0.6036]
Média ± DP: 0.5511 ± 0.0308
