# EDA + SHAP — Predição de Câncer de Mama (WDBC)

Este notebook gera os graficos SHAP e salva as figuras em `reports/figures/`.


## A) Setup completo (dados, RF e `explainer`)

In [1]:
# === SETUP COMPLETO: dados, split, RF e explainer ===
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Descobrir raiz do projeto e localizar data/data.csv
NB_DIR = os.getcwd()
CANDIDATES = [
    os.path.abspath(os.path.join(NB_DIR, "..")),  # se o notebook está em notebooks/
    NB_DIR,                                        # se está na raiz
]
csv_path = None
for root in CANDIDATES:
    candidate = os.path.join(root, "data", "data.csv")
    if os.path.exists(candidate):
        csv_path = candidate
        PROJECT_ROOT = root
        break
if csv_path is None:
    raise FileNotFoundError("Não encontrei data/data.csv. Ajuste o caminho para o seu CSV.")

# Pastas de saída
REPORTS = os.path.join(PROJECT_ROOT, "reports")
FIG = os.path.join(REPORTS, "figures")
os.makedirs(FIG, exist_ok=True)

# Carregar e preparar X/y
df = pd.read_csv(csv_path)
for col in ["id", "Unnamed: 32"]:
    if col in df.columns:
        df = df.drop(columns=[col])
if "diagnosis" not in df.columns:
    raise ValueError("CSV precisa conter a coluna 'diagnosis' ('M'/'B').")

y = df["diagnosis"].map({"M":1,"B":0}).astype(int)
X = df.drop(columns=["diagnosis"])

display(df.head())
display(df.describe().T)
print("Distribuição do alvo (0=benigno, 1=maligno):")
print(y.value_counts(normalize=True))

# Split estratificado
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Treino rápido de RandomForest (robusto)
rf = RandomForestClassifier(
    n_estimators=600, class_weight='balanced',
    random_state=42
)
rf.fit(X_train, y_train)

# Criar o explainer de modo robusto a versões do shap
try:
    explainer = shap.Explainer(rf)         # API nova (retorna Explanation)
except Exception:
    explainer = shap.TreeExplainer(rf)     # fallback para API legada

# Amostra para acelerar os gráficos
X_test_sample = X_test.sample(n=min(100, len(X_test)), random_state=42)

print("Setup ok. Variáveis disponíveis: PROJECT_ROOT, FIG, X_test, X_test_sample, rf, explainer.")


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
fractal_dimension_mean,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


Distribuição do alvo (0=benigno, 1=maligno):
diagnosis
0    0.627417
1    0.372583
Name: proportion, dtype: float64
Setup ok. Variáveis disponíveis: PROJECT_ROOT, FIG, X_test, X_test_sample, rf, explainer.


## B) SHAP (beeswarm e bar) — robusto a versões

In [2]:
# === SHAP: beeswarm e bar, compatível com várias versões do shap ===
import os, matplotlib.pyplot as plt

try:
    # Tenta a API nova: passar X diretamente ao explainer retorna um shap.Explanation
    exp = explainer(X_test_sample)

    plt.figure()
    shap.plots.beeswarm(exp, show=False)    # usa matplotlib
    plt.tight_layout()
    plt.savefig(os.path.join(FIG, "shap_summary_beeswarm.png"),
                dpi=120, bbox_inches="tight")
    plt.close()

    plt.figure()
    shap.plots.bar(exp, show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG, "shap_summary_bar.png"),
                dpi=120, bbox_inches="tight")
    plt.close()

    print("Gerado: shap_summary_beeswarm.png e shap_summary_bar.png (API nova)")

except Exception as e:
    # Fallback para API legada (shap_values([...]))
    print("API nova falhou, usando fallback legado:", repr(e))
    sv_raw = explainer.shap_values(X_test_sample)
    sv = sv_raw[1] if isinstance(sv_raw, list) else sv_raw  # classe positiva

    # Ajuste se vier com coluna extra (offset)
    if sv.shape[1] == X_test_sample.shape[1] + 1:
        sv = sv[:, :-1]

    plt.figure()
    shap.summary_plot(sv, X_test_sample, show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG, "shap_summary_beeswarm.png"),
                dpi=120, bbox_inches="tight")
    plt.close()

    plt.figure()
    shap.summary_plot(sv, X_test_sample, plot_type="bar", show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(FIG, "shap_summary_bar.png"),
                dpi=120, bbox_inches="tight")
    plt.close()

    print("Gerado: shap_summary_beeswarm.png e shap_summary_bar.png (fallback legado)")


API nova falhou, usando fallback legado: ValueError('The beeswarm plot does not support plotting explanations with instances that have more than one dimension!')
Gerado: shap_summary_beeswarm.png e shap_summary_bar.png (fallback legado)


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>