In [3]:
!pip install iterative-stratification



In [7]:
# ===============================
# Notebook 03 — Pré-processamento (Plant Pathology)
# ===============================

# 0) Imports
import os, random
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 1) Caminhos e hiperparâmetros
DATA_DIR   = Path("../data")         # ajuste se necessário
TRAIN_CSV  = DATA_DIR/"train.csv"
TRAIN_DIR  = DATA_DIR/"train_images"
TEST_DIR   = DATA_DIR/"test_images"  # (amostras; o teste real é oculto no Kaggle)

IMG_SIZE   = (224, 224)  # bom ponto de partida p/ ResNet/EfficientNet
BATCH_SIZE = 32
SEED       = 42

assert TRAIN_CSV.exists(), f"train.csv não encontrado em {TRAIN_CSV}"
assert TRAIN_DIR.exists(), f"train_images/ não encontrado em {TRAIN_DIR}"

# 2) Carregar labels e montar vetor multi-hot
df = pd.read_csv(TRAIN_CSV)
df.columns = [c.strip().lower() for c in df.columns]  # garante 'image' e 'labels'
assert {"image", "labels"} <= set(df.columns), "train.csv precisa conter colunas 'image' e 'labels'"

df["labels_list"] = df["labels"].astype(str).str.strip().str.split()
all_labels = sorted({lab for labs in df["labels_list"] for lab in labs})
label2idx = {lab:i for i, lab in enumerate(all_labels)}

def encode_multi_hot(labs):
    v = np.zeros(len(all_labels), dtype=np.float32)
    for lab in labs:
        if lab in label2idx:
            v[label2idx[lab]] = 1.0
    return v

df["y"] = df["labels_list"].apply(encode_multi_hot)
Y = np.stack(df["y"].values)

print("Classes:", all_labels, f"(total={len(all_labels)})")
print("Total de imagens:", len(df))

# 3) Split treino/val/teste
# ------------------------------------------------------------
# Preferível: estratificação multi-label (Iterative Stratification).
# Instale no Colab se quiser usar: !pip install iterative-stratification
# Senão, caímos num fallback com estratificação aproximada pelo "primeiro rótulo".
use_iterative = False
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    use_iterative = True
except Exception as e:
    use_iterative = False
    print("Aviso: 'iterative-stratification' não disponível. Usando fallback de estratificação simples.")

df["primary_label"] = df["labels_list"].apply(lambda labs: labs[0] if len(labs)>0 else "none")

if use_iterative:
    # 70% train, 15% val, 15% test via duas passagens de stratified shuffle split
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=SEED)
    train_idx, temp_idx = next(msss.split(df["image"], Y))
    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_temp  = df.iloc[temp_idx].reset_index(drop=True)
    Y_temp   = Y[temp_idx]

    msss2 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=SEED)
    val_idx, test_idx = next(msss2.split(df_temp["image"], Y_temp))
    df_val  = df_temp.iloc[val_idx].reset_index(drop=True)
    df_test = df_temp.iloc[test_idx].reset_index(drop=True)

else:
    # Fallback: estratificar por rótulo primário (aproxima a distribuição)
    df_train, df_temp = train_test_split(
        df, test_size=0.30, random_state=SEED, stratify=df["primary_label"]
    )
    df_val, df_test = train_test_split(
        df_temp, test_size=0.50, random_state=SEED, stratify=df_temp["primary_label"]
    )
    df_train = df_train.reset_index(drop=True)
    df_val   = df_val.reset_index(drop=True)
    df_test  = df_test.reset_index(drop=True)

print(f"Tamanhos → train: {len(df_train)}, val: {len(df_val)}, test: {len(df_test)}")

# 4) Data augmentation (treino) e normalização (todas)
# ------------------------------------------------------------
# Observação: como é multi-label, vamos usar class_mode='raw' e passar Y (vetor multi-hot).
train_aug = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.08,
    height_shift_range=0.08,
    zoom_range=0.12,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode="nearest",
)

valid_aug = ImageDataGenerator(rescale=1./255)
test_aug  = ImageDataGenerator(rescale=1./255)

# 5) Geradores Keras com y em formato 2D (uma coluna por classe)
def make_df_for_flow(df_part, labels):
    df_local = df_part[["image"]].copy()
    df_local["filepath"] = df_local["image"].apply(lambda n: str(TRAIN_DIR / n))
    # cria colunas binárias y_<classe> = 0/1
    for lab in labels:
        df_local[f"y_{lab}"] = df_part["y"].apply(lambda v: float(v[label2idx[lab]]))
    return df_local

label_cols = [f"y_{lab}" for lab in all_labels]

df_train_flow = make_df_for_flow(df_train, all_labels)
df_val_flow   = make_df_for_flow(df_val,   all_labels)
df_test_flow  = make_df_for_flow(df_test,  all_labels)

train_flow = train_aug.flow_from_dataframe(
    dataframe=df_train_flow,
    x_col="filepath",
    y_col=label_cols,          # << várias colunas = matriz (batch, n_classes)
    target_size=IMG_SIZE,
    color_mode="rgb",
    class_mode="raw",
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=SEED,
)

val_flow = valid_aug.flow_from_dataframe(
    dataframe=df_val_flow,
    x_col="filepath",
    y_col=label_cols,
    target_size=IMG_SIZE,
    color_mode="rgb",
    class_mode="raw",
    batch_size=BATCH_SIZE,
    shuffle=False,
)

test_flow = test_aug.flow_from_dataframe(
    dataframe=df_test_flow,
    x_col="filepath",
    y_col=label_cols,
    target_size=IMG_SIZE,
    color_mode="rgb",
    class_mode="raw",
    batch_size=BATCH_SIZE,
    shuffle=False,
)

steps_train = int(np.ceil(len(df_train_flow) / BATCH_SIZE))
steps_val   = int(np.ceil(len(df_val_flow)   / BATCH_SIZE))
steps_test  = int(np.ceil(len(df_test_flow)  / BATCH_SIZE))

xb, yb = next(iter(train_flow))
print("Batch shapes → X:", xb.shape, "| y:", yb.shape, "| dtype y:", yb.dtype)
# esperado: y -> (32, 6) float32



Classes: ['complex', 'frog_eye_leaf_spot', 'healthy', 'powdery_mildew', 'rust', 'scab'] (total=6)
Total de imagens: 18632
Tamanhos → train: 13063, val: 2773, test: 2796
Found 13063 validated image filenames.
Found 2773 validated image filenames.
Found 2796 validated image filenames.
Batch shapes → X: (32, 224, 224, 3) | y: (32, 6) | dtype y: float64
