In [4]:
import os
import sys
sys.path.append(os.path.abspath(".."))
import math
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit
from collections import Counter

from config import (
    ARTEMIS_CSV_DIR, ARTEMIS_IMAGES_DIR,
    PROJECT_CSV_DIR, PROJECT_SPLITS_DIR,
    SEED, SUBSET_TARGET, SPLIT_TRAIN, SPLIT_VAL, SPLIT_TEST
)

In [5]:
random.seed(SEED)
np.random.seed(SEED)
 
CSV_PATH = os.path.join(ARTEMIS_CSV_DIR, "artemis_dataset_release_v0.csv")
df = pd.read_csv(CSV_PATH, delimiter=";")


### Esperados
- 'emotion'  -> classe alvo
- 'painting' -> nome do arquivo (ou caminho relativo)
- 'art_style', 'artist', ... (opcionais)
- 'painting_id' -> id único por obra (se inexistente, derivamos de 'painting' sem o sufixo)


### Verifica colunas essenciais:

In [6]:
required_cols = ["emotion", "painting"]
for c in required_cols:
    if c not in df.columns:
        raise KeyError(f"Coluna ausente no CSV: {c}")


### Cria 'painting_id' (grupo por obra) – remove extensão e possíveis variações pós-underscore

In [7]:
def derive_id(p):
    base = os.path.splitext(os.path.basename(str(p)))[0]
    # ex.: "Monet_Impression_Sunrise_0001" -> "Monet_Impression_Sunrise"
    parts = base.split("_")
    if parts and parts[-1].isdigit():
        parts = parts[:-1]
    return "_".join(parts) if parts else base

if "painting_id" not in df.columns:
    df["painting_id"] = df["painting"].apply(derive_id)


In [10]:
# Caminho absoluto para checagem
def abs_path(p):
    return os.path.join(ARTEMIS_IMAGES_DIR, os.path.normpath(str(p)))

df["abs_path"] = df["painting"].apply(abs_path)


In [11]:
# ---------- 2) Limpar classes e contar ----------
df["emotion"] = df["emotion"].astype(str).str.strip()

# Remover classes raríssimas se necessário:
# classes_interesse = ['awe','contentment','sadness','fear','anger','disgust','amusement','excitement']
# df = df[df["emotion"].isin(classes_interesse)]

counts = df["emotion"].value_counts()
print("[info] Distribuição original:")
print(counts)

# ---------- 3) Construir subset balanceado ----------
n_classes = df["emotion"].nunique()
per_class = max(1, SUBSET_TARGET // n_classes)

balanced_frames = []
for emotion, grp in df.groupby("emotion", sort=False):
    # amostra por 'painting_id' primeiro (para evitar muitas legendas da mesma obra)
    # pega 1 registro por obra, p/ diversidade
    grp_sample = grp.drop_duplicates(subset=["painting_id"])
    take = min(len(grp_sample), per_class)
    balanced = grp_sample.sample(n=take, random_state=SEED)
    balanced_frames.append(balanced)

df_bal = pd.concat(balanced_frames, ignore_index=True)

print("\n[info] Subset balanceado (1 por obra):")
print(df_bal["emotion"].value_counts())


[info] Distribuição original:
emotion
contentment       126134
awe                72927
something else     52962
sadness            49061
amusement          45336
fear               41577
excitement         37636
disgust            22411
anger               6640
Name: count, dtype: int64

[info] Subset balanceado (1 por obra):
emotion
fear              2666
awe               2666
amusement         2666
contentment       2666
sadness           2666
disgust           2666
excitement        2666
something else    2666
anger             2666
Name: count, dtype: int64


In [12]:
# ---------- 4) Split por obra (train/val/test) ----------
groups = df_bal["painting_id"].values
gss = GroupShuffleSplit(n_splits=1, train_size=SPLIT_TRAIN, random_state=SEED)
train_idx, rest_idx = next(gss.split(df_bal, groups=groups))

df_train = df_bal.iloc[train_idx]
df_rest  = df_bal.iloc[rest_idx]


### Divide rest entre val e test mantendo proporção


In [13]:

rest_groups = df_rest["painting_id"].values
val_size = SPLIT_VAL / (SPLIT_VAL + SPLIT_TEST)
gss2 = GroupShuffleSplit(n_splits=1, train_size=val_size, random_state=SEED)
val_idx, test_idx = next(gss2.split(df_rest, groups=rest_groups))

df_val  = df_rest.iloc[val_idx]
df_test = df_rest.iloc[test_idx]

In [14]:
assert set(df_train["painting_id"]).isdisjoint(df_val["painting_id"])
assert set(df_train["painting_id"]).isdisjoint(df_test["painting_id"])
assert set(df_val["painting_id"]).isdisjoint(df_test["painting_id"])

print("\n[info] Tamanhos:")
print("train:", len(df_train), "val:", len(df_val), "test:", len(df_test))

print("\n[info] Distribuição por split:")
for name, dfx in [("train", df_train), ("val", df_val), ("test", df_test)]:
    print(f"\n{name}:")
    print(dfx["emotion"].value_counts())



[info] Tamanhos:
train: 19168 val: 2398 test: 2428

[info] Distribuição por split:

train:
emotion
something else    2163
disgust           2155
amusement         2141
excitement        2138
fear              2119
anger             2119
sadness           2114
awe               2113
contentment       2106
Name: count, dtype: int64

val:
emotion
awe               287
contentment       281
fear              278
amusement         273
excitement        265
anger             264
something else    253
sadness           250
disgust           247
Name: count, dtype: int64

test:
emotion
sadness           302
anger             283
contentment       279
fear              269
awe               266
disgust           264
excitement        263
amusement         252
something else    250
Name: count, dtype: int64


### salvando os CSVs e derivados:

In [15]:
train_csv = os.path.join(PROJECT_SPLITS_DIR, "artemis_train.csv")
val_csv   = os.path.join(PROJECT_SPLITS_DIR, "artemis_val.csv")
test_csv  = os.path.join(PROJECT_SPLITS_DIR, "artemis_test.csv")

df_train.to_csv(train_csv, index=False)
df_val.to_csv(val_csv, index=False)
df_test.to_csv(test_csv, index=False)

# Também salvar uma cópia “subset_total.csv”
subset_csv = os.path.join(PROJECT_CSV_DIR, "artemis_subset_balanced.csv")
df_bal.to_csv(subset_csv, index=False)

print(f"\n[ok] Salvo:\n- {train_csv}\n- {val_csv}\n- {test_csv}\n- {subset_csv}")



[ok] Salvo:
- C:\Users\PSETTE\OneDrive - azureford\source\repos\cerebrum-artis\data\splits\artemis_train.csv
- C:\Users\PSETTE\OneDrive - azureford\source\repos\cerebrum-artis\data\splits\artemis_val.csv
- C:\Users\PSETTE\OneDrive - azureford\source\repos\cerebrum-artis\data\splits\artemis_test.csv
- C:\Users\PSETTE\OneDrive - azureford\source\repos\cerebrum-artis\data\csv\artemis_subset_balanced.csv
