In [1]:
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
import os

# ==============================================================================
# 1. CONFIGURATION LOADING
# ==============================================================================

def find_project_root(anchor_file="conf/config.yaml"):
    """
    Locate the project root by walking upward from the current working directory.

    The working directory is checked first, then each of its ancestors
    (nearest first). The first directory for which ``<dir>/<anchor_file>``
    exists is returned.

    Args:
        anchor_file: Path, relative to a candidate root, whose existence marks
            that directory as the project root.

    Returns:
        pathlib.Path: The first directory that contains ``anchor_file``.

    Raises:
        FileNotFoundError: If neither the working directory nor any ancestor
            contains ``anchor_file``.
    """
    start_dir = Path.cwd()

    # Candidate roots, nearest first: the working directory, then its ancestors.
    candidates = (start_dir, *start_dir.parents)
    root = next((folder for folder in candidates if (folder / anchor_file).exists()), None)

    if root is None:
        raise FileNotFoundError(f"N√£o foi poss√≠vel encontrar a raiz do projeto contendo '{anchor_file}'.")
    return root

# 1. Resolve BASE_DIR (project root)
try:
    BASE_DIR = find_project_root("conf/config.yaml")
    print(f"üìÇ Raiz do Projeto encontrada: {BASE_DIR}")
except FileNotFoundError:
    # Manual fallback used when the automatic search fails.
    # NOTE(review): this absolute path is machine-specific; adjust it before
    # running the notebook on another machine.
    print("Busca autom√°tica falhou. Usando fallback.")
    BASE_DIR = Path("/Users/lucasborges/Downloads/TCC")

# 2. Load the YAML file from the conf folder
CONFIG_PATH = BASE_DIR / "conf/config.yaml"
# Explicit UTF-8: without it `open` decodes with the platform locale encoding,
# so the same config file could load differently (or fail) across machines.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty YAML document loads as None (and a top-level list as a list); fail
# here with a clear message instead of a later "not subscriptable" TypeError.
if not isinstance(config, dict):
    raise ValueError(
        f"Expected a YAML mapping at the top level of {CONFIG_PATH}, "
        f"got {type(config).__name__}."
    )

# ==============================================================================
# 2. SHORTCUTS AND GLOBAL VARIABLES
# ==============================================================================

# Shortcuts into the YAML dictionaries
# e.g. P['raw'] becomes something like: /Users/.../TCC/data/raw
P = {k: BASE_DIR / v for k, v in config['paths'].items()} # P for Paths
F = config['files']                                       # F for Files
PM = config['params']                                     # PM for Params

print(f"‚öôÔ∏è Configura√ß√£o carregada de: {CONFIG_PATH}")

# ==============================================================================
# 3. VARIABLE BRIDGE
# ==============================================================================

# File paths (file names come from the YAML)
TRAIN_EMB_PATH       = P['processed'] / F['track_embeddings']
NEW_EMB_PATH         = P['processed'] / F['new_track_embeddings']
X_TRAIN_PATH         = P['processed'] / F['train_features']
X_TEST_PATH          = P['processed'] / F['test_features']

# Adjust according to where df_tracks_complete was saved (interim or processed?).
# This file name is not read from the YAML; the path is built directly:
TRACKS_COMPLETE_PATH = P['interim']   / "df_tracks_complete_v5.parquet"

# Graph paths
# Prefer the dedicated folder when the YAML defines it, otherwise fall back to
# the bipartite graph folder. A conditional is used instead of
# `P.get(key, P['graphs_bipartite'])` because the default argument of `.get` is
# evaluated eagerly: it would raise KeyError whenever 'graphs_bipartite' is
# absent, even if the preferred key exists.
_coarsened_dir       = P['graphs_coarsened'] if 'graphs_coarsened' in P else P['graphs_bipartite']
_super_dir           = P['graphs_super'] if 'graphs_super' in P else P['graphs_bipartite']
MATCHING_MAP_PATH    = _coarsened_dir / F['matching_map']
SUPER_EMB_PATH       = _super_dir     / F['super_embeddings']

# Parameters
SEED                 = PM['seed']

# Default plotting configuration
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 300

üìÇ Raiz do Projeto encontrada: /Users/lucasborges/Downloads/TCC
‚öôÔ∏è Configura√ß√£o carregada de: /Users/lucasborges/Downloads/TCC/conf/config.yaml


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F_nn  # <--- CORRE√á√ÉO: Mudamos de F para F_nn
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.transforms import RandomLinkSplit
from scipy.sparse import load_npz
from pathlib import Path
import gc

# --- DYNAMIC PATH CONFIGURATION ---
# Uses P (paths) and F (files) defined by the master configuration cell;
# run that cell first so both names exist.

print("Mapeando arquivos para o GraphSAGE...")

# 1. INPUTS
# Training features
X_TRAIN_PATH = P['processed'] / F['train_features']  # F is the config dict here; torch.nn.functional is imported as F_nn

# Super-track graph
SUPER_ADJ_PATH   = P['graphs_super'] / "A_super_tracks_adjacency.npz"
SUPER_INDEX_PATH = P['graphs_super'] / "super_m_index.parquet"

# Matching map
MATCHING_MAP_PATH = P['graphs_coarsened'] / F['matching_map']

# 2. OUTPUTS
OUT_SUPER_EMB = P['graphs_super'] / F['super_embeddings']
OUT_TRACK_EMB = P['processed'] / F['track_embeddings']

# 3. VALIDATION
required_inputs = (X_TRAIN_PATH, SUPER_ADJ_PATH, SUPER_INDEX_PATH, MATCHING_MAP_PATH)
missing = [p.name for p in required_inputs if not p.exists()]

if missing:
    # Fail fast: the following cells read these files and would otherwise stop
    # later with a less informative error.
    raise FileNotFoundError(f"ERRO: Arquivos de entrada n√£o encontrados: {missing}")

print("Todos os arquivos de entrada encontrados.")
print(f"   Output Super: {OUT_SUPER_EMB.name}")

Mapeando arquivos para o GraphSAGE...
Todos os arquivos de entrada encontrados.
   Output Super: super_embeddings_mean.parquet


In [3]:
def build_supergraph_data(features_path, super_adj_path, super_index_path, matching_map_path):
    """
    Build the PyTorch Geometric ``Data`` object for the super-track graph.

    Steps:
      1. Load the super-track id index (first column of the parquet file) and
         the sparse adjacency matrix.
      2. Load per-track features and the track -> super-track matching map and
         inner-join them on ``track_uri``.
      3. Average every numeric feature column per ``super_track_id``.
      4. Re-order the averaged features to follow the id index (taken here to be
         the adjacency row order); super-tracks without any feature row get 0.0.
      5. Convert the features and the adjacency non-zeros to tensors.

    Args:
        features_path: Parquet file with per-track features. The track key
            column is ``track_uri`` (or ``id``, which is renamed).
        super_adj_path: ``.npz`` file with the scipy sparse adjacency matrix.
        super_index_path: Parquet file whose first column lists the
            super-track ids.
        matching_map_path: Parquet file with ``super_track_id`` plus either
            ``original_track_uri`` or ``track_uri``.

    Returns:
        tuple: ``(data, df_super_feats)`` where ``data`` is the ``Data`` object
        (``x`` float32 node features, ``edge_index`` int64 of shape
        ``[2, nnz]``) and ``df_super_feats`` is the aligned feature DataFrame
        (``super_track_id`` + feature columns).

    Raises:
        ValueError: If the number of ids in the index differs from the number
            of rows of the adjacency matrix.
    """
    print("--- 1. Carregando Estrutura do Grafo ---")
    # Always take the first column as the id Series. `.squeeze()` is avoided
    # because it returns a scalar for a 1x1 frame and a *row* for a 1xN frame.
    super_m_index = pd.read_parquet(super_index_path).iloc[:, 0]

    S_super = load_npz(super_adj_path).tocsr()
    print(f"   N√≥s: {S_super.shape[0]:,} | Arestas: {S_super.nnz:,}")

    # Node features are aligned to the adjacency purely by position, so a size
    # mismatch would silently attach features to the wrong nodes.
    if len(super_m_index) != S_super.shape[0]:
        raise ValueError(
            f"Super-track index has {len(super_m_index)} ids but the adjacency "
            f"matrix has {S_super.shape[0]} rows."
        )

    print("\n--- 2. Carregando Features e Mapeando ---")
    df_feats_track = pd.read_parquet(features_path)
    matching_df = pd.read_parquet(matching_map_path)

    # Normalise the key column name to "track_uri" on both tables
    col_uri = "original_track_uri" if "original_track_uri" in matching_df.columns else "track_uri"
    df_map = matching_df[[col_uri, "super_track_id"]].rename(columns={col_uri: "track_uri"})
    if "track_uri" not in df_feats_track.columns and "id" in df_feats_track.columns:
        df_feats_track = df_feats_track.rename(columns={"id": "track_uri"})

    # Inner join: only tracks present in both tables contribute features
    df_merged = df_feats_track.merge(df_map, on="track_uri", how="inner")

    # Keep only numeric columns (bool / int / float / complex kinds) for the
    # aggregation, leaving out keys and identifier-like columns
    exclude = ["track_uri", "super_track_id", "pid", "release_year"]
    feature_cols = [c for c in df_merged.columns if c not in exclude and df_merged[c].dtype.kind in 'bifc']

    print(f"   Agregando {len(feature_cols)} features (M√©dia)...")
    df_super_feats = df_merged.groupby("super_track_id")[feature_cols].mean().reset_index()

    # Align with the id index order; the left join keeps every indexed
    # super-track and fills the ones without features with zeros
    df_super_index = pd.DataFrame({"super_track_id": super_m_index.values})
    df_super_feats = df_super_index.merge(df_super_feats, on="super_track_id", how="left").fillna(0.0)

    # Build tensors
    x = torch.from_numpy(df_super_feats[feature_cols].to_numpy().astype("float32"))

    # One (row, col) pair per stored non-zero of the adjacency matrix
    S_coo = S_super.tocoo()
    edge_index = torch.stack([torch.from_numpy(S_coo.row), torch.from_numpy(S_coo.col)], dim=0).long()

    data = Data(x=x, edge_index=edge_index)
    print("\nData Object Criado:", data)

    return data, df_super_feats

# Run: build the graph Data object and the aligned per-super-track feature table
data_super, df_super_feats = build_supergraph_data(
    features_path=X_TRAIN_PATH,
    super_adj_path=SUPER_ADJ_PATH,
    super_index_path=SUPER_INDEX_PATH,
    matching_map_path=MATCHING_MAP_PATH,
)

--- 1. Carregando Estrutura do Grafo ---
   N√≥s: 20,047 | Arestas: 1,002,350

--- 2. Carregando Features e Mapeando ---
   Agregando 49 features (M√©dia)...

Data Object Criado: Data(x=[20047, 49], edge_index=[2, 1002350])


In [7]:
class SuperGraphSAGE(nn.Module):
    """
    Stack of ``SAGEConv`` layers producing node embeddings.

    Layer widths are ``in_channels -> hidden_channels -> ... -> out_channels``.
    Every layer except the last is followed by ReLU and dropout; the last
    layer's output is returned as-is.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Size of the returned embeddings.
        num_layers: Number of ``SAGEConv`` layers (must be >= 1). With a single
            layer the model maps ``in_channels`` directly to ``out_channels``.
        dropout: Dropout probability applied after each non-final layer.
        aggr: Aggregation scheme forwarded to ``SAGEConv``.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels=64, out_channels=64, num_layers=2, dropout=0.2, aggr='mean'):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")

        self.dropout_rate = dropout

        # Width at every layer boundary: input, (num_layers - 1) hidden widths,
        # output. For a single layer this is [in, out], so the embedding size
        # honours out_channels instead of silently becoming hidden_channels.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = nn.ModuleList(
            [SAGEConv(w_in, w_out, aggr=aggr) for w_in, w_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x, edge_index):
        """Return the node embeddings for features ``x`` on graph ``edge_index``."""
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            # F_nn is torch.nn.functional (the name F is used elsewhere for config)
            x = F_nn.relu(x)
            x = F_nn.dropout(x, p=self.dropout_rate, training=self.training)
        # Final layer: no activation or dropout on the embeddings
        x = self.convs[-1](x, edge_index)
        return x

In [8]:
import random
import os

def set_seed(seed=42):
    """
    Seed every random number generator used here so runs are reproducible.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    CUDA), then sets cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # Also seed every GPU when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f"üå± Seed fixada em {seed}")

def train_model(data, hidden_channels=64, epochs=200, lr=0.001, aggr_type='mean', seed=42,
                num_layers=2, dropout=0.2, log_every=20):
    """
    Train a ``SuperGraphSAGE`` encoder with a link-prediction objective and
    return embeddings for every node of ``data``.

    A random 10% of the edges is held out for validation. Each epoch scores the
    training edges (positives) and an equal number of uniformly random node
    pairs (negatives) by the dot product of their embeddings, and minimises
    binary cross-entropy on those scores. The random negatives are not checked
    against existing edges.

    Args:
        data: Graph object with ``x`` and ``edge_index``.
        hidden_channels: Hidden width, also used as the embedding size.
        epochs: Number of full-graph training steps.
        lr: Adam learning rate.
        aggr_type: Neighbour aggregation passed to the model.
        seed: Seed applied through ``set_seed`` before the edge split.
        num_layers: Number of graph convolution layers in the model.
        dropout: Dropout probability used by the model.
        log_every: Validate and print metrics every this many epochs; 0 or
            ``None`` disables the periodic report.

    Returns:
        numpy.ndarray: Embeddings of shape ``(num_nodes, hidden_channels)``
        computed on the full graph with the model in eval mode.
    """
    # 1. Fix the seed (the split, the initialisation and the sampling below all
    #    draw from the seeded generators)
    set_seed(seed)

    print(f"\n--- Iniciando Treino (Aggr: {aggr_type}, Epochs: {epochs}, Hidden: {hidden_channels}) ---")

    # NOTE(review): is_undirected=False treats (u, v) and (v, u) as independent
    # edges. If the adjacency is symmetric, confirm this is intended: the
    # reverse of a held-out validation edge may remain in the training graph.
    transform = RandomLinkSplit(
        num_val=0.1,
        num_test=0.0,
        is_undirected=False,
        add_negative_train_samples=False
    )
    train_data, val_data, _ = transform(data)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = SuperGraphSAGE(data.num_features, hidden_channels, hidden_channels, num_layers, dropout, aggr_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_data = train_data.to(device)
    val_data = val_data.to(device)

    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()

        z = model(train_data.x, train_data.edge_index)

        # Positives: supervised training edges. Negatives: the same number of
        # random (source, target) pairs, resampled every epoch.
        pos_edge = train_data.edge_label_index[:, train_data.edge_label == 1]
        neg_edge = torch.randint(0, data.num_nodes, pos_edge.shape, device=device)

        # Edge score = dot product of the two endpoint embeddings
        pos_out = (z[pos_edge[0]] * z[pos_edge[1]]).sum(dim=-1)
        neg_out = (z[neg_edge[0]] * z[neg_edge[1]]).sum(dim=-1)

        logits = torch.cat([pos_out, neg_out])
        labels = torch.cat([torch.ones_like(pos_out), torch.zeros_like(neg_out)])

        loss = F_nn.binary_cross_entropy_with_logits(logits, labels)

        loss.backward()
        optimizer.step()

        if log_every and epoch % log_every == 0:
            model.eval()
            with torch.no_grad():
                z_val = model(val_data.x, val_data.edge_index)

                # Held-out positive edges vs. freshly sampled random pairs
                val_pos = val_data.edge_label_index[:, val_data.edge_label == 1]
                val_neg = torch.randint(0, data.num_nodes, val_pos.shape, device=device)

                pos_prob = (z_val[val_pos[0]] * z_val[val_pos[1]]).sum(dim=-1).sigmoid()
                neg_prob = (z_val[val_neg[0]] * z_val[val_neg[1]]).sum(dim=-1).sigmoid()

                # Balanced accuracy at a 0.5 probability threshold
                pos_acc = (pos_prob > 0.5).float().mean()
                neg_acc = (neg_prob < 0.5).float().mean()
                acc = (pos_acc + neg_acc) / 2

            print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Val Acc: {acc:.4f} (Pos: {pos_acc:.2f}, Neg: {neg_acc:.2f})")

    # Final embeddings use the complete edge set, not only the training split
    model.eval()
    with torch.no_grad():
        final_z = model(data.x.to(device), data.edge_index.to(device)).cpu().numpy()
    return final_z

In [9]:
# MEAN aggregation, 64 channels, 200 epochs.
# The seed is taken from the project configuration (SEED = PM['seed'], set in
# the configuration cell) rather than the function's built-in default, so the
# value declared in the YAML is the one that actually drives training.
z_final = train_model(
    data_super,
    hidden_channels=64,
    epochs=200,
    lr=0.001,
    aggr_type='mean',
    seed=SEED,
)

print("\nShape dos Embeddings Finais:", z_final.shape)

üå± Seed fixada em 42

--- Iniciando Treino (Aggr: mean, Epochs: 200, Hidden: 64) ---
Epoch 020 | Loss: 0.9909 | Val Acc: 0.5695 (Pos: 0.95, Neg: 0.19)
Epoch 040 | Loss: 0.7274 | Val Acc: 0.6580 (Pos: 0.90, Neg: 0.41)
Epoch 060 | Loss: 0.6671 | Val Acc: 0.6854 (Pos: 0.92, Neg: 0.45)
Epoch 080 | Loss: 0.6357 | Val Acc: 0.7021 (Pos: 0.93, Neg: 0.48)
Epoch 100 | Loss: 0.6116 | Val Acc: 0.7137 (Pos: 0.93, Neg: 0.50)
Epoch 120 | Loss: 0.5927 | Val Acc: 0.7240 (Pos: 0.94, Neg: 0.51)
Epoch 140 | Loss: 0.5760 | Val Acc: 0.7283 (Pos: 0.94, Neg: 0.52)
Epoch 160 | Loss: 0.5661 | Val Acc: 0.7334 (Pos: 0.94, Neg: 0.52)
Epoch 180 | Loss: 0.5557 | Val Acc: 0.7362 (Pos: 0.94, Neg: 0.53)
Epoch 200 | Loss: 0.5490 | Val Acc: 0.7367 (Pos: 0.95, Neg: 0.53)

Shape dos Embeddings Finais: (20047, 64)


In [11]:
print("\n--- 1. Salvando Embeddings das Super-M√∫sicas ---")
# Column names: emb_mean_000, emb_mean_001, ...
emb_cols = [f"emb_mean_{i:03d}" for i in range(z_final.shape[1])]

# One row per super-track, in the row order of df_super_feats
df_super_emb = df_super_feats[["super_track_id"]].copy()
# z_final is the embedding matrix returned by train_model; rows are matched by position
df_super_emb[emb_cols] = z_final

# Save
print(f"   Salvando em: {OUT_SUPER_EMB}")
df_super_emb.to_parquet(OUT_SUPER_EMB, index=False)


print("\n--- 2. Projetando Embeddings para o Treino ---")
# Stored so the evaluator does not have to recompute this projection every time

# Load the map (track -> super-track)
matching_df = pd.read_parquet(MATCHING_MAP_PATH)
# Normalise the URI column name
col_uri = "original_track_uri" if "original_track_uri" in matching_df.columns else "track_uri"
matching_df = matching_df.rename(columns={col_uri: "track_uri"})

# Load the training track ids
df_train = pd.read_parquet(X_TRAIN_PATH)
if "track_uri" not in df_train.columns and "id" in df_train.columns:
    df_train = df_train.rename(columns={"id": "track_uri"})

# Merge chain: track -> super-track -> embedding
# 1. Map each training track to its super-node
df_train_emb = df_train[["track_uri"]].merge(matching_df, on="track_uri", how="inner")
# 2. Attach the super-node embedding
df_train_emb = df_train_emb.merge(df_super_emb, on="super_track_id", how="inner")

# Quick validation
if len(df_train_emb) == 0:
    print("ALERTA CR√çTICO: O DataFrame de treino ficou vazio ap√≥s o merge!")
    print("   Verifique se os IDs no X_train batem com o matching_map.")
else:
    print(f"   Merge realizado com sucesso: {len(df_train_emb):,} faixas mapeadas.")
    print(f"   Salvando em: {OUT_TRACK_EMB}")
    df_train_emb.to_parquet(OUT_TRACK_EMB, index=False)

    # The completion banner is printed only when the track embeddings were
    # actually written; an empty merge must not be reported as success.
    print("\n" + "="*50)
    print("PIPELINE GRAPHSAGE CONCLU√çDO!")
    print("="*50)


--- 1. Salvando Embeddings das Super-M√∫sicas ---
   Salvando em: /Users/lucasborges/Downloads/TCC/graphs/super_item_item/super_embeddings_mean.parquet

--- 2. Projetando Embeddings para o Treino ---
   Merge realizado com sucesso: 324,132 faixas mapeadas.
   Salvando em: /Users/lucasborges/Downloads/TCC/data/processed/track_embeddings_mean.parquet

PIPELINE GRAPHSAGE CONCLU√çDO!
