In [1]:
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
import os

# ==============================================================================
# 1. CONFIGURATION LOADING
# ==============================================================================

def find_project_root(anchor_file="conf/config.yaml"):
    """
    Locate the project root by walking upwards from the current working directory.

    The candidates are the working directory itself followed by each of its
    ancestors, in order, up to the filesystem root. The first candidate for
    which ``<candidate>/<anchor_file>`` exists is returned.

    Args:
        anchor_file: Path, relative to a candidate directory, whose existence
            marks that directory as the project root.

    Returns:
        pathlib.Path: the first directory that contains ``anchor_file``.

    Raises:
        FileNotFoundError: if no candidate directory contains ``anchor_file``.
    """
    start_dir = Path.cwd()

    # Nearest directory first, so the closest enclosing project wins.
    candidates = (start_dir, *start_dir.parents)
    root = next(
        (folder for folder in candidates if (folder / anchor_file).exists()),
        None,
    )

    if root is None:
        raise FileNotFoundError(f"N√£o foi poss√≠vel encontrar a raiz do projeto contendo '{anchor_file}'.")
    return root

# 1. Resolve BASE_DIR (project root)
try:
    BASE_DIR = find_project_root("conf/config.yaml")
    print(f"üìÇ Raiz do Projeto encontrada: {BASE_DIR}")
except FileNotFoundError:
    # Manual fallback used when the automatic search fails. The TCC_BASE_DIR
    # environment variable, when set, overrides the machine-specific default
    # so the notebook can run on other machines without editing this cell.
    print("Busca autom√°tica falhou. Usando fallback.")
    BASE_DIR = Path(os.environ.get("TCC_BASE_DIR", "/Users/lucasborges/Downloads/TCC"))

# 2. Load the YAML from the conf folder
CONFIG_PATH = BASE_DIR / "conf/config.yaml"
# Explicit UTF-8: without it the platform default encoding is used, which can
# mis-decode non-ASCII characters in the config on some systems.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# ==============================================================================
# 2. SHORTCUTS AND GLOBAL VARIABLES
# ==============================================================================

# Shortcuts into the dictionaries loaded from the YAML.
F = config['files']    # F = Files
PM = config['params']  # PM = Params

# P = Paths: every entry of config['paths'] joined onto BASE_DIR, so P['raw']
# becomes something like /Users/.../TCC/data/raw
P = dict((name, BASE_DIR / rel) for name, rel in config['paths'].items())

print(f"‚öôÔ∏è Configura√ß√£o carregada de: {CONFIG_PATH}")

# ==============================================================================
# 3. VARIABLE BRIDGE
# ==============================================================================

# File paths (resolved from the YAML entries)
TRAIN_EMB_PATH       = P['processed'] / F['track_embeddings']
NEW_EMB_PATH         = P['processed'] / F['new_track_embeddings']
X_TRAIN_PATH         = P['processed'] / F['train_features']
X_TEST_PATH          = P['processed'] / F['test_features']

# Adjust according to where df_tracks_complete was saved (interim or processed?).
# This file name is not read from the YAML; the path is built by hand:
TRACKS_COMPLETE_PATH = P['interim']   / "df_tracks_complete_v5.parquet"

# Graph paths
# Use the dedicated directory key when the YAML defines it, otherwise fall back
# to the bipartite graph directory. The fallback is looked up lazily: with
# P.get(key, P['graphs_bipartite']) the default is evaluated even when `key`
# exists, raising KeyError whenever 'graphs_bipartite' is absent.
_coarsened_dir       = P['graphs_coarsened'] if 'graphs_coarsened' in P else P['graphs_bipartite']
_super_dir           = P['graphs_super'] if 'graphs_super' in P else P['graphs_bipartite']
MATCHING_MAP_PATH    = _coarsened_dir / F['matching_map']
SUPER_EMB_PATH       = _super_dir     / F['super_embeddings']

# Parameters
SEED                 = PM['seed']

# Default plotting configuration
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 300

üìÇ Raiz do Projeto encontrada: /Users/lucasborges/Downloads/TCC
‚öôÔ∏è Configura√ß√£o carregada de: /Users/lucasborges/Downloads/TCC/conf/config.yaml


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import gc

# --- DYNAMIC CONFIGURATION (using the P and F shortcuts) ---
print("Mapeando arquivos para Infer√™ncia (Cold-Start)...")

# Inputs
X_TEST_PATH       = P['processed'] / F['test_features']
X_TRAIN_PATH      = P['processed'] / F['train_features']  # needed to compute the centroids
# Fall back to the bipartite graph directory when the dedicated key is absent
# from the YAML, instead of failing with a KeyError.
MATCHING_MAP_PATH = (P['graphs_coarsened'] if 'graphs_coarsened' in P else P['graphs_bipartite']) / F['matching_map']
SUPER_EMB_PATH    = (P['graphs_super'] if 'graphs_super' in P else P['graphs_bipartite']) / F['super_embeddings']

# Output
OUT_NEW_EMB       = P['processed'] / F['new_track_embeddings']

# Validation: warn about every required input, including X_train, which the
# centroid computation reads.
for path in [X_TEST_PATH, X_TRAIN_PATH, MATCHING_MAP_PATH, SUPER_EMB_PATH]:
    if not path.exists():
        print(f"Aviso: Arquivo n√£o encontrado: {path.name}")

print(f"Output definido para: {OUT_NEW_EMB.name}")

Mapeando arquivos para Infer√™ncia (Cold-Start)...
Output definido para: new_track_embeddings_mean.parquet


In [6]:
def get_super_features_light(map_path=None, train_path=None):
    """
    Compute the mean feature vector (centroid) of every super-node.

    The track -> super-node matching map is inner-joined with the training
    feature table on ``track_uri``; the numeric feature columns are then
    averaged per ``super_track_id``. Large intermediates are deleted as soon
    as they are no longer needed to keep peak memory low.

    Args:
        map_path: Parquet file holding the matching map. Must contain
            ``super_track_id`` plus ``original_track_uri`` or ``track_uri``.
            Defaults to the notebook-level ``MATCHING_MAP_PATH``.
        train_path: Parquet file holding the training features, keyed by
            ``track_uri`` (or ``id``). Defaults to the notebook-level
            ``X_TRAIN_PATH``.

    Returns:
        tuple: ``(super_feats, cols)`` where ``super_feats`` is a DataFrame
        with one row per ``super_track_id`` and one column per averaged
        feature, and ``cols`` is the list of averaged feature column names.

    Raises:
        FileNotFoundError: if the training feature file does not exist.
    """
    map_path = MATCHING_MAP_PATH if map_path is None else Path(map_path)
    train_path = X_TRAIN_PATH if train_path is None else Path(train_path)

    print("Recalculando features dos Super-N√≥s (Modo Econ√¥mico)...")

    # Fail fast: check the training file before loading anything.
    if not train_path.exists():
        raise FileNotFoundError(f"X_train n√£o encontrado em {train_path}")

    # 1. Load the matching map.
    map_df = pd.read_parquet(map_path)
    col_uri = "original_track_uri" if "original_track_uri" in map_df.columns else "track_uri"
    # Select BEFORE renaming: if the map carries both 'original_track_uri' and
    # 'track_uri', renaming first would yield two 'track_uri' columns and the
    # merge below would fail on the ambiguous key.
    map_df = map_df[[col_uri, "super_track_id"]].rename(columns={col_uri: "track_uri"})

    # 2. Load the training features.
    df_train = pd.read_parquet(train_path)
    if "track_uri" not in df_train.columns and "id" in df_train.columns:
        df_train = df_train.rename(columns={"id": "track_uri"})

    # 3. Keep only the training tracks that were assigned to a super-node.
    print(f"   Cruzando {len(df_train):,} faixas com mapa...")
    merged = df_train.merge(map_df, on="track_uri", how="inner")

    # Release the inputs immediately.
    del df_train, map_df
    gc.collect()

    # Feature columns: every non-key column whose dtype kind is bool, signed
    # int, float or complex.
    # NOTE(review): unsigned-integer columns (kind 'u') are excluded by this
    # filter -- confirm that is intended.
    cols = [
        c for c in merged.columns
        if c not in ("track_uri", "super_track_id") and merged[c].dtype.kind in "bifc"
    ]

    # 4. Centroid = per-super-node mean of each feature.
    print("   Calculando centr√≥ides...")
    super_feats = merged.groupby("super_track_id")[cols].mean().reset_index()

    del merged
    gc.collect()

    return super_feats, cols

def resolve_cold_start_batch(df_test, df_super_feats, feat_cols, batch_size=2000):
    """
    Assign each test track to its nearest super-node in standardized feature space.

    A ``StandardScaler`` is fitted on the super-node features, a 1-nearest-
    neighbour index (Euclidean metric) is built on the scaled centroids, and
    the test tracks are scaled and queried in batches of ``batch_size`` rows.

    Args:
        df_test: DataFrame with a ``track_uri`` column and the ``feat_cols`` columns.
        df_super_feats: DataFrame with a ``super_track_id`` column and the
            ``feat_cols`` columns (one row per super-node).
        feat_cols: Names of the feature columns used for the search.
        batch_size: Number of test rows processed per iteration (must be >= 1).

    Returns:
        pandas.DataFrame: columns ``track_uri`` and ``super_track_id``, one row
        per row of ``df_test``, with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``batch_size`` is not positive, or if ``df_super_feats``
            is empty while ``df_test`` is not.
    """
    print(f"Iniciando KNN para {len(df_test):,} m√∫sicas (Batch: {batch_size})...")

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Nothing to resolve: return an empty, correctly shaped frame instead of
    # failing inside pd.concat([]) further down.
    if len(df_test) == 0:
        empty = df_test[["track_uri"]].copy().reset_index(drop=True)
        empty["super_track_id"] = df_super_feats["super_track_id"].iloc[:0].to_numpy()
        return empty

    if len(df_super_feats) == 0:
        raise ValueError("df_super_feats is empty: there are no super-nodes to match against")

    # Reference data (super-nodes). Missing values are filled with 0 exactly
    # like the test batches below, so both sides of the search are treated the
    # same way and the neighbour index never receives NaN.
    X_super = df_super_feats[feat_cols].fillna(0).to_numpy().astype('float32')
    super_ids = df_super_feats["super_track_id"].to_numpy()

    # Standardize so every feature contributes on a comparable scale.
    scaler = StandardScaler()
    X_super_scaled = scaler.fit_transform(X_super)

    # Single nearest neighbour. n_jobs=1 is deliberate, to avoid parallel
    # overhead on small per-batch queries.
    knn = NearestNeighbors(n_neighbors=1, metric='euclidean', n_jobs=1)
    knn.fit(X_super_scaled)

    results = []

    for start in tqdm(range(0, len(df_test), batch_size)):
        batch = df_test.iloc[start : start + batch_size]

        # Batch features, scaled with the statistics of the super-nodes.
        X_batch = batch[feat_cols].fillna(0).to_numpy().astype('float32')
        X_batch_scaled = scaler.transform(X_batch)

        _, indices = knn.kneighbors(X_batch_scaled)

        # Translate neighbour positions into super-node ids.
        res_df = batch[["track_uri"]].copy()
        res_df["super_track_id"] = super_ids[indices.ravel()]
        results.append(res_df)

        # Drop per-batch arrays before the next iteration.
        del X_batch, X_batch_scaled, indices, batch

    return pd.concat(results, ignore_index=True)

In [7]:
# --- EXECUTION ---

# 1. Centroid features of the super-nodes.
# This rebuilds the "average signature" of each genre/community.
df_super_feats, feat_cols = get_super_features_light()
print(f"Features de Super-N√≥s prontas: {df_super_feats.shape}")

# 2. Load the test set and the matching map.
print("Carregando Teste e Mapa...")
df_test = pd.read_parquet(X_TEST_PATH)
if "track_uri" not in df_test.columns and "id" in df_test.columns:
    df_test = df_test.rename(columns={"id": "track_uri"})

map_df = pd.read_parquet(MATCHING_MAP_PATH)
col_uri = "original_track_uri" if "original_track_uri" in map_df.columns else "track_uri"
# Select before renaming so a map that has both 'original_track_uri' and
# 'track_uri' does not end up with two 'track_uri' columns (which breaks the merge).
map_df = map_df[[col_uri, "super_track_id"]].rename(columns={col_uri: "track_uri"})

# 3. Split: tracks resolvable by direct lookup (already in the graph) versus
#    tracks that need KNN (true cold-start).
print(" Separando casos...")
# Lookup: test tracks that happen to be in the graph (rare under strict
# cold-start, but possible).
lookup_match = df_test.merge(map_df, on="track_uri", how="inner")[["track_uri", "super_track_id"]]
known_uris = set(lookup_match["track_uri"])

# Orphans: tracks that are absent from the matching map.
df_orphans = df_test[~df_test["track_uri"].isin(known_uris)].copy()

print(f"   Via Lookup direto: {len(lookup_match):,}")
print(f"   Via KNN (Cold-Start): {len(df_orphans):,}")

# Free memory.
del map_df, df_test
gc.collect()

# 4. Run KNN on the orphans and combine with the lookup matches.
mapping_parts = [lookup_match]
if len(df_orphans) > 0:
    mapping_parts.append(
        resolve_cold_start_batch(df_orphans, df_super_feats, feat_cols, batch_size=2000)
    )
final_mapping = pd.concat(mapping_parts, ignore_index=True)

print(f"Mapeamento conclu√≠do. Total: {len(final_mapping):,}")

# Free memory before the final merge.
del df_orphans, df_super_feats, lookup_match, mapping_parts
gc.collect()

# 5. Attach the embeddings (GraphSAGE).
print("Anexando Embeddings Finais...")
# Load the embeddings learned by the GNN (Notebook 6).
df_super_emb = pd.read_parquet(SUPER_EMB_PATH)

# The inner merge below drops every track whose super-node has no embedding;
# report that instead of losing rows silently.
missing_emb = ~final_mapping["super_track_id"].isin(df_super_emb["super_track_id"])
if missing_emb.any():
    print(f"Aviso: {int(missing_emb.sum()):,} faixas descartadas (super_track_id sem embedding).")

# Merge: new track -> super-node (lookup/KNN) -> embedding (GNN)
df_final = final_mapping.merge(df_super_emb, on="super_track_id", how="inner")

print(f"Salvando {len(df_final):,} linhas em: {OUT_NEW_EMB}")
# Make sure the destination folder exists before writing.
OUT_NEW_EMB.parent.mkdir(parents=True, exist_ok=True)
df_final.to_parquet(OUT_NEW_EMB, index=False)

print("\nCONCLU√çDO! Embeddings de teste gerados.")

Recalculando features dos Super-N√≥s (Modo Econ√¥mico)...
   Cruzando 327,208 faixas com mapa...
   Calculando centr√≥ides...
Features de Super-N√≥s prontas: (20047, 50)
Carregando Teste e Mapa...
 Separando casos...
   Via Lookup direto: 5,377
   Via KNN (Cold-Start): 57,514
Iniciando KNN para 57,514 m√∫sicas (Batch: 2000)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29/29 [00:01<00:00, 19.32it/s]


Mapeamento conclu√≠do. Total: 62,891
Anexando Embeddings Finais...
Salvando 62,891 linhas em: /Users/lucasborges/Downloads/TCC/data/processed/new_track_embeddings_mean.parquet

CONCLU√çDO! Embeddings de teste gerados.
