In [1]:
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================================================================
# 1. CARGA DE CONFIGURAÇÃO
# ==============================================================================

def find_project_root(anchor_file="conf/config.yaml"):
    """
    Locate the project root by walking upward from the current working directory.

    The current directory is checked first, then each of its ancestors in
    order, and the first directory in which `anchor_file` exists is returned.

    Args:
        anchor_file: Path, relative to a candidate directory, whose existence
            marks that directory as the project root.

    Returns:
        pathlib.Path of the closest directory containing `anchor_file`.

    Raises:
        FileNotFoundError: if no directory up to the filesystem root
            contains `anchor_file`.
    """
    start_dir = Path.cwd()
    candidates = (start_dir, *start_dir.parents)

    # Closest match wins: `candidates` is ordered from the cwd outwards.
    root = next(
        (folder for folder in candidates if (folder / anchor_file).exists()),
        None,
    )
    if root is None:
        raise FileNotFoundError(f"N√£o foi poss√≠vel encontrar a raiz do projeto contendo '{anchor_file}'.")
    return root

# 1. Define BASE_DIR (project root)
try:
    BASE_DIR = find_project_root("conf/config.yaml")
    print(f"üìÇ Raiz do Projeto encontrada: {BASE_DIR}")
except FileNotFoundError:
    # Manual fallback when the automatic search fails.
    # NOTE(review): this absolute path is machine-specific; adjust it (or run
    # the notebook from inside the project tree) on any other machine.
    print("Busca autom√°tica falhou. Usando fallback.")
    BASE_DIR = Path("/Users/lucasborges/Downloads/TCC")

# 2. Load the YAML from the conf folder
CONFIG_PATH = BASE_DIR / "conf/config.yaml"
# Explicit UTF-8 so the file is read identically regardless of the OS locale.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# ==============================================================================
# 2. SHORTCUTS AND GLOBAL VARIABLES
# ==============================================================================

# Shortcuts into the YAML dictionaries
P = {k: BASE_DIR / v for k, v in config['paths'].items()} # P for Paths (resolved against BASE_DIR)
F = config['files']                                       # F for Files
PM = config['params']                                     # PM for Params

print(f"‚öôÔ∏è Configura√ß√£o carregada de: {CONFIG_PATH}")

# ==============================================================================
# 3. VARIABLE BRIDGE
# ==============================================================================

# Graph directories: prefer the dedicated key and fall back to 'graphs_bipartite'
# only when the dedicated key is absent. The fallback is looked up lazily so a
# config that defines the dedicated key but not 'graphs_bipartite' still loads
# (`P.get(key, P['graphs_bipartite'])` would evaluate the default eagerly).
_coarsened_dir = P['graphs_coarsened'] if 'graphs_coarsened' in P else P['graphs_bipartite']
_super_dir     = P['graphs_super'] if 'graphs_super' in P else P['graphs_bipartite']

# File paths (driven by the YAML)
TRAIN_EMB_PATH       = P['processed'] / F['track_embeddings']
NEW_EMB_PATH         = P['processed'] / F['new_track_embeddings']
X_TRAIN_PATH         = P['processed'] / F['train_features']
X_TEST_PATH          = P['processed'] / F['test_features']
MATCHING_MAP_PATH    = _coarsened_dir / F['matching_map']
SUPER_EMB_PATH       = _super_dir / F['super_embeddings']

# Parameters
SEED                 = PM['seed']

# Default visual settings
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 300

üìÇ Raiz do Projeto encontrada: /Users/lucasborges/Downloads/TCC
‚öôÔ∏è Configura√ß√£o carregada de: /Users/lucasborges/Downloads/TCC/conf/config.yaml


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import gc

# --- DYNAMIC CONFIGURATION (using P and F) ---
print("Mapeando arquivos para Infer√™ncia (Cold-Start)...")

# Inputs
X_TEST_PATH       = P['processed'] / F['test_features']
X_TRAIN_PATH      = P['processed'] / F['train_features'] # Needed to compute the centroids
# Same directory fallback the config cell applies: use the dedicated graph
# folder when the YAML defines it, otherwise 'graphs_bipartite'. Rebinding
# these names with a bare P['graphs_coarsened'] would discard that fallback.
MATCHING_MAP_PATH = (P['graphs_coarsened'] if 'graphs_coarsened' in P else P['graphs_bipartite']) / F['matching_map']
SUPER_EMB_PATH    = (P['graphs_super'] if 'graphs_super' in P else P['graphs_bipartite']) / F['super_embeddings']

# Output
OUT_NEW_EMB       = P['processed'] / F['new_track_embeddings']

# Validation: warn early about every required input, including X_train,
# which the centroid computation cannot run without.
for path in [X_TEST_PATH, X_TRAIN_PATH, MATCHING_MAP_PATH, SUPER_EMB_PATH]:
    if not path.exists():
        print(f"Aviso: Arquivo n√£o encontrado: {path.name}")

print(f"Output definido para: {OUT_NEW_EMB.name}")

Mapeando arquivos para Infer√™ncia (Cold-Start)...
Output definido para: new_track_embeddings_mean.parquet


In [3]:
def resolve_cold_start_weighted(df_test, df_super_feats, df_super_emb, feat_cols, batch_size=2000, k=5):
    """
    Infer embeddings for new tracks by distance-weighted k-NN over super-nodes.

    Each test track is matched, in standardized feature space, to its nearest
    super-node centroids. Its embedding is the softmax(-distance)-weighted
    average of those super-nodes' embeddings, re-normalized to unit L2 norm.

    Args:
        df_test: DataFrame with a "track_uri" column and every column in
            `feat_cols`. Missing feature values are replaced by 0 before scaling.
        df_super_feats: DataFrame with "super_track_id" and every column in
            `feat_cols` (one row per super-node).
        df_super_emb: DataFrame with "super_track_id" and the embedding columns
            (names starting with "emb_"); at most one row per super-node.
        feat_cols: Feature column names used for the neighbor search.
        batch_size: Number of test rows processed per iteration.
        k: Number of neighbors; capped at the number of super-nodes.

    Returns:
        DataFrame with "track_uri" followed by the "emb_*" columns, one row per
        row of `df_test`, in the same order. Empty (columns only) when
        `df_test` has no rows.

    Raises:
        pandas.errors.MergeError: if `df_super_emb` has duplicate
            "super_track_id" values, which would misalign embeddings and
            neighbor indices.
    """
    print(f"Iniciando KNN Ponderado (K={k}) para {len(df_test):,} m√∫sicas...")

    emb_cols = [c for c in df_super_emb.columns if c.startswith("emb_")]

    # Nothing to infer: return an empty, correctly shaped frame instead of
    # letting pd.concat([]) raise at the end.
    if len(df_test) == 0:
        return pd.DataFrame(columns=["track_uri"] + emb_cols)

    # 1. Preparation: fix the super-node order so row i of the feature matrix
    # and row i of the embedding matrix refer to the same super-node.
    df_super_feats = df_super_feats.sort_values("super_track_id").reset_index(drop=True)
    # NOTE(review): unlike the test batch, NaNs in the super-node features are
    # not filled here — confirm the centroids cannot contain NaN.
    X_super = df_super_feats[feat_cols].to_numpy().astype('float32')

    scaler = StandardScaler()
    X_super_scaled = scaler.fit_transform(X_super)

    # kneighbors() rejects n_neighbors greater than the number of fitted samples.
    n_neighbors = min(k, len(df_super_feats))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean', n_jobs=-1)
    knn.fit(X_super_scaled)

    # Left join keeps the sorted super-node order; `validate` guarantees the
    # join cannot add rows, which would shift every index returned by the KNN.
    emb_lookup = df_super_feats[["super_track_id"]].merge(
        df_super_emb, on="super_track_id", how="left", validate="many_to_one"
    )
    # Super-nodes without an embedding row contribute a zero vector.
    emb_matrix = emb_lookup[emb_cols].fillna(0).to_numpy().astype('float32')

    results = []

    for start in tqdm(range(0, len(df_test), batch_size)):
        batch = df_test.iloc[start : start + batch_size]
        X_batch = batch[feat_cols].fillna(0).to_numpy().astype('float32')
        X_batch_scaled = scaler.transform(X_batch)

        dists, indices = knn.kneighbors(X_batch_scaled)

        # --- WEIGHTS (numerically stable softmax over negative distances) ---
        # Subtracting the row maximum before exp() avoids overflow/underflow
        # without changing the resulting weights.
        neg_dists = -dists
        max_neg_dists = np.max(neg_dists, axis=1, keepdims=True)
        exp_x = np.exp(neg_dists - max_neg_dists)
        weights = exp_x / np.sum(exp_x, axis=1, keepdims=True)  # [Batch, K]

        # --- VECTORIZED INTERPOLATION ---
        # neighbor_embs shape: [Batch, K, Emb_Dim]
        neighbor_embs = emb_matrix[indices]

        # (Batch, 1, K) @ (Batch, K, Emb_Dim) -> (Batch, 1, Emb_Dim)
        batch_embs = np.matmul(weights[:, np.newaxis, :], neighbor_embs).squeeze(1)

        # --- L2 RE-NORMALIZATION ---
        # Rescale each inferred embedding to unit length; the epsilon avoids
        # division by zero when the interpolated vector is all zeros.
        norms = np.linalg.norm(batch_embs, axis=1, keepdims=True)
        batch_embs = batch_embs / (norms + 1e-10)

        res_df = batch[["track_uri"]].copy()
        res_df[emb_cols] = batch_embs
        results.append(res_df)

    return pd.concat(results, ignore_index=True)


def get_super_features_light():
    """
    Compute the mean feature vector (centroid) of every super-node.

    Reads the track -> super-node map from MATCHING_MAP_PATH and the training
    features from X_TRAIN_PATH, joins them on "track_uri", and averages every
    numeric feature column per "super_track_id". Large intermediates are
    deleted as soon as they are no longer needed to keep peak memory low.

    Returns:
        (super_feats, cols): `super_feats` is a DataFrame with
        "super_track_id" plus one averaged column per feature; `cols` is the
        list of feature column names that were averaged.

    Raises:
        FileNotFoundError: if X_TRAIN_PATH does not exist.
    """
    print("Recalculando features dos Super-N√≥s (Modo Econ√¥mico)...")

    # 1. Load the map (track -> super-node)
    map_df = pd.read_parquet(MATCHING_MAP_PATH)
    col_uri = "original_track_uri" if "original_track_uri" in map_df.columns else "track_uri"
    # Select before renaming: if the map carries both "original_track_uri" and
    # "track_uri", renaming first would create two "track_uri" columns and
    # break the merge below.
    map_df = map_df[[col_uri, "super_track_id"]].rename(columns={col_uri: "track_uri"})

    # 2. Load X_train (original feature table)
    if not X_TRAIN_PATH.exists():
        raise FileNotFoundError(f"X_train n√£o encontrado em {X_TRAIN_PATH}")

    df_train = pd.read_parquet(X_TRAIN_PATH)
    if "track_uri" not in df_train.columns and "id" in df_train.columns:
        df_train = df_train.rename(columns={"id": "track_uri"})

    # 3. Attach each training track to its super-node
    print(f"   Cruzando {len(df_train):,} faixas com mapa...")
    merged = df_train.merge(map_df, on="track_uri", how="inner")

    # Free the inputs right away; only the joined frame is needed from here on.
    del df_train, map_df
    gc.collect()

    # Feature columns: every bool/int/float/complex column except the two keys.
    cols = [c for c in merged.columns if c not in ["track_uri", "super_track_id"] and merged[c].dtype.kind in 'bifc']

    # 4. Group by super-node and average (the super-node centroid)
    print("   Calculando centr√≥ides...")
    super_feats = merged.groupby("super_track_id")[cols].mean().reset_index()

    del merged
    gc.collect()

    return super_feats, cols

In [4]:
# --- RUN ---

# 1. Load the GNN embeddings of the super-nodes
print("Carregando Embeddings dos Super-N√≥s...")
df_super_emb = pd.read_parquet(SUPER_EMB_PATH)

# 2. Get the centroid features of the super-nodes
df_super_feats, feat_cols = get_super_features_light()
print(f"Features de Super-N√≥s prontas: {df_super_feats.shape}")

# 3. Load the test set and the track -> super-node map
print("Carregando Teste e Mapa...")
df_test = pd.read_parquet(X_TEST_PATH)
if "track_uri" not in df_test.columns and "id" in df_test.columns:
    df_test = df_test.rename(columns={"id": "track_uri"})

map_df = pd.read_parquet(MATCHING_MAP_PATH)
col_uri = "original_track_uri" if "original_track_uri" in map_df.columns else "track_uri"
# Select before renaming so a map that holds both URI columns cannot end up
# with two "track_uri" columns.
map_df = map_df[[col_uri, "super_track_id"]].rename(columns={col_uri: "track_uri"})

# 4. Split the cases
print(" Separando casos (Lookup vs KNN)...")
# Lookup: test tracks that are already part of the graph
lookup_ids = df_test[["track_uri"]].merge(map_df, on="track_uri", how="inner")
# Resolve the lookup BEFORE defining the orphans: a mapped track whose
# super-node has no embedding row is dropped by this inner join, and must then
# go through the KNN path instead of silently disappearing from the output.
df_lookup = lookup_ids.merge(df_super_emb, on="super_track_id", how="inner").drop(columns=["super_track_id"])
known_uris = set(df_lookup["track_uri"])

# Orphans: every test track that did not get an embedding via lookup
df_orphans = df_test[~df_test["track_uri"].isin(known_uris)].copy()

print(f"   Via Lookup direto: {len(df_lookup):,}")
print(f"   Via KNN (Cold-Start): {len(df_orphans):,}")

final_parts = []

# CASE A: direct lookup
if len(df_lookup) > 0:
    print("Processando Lookup Direto...")
    final_parts.append(df_lookup)

# CASE B: weighted KNN
if len(df_orphans) > 0:
    print("Processando KNN Ponderado...")
    df_knn = resolve_cold_start_weighted(
        df_orphans,
        df_super_feats,
        df_super_emb,
        feat_cols,
        batch_size=2000,
        k=5
    )
    final_parts.append(df_knn)

# 5. Concatenate and save
if final_parts:
    df_final = pd.concat(final_parts, ignore_index=True)
    print(f"Salvando {len(df_final):,} linhas em: {OUT_NEW_EMB}")
    df_final.to_parquet(OUT_NEW_EMB, index=False)
    print("\nCONCLU√çDO! Embeddings de teste gerados (Refined Approach).")
else:
    print("Aviso: Nenhuma m√∫sica processada.")

Carregando Embeddings dos Super-N√≥s...
Recalculando features dos Super-N√≥s (Modo Econ√¥mico)...
   Cruzando 324,305 faixas com mapa...
   Calculando centr√≥ides...
Features de Super-N√≥s prontas: (20641, 52)
Carregando Teste e Mapa...
 Separando casos (Lookup vs KNN)...
   Via Lookup direto: 0
   Via KNN (Cold-Start): 60,417
Processando KNN Ponderado...
Iniciando KNN Ponderado (K=5) para 60,417 m√∫sicas...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 31/31 [00:02<00:00, 14.34it/s]


Salvando 60,417 linhas em: /Users/lucasborges/Downloads/TCC/data/processed/new_track_embeddings_mean.parquet

CONCLU√çDO! Embeddings de teste gerados (Refined Approach).


In [5]:
import numpy as np
import pandas as pd

def validate_embeddings_norm(df, name="Dataset", atol=1e-3):
    """
    Report whether every embedding row of `df` has unit L2 norm.

    Embedding columns are those whose name starts with "emb_". The function
    prints the mean, standard deviation and range of the row norms, followed
    by a success or alert message.

    Args:
        df: DataFrame holding the embedding columns.
        name: Label used in the printed report.
        atol: Absolute tolerance around 1.0 passed to `np.allclose`.

    Returns:
        True when all row norms are within tolerance of 1.0; False otherwise,
        including when `df` has no embedding columns or no rows (there is
        nothing that could be confirmed as valid).
    """
    # 1. Identify the embedding columns
    emb_cols = [c for c in df.columns if c.startswith("emb_")]
    if not emb_cols:
        print(f"‚ùå {name}: Nenhuma coluna de embedding encontrada.")
        return False

    # np.min / np.max raise ValueError on a zero-size array, so stop here.
    if len(df) == 0:
        print(f"Aviso: {name}: nenhuma linha para validar.")
        return False

    # 2. Convert to numpy
    embs = df[emb_cols].to_numpy()

    # 3. L2 norm of every row
    norms = np.linalg.norm(embs, axis=1)

    # 4. Summary statistics
    mean_norm = np.mean(norms)
    std_norm = np.std(norms)
    min_norm = np.min(norms)
    max_norm = np.max(norms)

    # 5. Tolerance check (allows for floating-point error around 1.0)
    is_valid = bool(np.allclose(norms, 1.0, atol=atol))

    print(f"=== Valida√ß√£o: {name} ===")
    print(f"Norma M√©dia: {mean_norm:.6f} (¬±{std_norm:.6f})")
    print(f"Range: [{min_norm:.6f} a {max_norm:.6f}]")

    if is_valid:
        print("‚úÖ SUCESSO: Todos os vetores possuem norma unit√°ria (1.0).")
    else:
        print("‚ö†Ô∏è ALERTA: Detectados vetores fora da norma unit√°ria.")
        print("   Isso pode causar vi√©s na Similaridade de Cosseno no S7.")
    print("-" * 30)

    return is_valid

# --- RUN ---
# Check the two embedding files read from the processed folder: the training
# track embeddings and the embeddings inferred for the new tracks.
try:
    df_train_val = pd.read_parquet(P['processed'] / F['track_embeddings'])
    df_test_val = pd.read_parquet(P['processed'] / F['new_track_embeddings'])
except Exception as e:
    # Only loading problems are reported here; the message says exactly that.
    print(f"Erro ao carregar arquivos para valida√ß√£o: {e}")
else:
    # Validation runs outside the `try` so a failure in the check itself is
    # not mislabeled as a file-loading error.
    validate_embeddings_norm(df_train_val, "Tracks de Treino (GraphSAGE)")
    validate_embeddings_norm(df_test_val, "Tracks Novas (Weighted k-NN)")

=== Valida√ß√£o: Tracks de Treino (GraphSAGE) ===
Norma M√©dia: 1.000000 (¬±0.000000)
Range: [1.000000 a 1.000000]
‚úÖ SUCESSO: Todos os vetores possuem norma unit√°ria (1.0).
------------------------------
=== Valida√ß√£o: Tracks Novas (Weighted k-NN) ===
Norma M√©dia: 1.000000 (¬±0.000000)
Range: [1.000000 a 1.000000]
‚úÖ SUCESSO: Todos os vetores possuem norma unit√°ria (1.0).
------------------------------
