# Sistema Interativo de Recomendação de Filmes

**Objetivo**: Comparar 3 abordagens de recomendação + Interface interativa

**Dataset**: MovieLens 990k ratings (HuggingFace)

**Estrutura**:
- **Parte 1 (Células 1-15)**: Implementação técnica + métricas
- **Parte 2 (Células 16-21)**: Sistema interativo com Jupyter Widgets

## PARTE 1: Implementação Técnica

### Fase 1: Setup e Análise Exploratória

In [None]:
# C√©lula 1: Importa√ß√µes
import os
import gc
import pickle
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# ML & Embeddings
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

# Dataset
from datasets import load_dataset

# Widgets (Parte 2)
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

print("‚úÖ Todas as bibliotecas importadas com sucesso!")

In [None]:
# Cell 2: Global configuration

# --- Filesystem layout ---
BASE_DIR = Path('.')
CACHE_DIR = BASE_DIR / 'cache'
RESULTS_DIR = BASE_DIR / 'results'

# Make sure both output folders exist; re-running the cell is harmless.
for dir_path in (CACHE_DIR, RESULTS_DIR):
    os.makedirs(dir_path, exist_ok=True)

# --- Evaluation parameters ---
K = 10                      # length of every top-K recommendation list
MIN_RATING_THRESHOLD = 3.0  # a test rating >= this counts as "relevant" (was 4.0)
RANDOM_STATE = 42

# --- Models ---
# Sentence-embedding model used for the content-based approach
# (previously the L6-v2 variant).
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L12-v2'

# --- Memory-related knobs ---
EMBEDDING_BATCH_SIZE = 64     # texts encoded per mini-batch (was 32)
SIMILARITY_CHUNK_SIZE = 2000  # movies per similarity chunk (was 1000)
# Number of most-similar neighbours stored per movie.
# NOTE(review): the previous comment on this line said "top-150 (was 100)"
# while the value is 2000 (identical to the chunk size) -- confirm that
# 2000 is really intended.
TOP_K_SIMILAR = 2000

# --- Hybrid recommender parameters ---
ALPHA_MIN_RATINGS = 5   # below this many ratings alpha stays at its floor
ALPHA_MAX_RATINGS = 50  # rating count at which alpha reaches its 0.9 cap

print(
    "‚úÖ Configura√ß√£o ROBUSTA completa!",
    f"   - Cache: {CACHE_DIR.resolve()}",
    f"   - Results: {RESULTS_DIR.resolve()}",
    f"   - Otimiza√ß√µes: Batch={EMBEDDING_BATCH_SIZE}, Chunks={SIMILARITY_CHUNK_SIZE}, Top-K={TOP_K_SIMILAR}",
    f"   - Modelo: {EMBEDDING_MODEL}",
    f"   - Threshold: {MIN_RATING_THRESHOLD}",
    sep="\n",
)

In [None]:
# Cell 3: Download and load the dataset (with an on-disk cache)

dataset_cache = CACHE_DIR / 'dataset_processed.pkl'

if dataset_cache.exists():
    print("üìÇ Carregando dataset do cache...")
    # NOTE: pickle executes arbitrary code on load; this is only acceptable
    # because the file is one this notebook wrote itself.
    with open(dataset_cache, 'rb') as f:
        data = pickle.load(f)
        train_df = data['train']
        test_df = data['test']
        movies_df = data['movies']
else:
    print("üì• Baixando dataset do HuggingFace (pode demorar ~1-2 min)...")
    dataset = load_dataset("ashraq/movielens_ratings")

    train_df = dataset['train'].to_pandas()
    test_df = dataset['validation'].to_pandas()

    print("üîÑ Processando metadados dos filmes...")
    # One metadata row per movie, taken from the train split only.
    movies_df = train_df.groupby('movie_id').agg({
        'title': 'first',
        'genres': 'first',
        'imdbId': 'first',
        'tmdbId': 'first'
    }).reset_index()

    # Text fed to the embedding model: "<title> <genre> <genre> ...".
    # Missing titles/genres are treated as empty strings; without this a
    # single NaN would make the whole text NaN (a float), which the sentence
    # encoder cannot handle.
    movies_df['text'] = movies_df['title'].fillna('') + ' ' + \
        movies_df['genres'].fillna('').str.replace('|', ' ', regex=False)

    # Persist the processed frames so later runs skip the download.
    with open(dataset_cache, 'wb') as f:
        pickle.dump({'train': train_df, 'test': test_df,
                    'movies': movies_df}, f)
    print(f"üíæ Dataset salvo em cache: {dataset_cache}")

print(f"\n‚úÖ Dataset carregado!")
print(f"   - Train: {len(train_df):,} ratings")
print(f"   - Test: {len(test_df):,} ratings")
print(f"   - Filmes √∫nicos: {len(movies_df):,}")
print(f"   - Usu√°rios √∫nicos: {train_df['user_id'].nunique():,}")

In [None]:
# [UTILITY] Clear cache files so they get regenerated
#
# IMPORTANT: run this cell ONLY when cached artifacts must be rebuilt after
# changing parameters in Cell 2.

import os


def clear_cache_files(cache_dir=None):
    """
    Delete the cache files that depend on the configurable parameters.

    Use after changing MIN_RATING_THRESHOLD, EMBEDDING_MODEL, TOP_K_SIMILAR,
    etc. Files that do not depend on those parameters (processed dataset,
    user-item matrix) are left untouched.

    Args:
        cache_dir: Directory holding the cache files. Defaults to the
            notebook-level CACHE_DIR when omitted.

    Returns:
        None. A summary of removed / missing files is printed.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR

    files_to_remove = [
        'movie_embeddings.pkl',      # depends on EMBEDDING_MODEL
        'tsne_2d.pkl',               # computed from the movie embeddings
        'item_similarity_topk.pkl',  # depends on TOP_K_SIMILAR
        'metrics_collaborative.pkl', # depends on MIN_RATING_THRESHOLD
        'metrics_content.pkl',       # depends on MIN_RATING_THRESHOLD
        'metrics_hybrid.pkl',        # depends on MIN_RATING_THRESHOLD
    ]

    removed = []
    not_found = []

    for filename in files_to_remove:
        filepath = os.path.join(cache_dir, filename)
        try:
            os.remove(filepath)
        except FileNotFoundError:
            not_found.append(filename)
        except OSError as e:
            # Best effort: report the failure and keep going with the rest.
            print(f"‚ùå Erro ao remover {filename}: {e}")
        else:
            removed.append(filename)

    print("üóëÔ∏è  LIMPEZA DE CACHE COMPLETA")
    print("="*60)

    if removed:
        print(f"\n‚úÖ Arquivos removidos ({len(removed)}):")
        for f in removed:
            print(f"   - {f}")

    if not_found:
        print(f"\n‚ÑπÔ∏è  Arquivos j√° n√£o existiam ({len(not_found)}):")
        for f in not_found:
            print(f"   - {f}")

    # The hints name the parameters rather than hard-coding values, so they
    # cannot drift out of sync with Cell 2.
    print("\n" + "="*60)
    print("üìã PR√ìXIMOS PASSOS:")
    print("="*60)
    print("1. Re-execute a C√©lula 2 para carregar a nova configura√ß√£o")
    print("2. Re-execute as c√©lulas afetadas:")
    print("   - C√©lula 6: Similaridade item-item (novo TOP_K_SIMILAR)")
    print("   - C√©lula 8: Embeddings (novo EMBEDDING_MODEL)")
    print("   - C√©lulas 7, 10, 12: M√©tricas (novo MIN_RATING_THRESHOLD)")
    print("   - C√©lula 14: T-SNE (novos embeddings)")
    print("="*60)

# Uncomment the line below to actually perform the cleanup:
# clear_cache_files()

print("‚ö†Ô∏è  C√©lula de limpeza de cache carregada.")
print("Para limpar o cache, descomente a √∫ltima linha e execute esta c√©lula.")

In [None]:
# Cell 4: Exploratory analysis (2x2 figure saved to RESULTS_DIR)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Rating distribution, with the relevance threshold marked
ax = axes[0, 0]
train_df['rating'].hist(bins=10, ax=ax, color='steelblue', edgecolor='black')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequ√™ncia')
ax.set_title('Distribui√ß√£o de Ratings')
ax.axvline(MIN_RATING_THRESHOLD, color='red', linestyle='--', label=f'Threshold={MIN_RATING_THRESHOLD}')
ax.legend()

# 2. Top-20 most-rated movies
ax = axes[0, 1]
top_movies = train_df['movie_id'].value_counts().head(20)
# Build the id -> title lookup once instead of scanning movies_df per movie.
title_by_movie_id = movies_df.drop_duplicates('movie_id').set_index('movie_id')['title']
movie_titles = [title_by_movie_id.loc[mid][:30] for mid in top_movies.index]
# Size the bars from the data so a catalogue with fewer than 20 movies
# does not raise a shape mismatch.
bar_positions = range(len(top_movies))
ax.barh(bar_positions, top_movies.values, color='coral')
ax.set_yticks(bar_positions)
ax.set_yticklabels(movie_titles, fontsize=8)
ax.set_xlabel('N¬∫ de Avalia√ß√µes')
ax.set_title('Top-20 Filmes Mais Avaliados')
ax.invert_yaxis()

# 3. Genre distribution (a movie counts once for each of its genres)
ax = axes[1, 0]
all_genres = []
for genres_str in movies_df['genres'].dropna():
    all_genres.extend(genres_str.split('|'))
genre_counts = Counter(all_genres).most_common(15)
ax.bar([g[0] for g in genre_counts], [g[1] for g in genre_counts], color='mediumseagreen')
ax.set_xlabel('G√™nero')
ax.set_ylabel('N¬∫ de Filmes')
ax.set_title('Top-15 G√™neros')
ax.tick_params(axis='x', rotation=45)

# 4. Sparsity of the user x movie rating matrix (text-only panel)
ax = axes[1, 1]
n_users = train_df['user_id'].nunique()
n_movies = train_df['movie_id'].nunique()
n_ratings = len(train_df)
sparsity = 1 - (n_ratings / (n_users * n_movies))

ax.text(0.5, 0.6, "Sparsidade da Matriz", ha='center', fontsize=16, fontweight='bold')
ax.text(0.5, 0.4, f"{sparsity*100:.2f}%", ha='center', fontsize=48, color='red')
ax.text(0.5, 0.25, f"{n_users:,} usu√°rios √ó {n_movies:,} filmes", ha='center', fontsize=12)
ax.text(0.5, 0.15, f"{n_ratings:,} ratings (~{n_ratings/(n_users*n_movies)*100:.3f}% preenchido)",
        ha='center', fontsize=10, style='italic')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'exploratory_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ An√°lise explorat√≥ria salva em: {RESULTS_DIR / 'exploratory_analysis.png'}")

### Fase 2: Abordagem 1 - Filtragem Colaborativa Item-Item

In [None]:
# Cell 5: Build the sparse user x item rating matrix (cached on disk)

matrix_cache = CACHE_DIR / 'user_item_matrix.pkl'

if not matrix_cache.exists():
    print("üîÑ Construindo matriz esparsa usu√°rio-item...")

    # Contiguous 0-based indices for users and movies, in sorted-id order.
    unique_users = sorted(train_df['user_id'].unique())
    unique_movies = sorted(train_df['movie_id'].unique())

    user_to_idx = dict(zip(unique_users, range(len(unique_users))))
    idx_to_user = dict(enumerate(unique_users))
    movie_to_idx = dict(zip(unique_movies, range(len(unique_movies))))
    idx_to_movie = dict(enumerate(unique_movies))

    # COO-style triplets (rating, (user row, movie column)) -> CSR matrix.
    rows = train_df['user_id'].map(user_to_idx).values
    cols = train_df['movie_id'].map(movie_to_idx).values
    data = train_df['rating'].values

    R_train = csr_matrix(
        (data, (rows, cols)),
        shape=(len(unique_users), len(unique_movies)),
    )

    # Persist the matrix together with every id <-> index mapping.
    matrix_data = {
        'R_train': R_train,
        'user_to_idx': user_to_idx,
        'idx_to_user': idx_to_user,
        'movie_to_idx': movie_to_idx,
        'idx_to_movie': idx_to_movie
    }
    with open(matrix_cache, 'wb') as f:
        pickle.dump(matrix_data, f)
    print(f"üíæ Matriz salva em cache: {matrix_cache}")
else:
    print("üìÇ Carregando matriz do cache...")
    with open(matrix_cache, 'rb') as f:
        matrix_data = pickle.load(f)
    R_train = matrix_data['R_train']
    user_to_idx = matrix_data['user_to_idx']
    idx_to_user = matrix_data['idx_to_user']
    movie_to_idx = matrix_data['movie_to_idx']
    idx_to_movie = matrix_data['idx_to_movie']

print(f"\n‚úÖ Matriz constru√≠da: {R_train.shape}")
print(f"   - Mem√≥ria: ~{R_train.data.nbytes / 1024**2:.2f} MB")
print(f"   - Valores n√£o-nulos: {R_train.nnz:,} ({R_train.nnz / np.prod(R_train.shape) * 100:.3f}% densidade)")

In [None]:
# Cell 6: Item-item cosine similarity (chunked computation, top-K storage)

similarity_cache = CACHE_DIR / 'item_similarity_topk.pkl'

if similarity_cache.exists():
    print("üìÇ Carregando similaridade do cache...")
    with open(similarity_cache, 'rb') as f:
        item_similarity_topk = pickle.load(f)
else:
    print("üîÑ Calculando similaridade item-item (chunked, pode demorar ~10-15 min)...")

    n_movies = R_train.shape[1]
    R_items = R_train.T  # transpose: movies x users

    # movie index -> {'indices': neighbour indices, 'scores': similarities},
    # both ordered from most to least similar.
    item_similarity_topk = {}

    for start_idx in tqdm(range(0, n_movies, SIMILARITY_CHUNK_SIZE), desc="Chunks"):
        end_idx = min(start_idx + SIMILARITY_CHUNK_SIZE, n_movies)
        chunk = R_items[start_idx:end_idx]

        # Dense similarity of this chunk against ALL movies.
        chunk_sim = cosine_similarity(chunk, R_items, dense_output=True)

        for i, movie_idx in enumerate(range(start_idx, end_idx)):
            sim_scores = chunk_sim[i]
            # Push the movie itself to the bottom of its own ranking.
            sim_scores[movie_idx] = -1

            # The reversed/sliced argsort result is only a VIEW of the full
            # n_movies-long array; copy it so each entry keeps just its
            # top-K indices alive instead of the whole ranking.
            top_k_indices = np.argsort(sim_scores)[::-1][:TOP_K_SIMILAR].copy()
            top_k_scores = sim_scores[top_k_indices]

            item_similarity_topk[movie_idx] = {
                'indices': top_k_indices,
                'scores': top_k_scores
            }

        # Drop the dense chunk before allocating the next one.
        del chunk_sim
        gc.collect()

    with open(similarity_cache, 'wb') as f:
        pickle.dump(item_similarity_topk, f)
    print(f"üíæ Similaridade salva em cache: {similarity_cache}")

print(f"\n‚úÖ Similaridade calculada!")
print(f"   - {len(item_similarity_topk)} filmes √ó top-{TOP_K_SIMILAR} similares")
print(f"   - Mem√≥ria estimada: ~{len(item_similarity_topk) * TOP_K_SIMILAR * 8 / 1024**2:.2f} MB")

In [None]:
# C√©lula 7 OTIMIZADA: Implementar e Avaliar Filtragem Colaborativa
# VERS√ÉO R√ÅPIDA: ~5-10 minutos ao inv√©s de 5 horas

def recommend_collaborative(user_id, k=10, return_scores=False):
    """
    Recommend movies with item-item collaborative filtering.

    Every movie the user rated "votes" for its stored nearest neighbours with
    weight rating * similarity; a candidate's final score is the mean of the
    votes it received. Movies the user already rated are never returned.
    Falls back to the most-rated movies in the train set when the user is
    unknown, has no ratings, or no candidate could be scored.

    Args:
        user_id: ID of the user
        k: Number of recommendations
        return_scores: If True, return (movie_ids, scores)

    Returns:
        list: Recommended movie IDs (or a tuple if return_scores=True)
    """

    def _popular_fallback():
        # Most-rated movies in train, each with a placeholder score of 1.0.
        popular = train_df['movie_id'].value_counts().head(k).index.tolist()
        if return_scores:
            return popular, [1.0] * len(popular)
        return popular

    if user_id not in user_to_idx:
        return _popular_fallback()

    ratings_row = R_train[user_to_idx[user_id]].toarray().flatten()
    rated_movie_indices = np.where(ratings_row > 0)[0]
    if len(rated_movie_indices) == 0:
        return _popular_fallback()

    # Only neighbours of rated movies are touched, never the full catalogue.
    vote_totals = defaultdict(float)
    vote_counts = defaultdict(int)

    for rated_idx in rated_movie_indices:
        neighbours = item_similarity_topk.get(rated_idx)
        if neighbours is None:
            continue

        user_rating = ratings_row[rated_idx]
        for sim_idx, sim_score in zip(neighbours['indices'], neighbours['scores']):
            if sim_idx != rated_idx:  # a movie never votes for itself
                candidate_id = idx_to_movie[sim_idx]
                vote_totals[candidate_id] += user_rating * sim_score
                vote_counts[candidate_id] += 1

    # Average the votes and drop everything the user has already rated.
    already_rated = {idx_to_movie[idx] for idx in rated_movie_indices}
    pred_scores = {
        candidate_id: total / vote_counts[candidate_id]
        for candidate_id, total in vote_totals.items()
        if candidate_id not in already_rated
    }

    if not pred_scores:
        return _popular_fallback()

    ranked = sorted(pred_scores.items(), key=lambda item: item[1], reverse=True)
    recommended_movie_ids = [candidate_id for candidate_id, _ in ranked[:k]]

    if not return_scores:
        return recommended_movie_ids
    return recommended_movie_ids, [pred_scores[mid] for mid in recommended_movie_ids]


# === EVALUATION ON THE TEST SET ===
print("üîÑ Avaliando filtragem colaborativa no test set (pode demorar ~5-10 min)...\n")

test_users = test_df['user_id'].unique()
precisions, recalls, ndcgs = [], [], []
skipped_no_relevant = 0
skipped_not_in_train = 0

# DCG position discounts 1 / log2(rank + 1) for ranks 1..K.
dcg_discounts = 1.0 / np.log2(np.arange(2, K + 2))

for user_id in tqdm(test_users[:1000], desc="Avaliando"):
    # Users absent from train cannot be evaluated.
    if user_id not in user_to_idx:
        skipped_not_in_train += 1
        continue

    # Ground truth: movies the user liked in the TEST split.
    user_test = test_df[test_df['user_id'] == user_id]
    relevant_items = set(
        user_test[user_test['rating'] >= MIN_RATING_THRESHOLD]['movie_id'].values)

    if len(relevant_items) == 0:
        skipped_no_relevant += 1
        continue

    # Best effort: report a failing user and carry on with the others.
    try:
        recs = recommend_collaborative(user_id, k=K)
    except Exception as e:
        print(f"Erro no usu√°rio {user_id}: {e}")
        continue

    # Precision@K
    hits = len(set(recs) & relevant_items)
    precision = hits / K
    precisions.append(precision)

    # Recall@K
    recall = hits / len(relevant_items)
    recalls.append(recall)

    # NDCG@K with binary gains: DCG of the list as recommended, divided by
    # the DCG of an ideal list that ranks min(K, #relevant) relevant movies
    # first. Users without any hit contribute 0 instead of being dropped, so
    # all three metrics are averaged over the same set of users.
    gains = np.array([1.0 if mid in relevant_items else 0.0 for mid in recs[:K]])
    dcg = float(np.sum(gains * dcg_discounts[:len(gains)]))
    ideal_dcg = float(np.sum(dcg_discounts[:min(K, len(relevant_items))]))
    ndcgs.append(dcg / ideal_dcg)

print(f"\nüìä Filtragem Colaborativa - M√©tricas:")
print(
    f"   - Precision@{K}: {np.mean(precisions):.3f} ¬± {np.std(precisions):.3f}")
print(f"   - Recall@{K}: {np.mean(recalls):.3f} ¬± {np.std(recalls):.3f}")
print(f"   - NDCG@{K}: {np.mean(ndcgs):.3f} ¬± {np.std(ndcgs):.3f}")
print(f"\n   ‚ÑπÔ∏è  Usu√°rios avaliados: {len(precisions)}")
print(f"   ‚ö†Ô∏è  Pulados (n√£o no train): {skipped_not_in_train}")
print(f"   ‚ö†Ô∏è  Pulados (sem relevantes): {skipped_no_relevant}")

# Persist (mean, std) of each metric.
metrics_collaborative = {
    'precision': (np.mean(precisions), np.std(precisions)),
    'recall': (np.mean(recalls), np.std(recalls)),
    'ndcg': (np.mean(ndcgs), np.std(ndcgs))
}

with open(CACHE_DIR / 'metrics_collaborative.pkl', 'wb') as f:
    pickle.dump(metrics_collaborative, f)

print(f"\n‚úÖ M√©tricas salvas em: {CACHE_DIR / 'metrics_collaborative.pkl'}")

### Fase 3: Abordagem 2 - Filtragem Baseada em Conteúdo

In [None]:
# Cell 8: Generate movie embeddings in mini-batches (cached on disk)

embeddings_cache = CACHE_DIR / 'movie_embeddings.pkl'

if embeddings_cache.exists():
    print("üìÇ Carregando embeddings do cache...")
    with open(embeddings_cache, 'rb') as f:
        movie_embeddings = pickle.load(f)
else:
    print(f"üîÑ Gerando embeddings com {EMBEDDING_MODEL} (pode demorar ~15-20 min)...")

    model = SentenceTransformer(EMBEDDING_MODEL)

    # Encode in mini-batches to bound peak RAM usage.
    all_embeddings = []
    texts = movies_df['text'].tolist()

    for start_idx in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE), desc="Batches"):
        end_idx = min(start_idx + EMBEDDING_BATCH_SIZE, len(texts))
        batch_texts = texts[start_idx:end_idx]

        batch_emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
        all_embeddings.append(batch_emb)

        # Collect garbage each time a 1000-movie boundary is crossed. The
        # previous modulo test only fired when a batch end happened to be an
        # exact multiple of 1000.
        if start_idx // 1000 != end_idx // 1000:
            gc.collect()

    # Row i of the stacked matrix is the embedding of row i of movies_df.
    movie_embeddings = np.vstack(all_embeddings)

    with open(embeddings_cache, 'wb') as f:
        pickle.dump(movie_embeddings, f)
    print(f"üíæ Embeddings salvos em cache: {embeddings_cache}")

print(f"\n‚úÖ Embeddings gerados: {movie_embeddings.shape}")
print(f"   - Mem√≥ria: ~{movie_embeddings.nbytes / 1024**2:.2f} MB")

# movie_id -> row of movie_embeddings. Rows follow the positional order of
# movies_df, so map by position rather than by index label.
movie_id_to_emb_idx = {
    movie_id: position
    for position, movie_id in enumerate(movies_df['movie_id'])
}

In [None]:
# C√©lula 9: Construir Perfis de Usu√°rios e Implementar Recomenda√ß√£o

def build_user_profile_content(user_ratings_dict):
    """
    Build a user's content profile from the movies they rated.

    Each rated movie's embedding is scaled by the rating the user gave it and
    the scaled vectors are averaged. Movies without an embedding are ignored.

    Args:
        user_ratings_dict: {movie_id: rating}

    Returns:
        np.array: The user's profile vector. If none of the rated movies has
        an embedding, the plain mean of all movie embeddings is returned.
    """
    scaled_vectors = [
        rating * movie_embeddings[movie_id_to_emb_idx[movie_id]]
        for movie_id, rating in user_ratings_dict.items()
        if movie_id in movie_id_to_emb_idx
    ]

    if scaled_vectors:
        return np.mean(scaled_vectors, axis=0)

    # Nothing usable: fall back to the catalogue-wide average embedding.
    return np.mean(movie_embeddings, axis=0)


def recommend_content(user_id, k=10, return_scores=False):
    """
    Recommend movies with content-based filtering.

    The user's profile vector is compared (cosine similarity) against every
    movie embedding; the k most similar movies the user has not rated in the
    train set are returned. Users with no train ratings get the most-rated
    movies instead.

    Args:
        user_id: ID of the user
        k: Number of recommendations
        return_scores: If True, return (movie_ids, scores)

    Returns:
        list: Recommended movie IDs (or a tuple if return_scores=True)
    """
    user_data = train_df[train_df['user_id'] == user_id]

    if not len(user_data):
        # Unknown user: popularity fallback with placeholder scores of 1.0.
        top_movies = train_df['movie_id'].value_counts().head(k).index.tolist()
        return (top_movies, [1.0] * len(top_movies)) if return_scores else top_movies

    user_ratings_dict = dict(zip(user_data['movie_id'], user_data['rating']))
    user_profile = build_user_profile_content(user_ratings_dict)

    # One similarity value per row of movie_embeddings.
    similarities = cosine_similarity([user_profile], movie_embeddings)[0]

    # Already-rated movies must never rank: send them to the very bottom.
    for seen_movie_id in user_ratings_dict:
        emb_idx = movie_id_to_emb_idx.get(seen_movie_id)
        if emb_idx is not None:
            similarities[emb_idx] = -np.inf

    top_k_indices = np.argsort(similarities)[::-1][:k]
    recommended_movie_ids = [movies_df.iloc[idx]['movie_id'] for idx in top_k_indices]

    if not return_scores:
        return recommended_movie_ids
    return recommended_movie_ids, similarities[top_k_indices]


print("‚úÖ Fun√ß√µes de recomenda√ß√£o baseada em conte√∫do definidas!")

In [None]:
# Cell 10: Evaluate content-based filtering on the test set

print("üîÑ Avaliando filtragem baseada em conte√∫do no test set...\n")

precisions_content, recalls_content, ndcgs_content = [], [], []
skipped_no_relevant = 0
skipped_not_in_train = 0

# DCG position discounts 1 / log2(rank + 1) for ranks 1..K.
dcg_discounts = 1.0 / np.log2(np.arange(2, K + 2))

for user_id in tqdm(test_users[:1000], desc="Avaliando"):
    # Users without any train rating cannot be evaluated.
    user_data = train_df[train_df['user_id'] == user_id]
    if len(user_data) == 0:
        skipped_not_in_train += 1
        continue

    # Ground truth: movies the user liked in the TEST split.
    user_test = test_df[test_df['user_id'] == user_id]
    relevant_items = set(
        user_test[user_test['rating'] >= MIN_RATING_THRESHOLD]['movie_id'].values)

    if len(relevant_items) == 0:
        skipped_no_relevant += 1
        continue

    # Best effort: report a failing user and carry on with the others.
    try:
        recs = recommend_content(user_id, k=K)
    except Exception as e:
        print(f"Erro no usu√°rio {user_id}: {e}")
        continue

    # Precision@K and Recall@K
    hits = len(set(recs) & relevant_items)
    precision = hits / K
    precisions_content.append(precision)

    recall = hits / len(relevant_items)
    recalls_content.append(recall)

    # NDCG@K with binary gains: DCG of the list as recommended, divided by
    # the DCG of an ideal list that ranks min(K, #relevant) relevant movies
    # first. Users without any hit contribute 0 instead of being dropped, so
    # all three metrics are averaged over the same set of users.
    gains = np.array([1.0 if mid in relevant_items else 0.0 for mid in recs[:K]])
    dcg = float(np.sum(gains * dcg_discounts[:len(gains)]))
    ideal_dcg = float(np.sum(dcg_discounts[:min(K, len(relevant_items))]))
    ndcgs_content.append(dcg / ideal_dcg)

print(f"\nüìä Filtragem Baseada em Conte√∫do - M√©tricas:")
print(
    f"   - Precision@{K}: {np.mean(precisions_content):.3f} ¬± {np.std(precisions_content):.3f}")
print(
    f"   - Recall@{K}: {np.mean(recalls_content):.3f} ¬± {np.std(recalls_content):.3f}")
print(
    f"   - NDCG@{K}: {np.mean(ndcgs_content):.3f} ¬± {np.std(ndcgs_content):.3f}")
print(f"\n   ‚ÑπÔ∏è  Usu√°rios avaliados: {len(precisions_content)}")
print(f"   ‚ö†Ô∏è  Pulados (n√£o no train): {skipped_not_in_train}")
print(f"   ‚ö†Ô∏è  Pulados (sem relevantes): {skipped_no_relevant}")

# Persist (mean, std) of each metric.
metrics_content = {
    'precision': (np.mean(precisions_content), np.std(precisions_content)),
    'recall': (np.mean(recalls_content), np.std(recalls_content)),
    'ndcg': (np.mean(ndcgs_content), np.std(ndcgs_content))
}

with open(CACHE_DIR / 'metrics_content.pkl', 'wb') as f:
    pickle.dump(metrics_content, f)

print(f"\n‚úÖ M√©tricas salvas em: {CACHE_DIR / 'metrics_content.pkl'}")

### Fase 4: Abordagem 3 - Sistema Híbrido Adaptativo

In [None]:
# C√©lula 11: Implementar Sistema H√≠brido com Œ± Adaptativo

def calculate_adaptive_alpha(user_id):
    """
    Compute the collaborative weight (alpha) from the user's rating count.

    With n = number of train ratings of the user:

    - n < ALPHA_MIN_RATINGS: alpha is fixed at 0.3 (content dominates)
    - otherwise: alpha = min(0.9, 0.3 + 0.6 * n / ALPHA_MAX_RATINGS), i.e.
      it grows linearly with n and is capped at 0.9 once n reaches
      ALPHA_MAX_RATINGS (collaborative dominates)

    Args:
        user_id: ID of the user

    Returns:
        float: alpha, between 0.3 and 0.9
    """
    num_ratings = int((train_df['user_id'] == user_id).sum())

    # Too little history to lean on collaborative signals.
    if num_ratings < ALPHA_MIN_RATINGS:
        return 0.3

    return min(0.9, 0.3 + 0.6 * (num_ratings / ALPHA_MAX_RATINGS))


def _min_max_normalize(scores):
    """Rescale scores linearly to [0, 1]; constant input maps to all ones."""
    values = np.array(scores)
    if values.size == 0:
        return values
    lowest, highest = values.min(), values.max()
    if highest > lowest:
        return (values - lowest) / (highest - lowest)
    return np.ones_like(values)


def recommend_hybrid(user_id, k=10, alpha='adaptive', return_scores=False):
    """
    Recommend movies by blending collaborative and content-based scores.

    Both recommenders supply a candidate list; each list's scores are min-max
    normalised to [0, 1] and combined as
    alpha * collaborative + (1 - alpha) * content, where a movie missing from
    one list scores 0 on that side.

    Args:
        user_id: ID of the user
        k: Number of recommendations
        alpha: float weight of the collaborative side, or 'adaptive'
            (default) to derive it from the user's rating count
        return_scores: If True, return (movie_ids, scores)

    Returns:
        list: Recommended movie IDs (or a tuple if return_scores=True)
    """
    if alpha == 'adaptive':
        alpha_value = calculate_adaptive_alpha(user_id)
    else:
        alpha_value = float(alpha)

    # Ask each recommender for at least 50 candidates, and for k when k is
    # larger, so a big k is not silently capped by the candidate pool.
    pool_size = max(50, k)
    recs_colab, scores_colab = recommend_collaborative(
        user_id, k=pool_size, return_scores=True)
    recs_content, scores_content = recommend_content(
        user_id, k=pool_size, return_scores=True)

    # Put both score lists on the same [0, 1] scale before mixing them.
    colab_dict = dict(zip(recs_colab, _min_max_normalize(scores_colab)))
    content_dict = dict(zip(recs_content, _min_max_normalize(scores_content)))

    all_movie_ids = set(recs_colab) | set(recs_content)
    combined_scores = {}
    for movie_id in all_movie_ids:
        score_c = colab_dict.get(movie_id, 0)
        score_ct = content_dict.get(movie_id, 0)
        combined_scores[movie_id] = alpha_value * \
            score_c + (1 - alpha_value) * score_ct

    # Top-K by blended score.
    sorted_movies = sorted(combined_scores.items(),
                           key=lambda x: x[1], reverse=True)
    recommended_movie_ids = [mid for mid, _ in sorted_movies[:k]]

    if return_scores:
        scores = [combined_scores[mid] for mid in recommended_movie_ids]
        return recommended_movie_ids, scores
    return recommended_movie_ids


print("‚úÖ Sistema h√≠brido implementado!")

In [None]:
# Cell 12: Evaluate the hybrid system on the test set

print("üîÑ Avaliando sistema h√≠brido no test set...\n")

precisions_hybrid, recalls_hybrid, ndcgs_hybrid = [], [], []
skipped_no_relevant = 0
skipped_not_in_train = 0

# DCG position discounts 1 / log2(rank + 1) for ranks 1..K.
dcg_discounts = 1.0 / np.log2(np.arange(2, K + 2))

for user_id in tqdm(test_users[:1000], desc="Avaliando"):
    # Users absent from train cannot be evaluated.
    if user_id not in user_to_idx:
        skipped_not_in_train += 1
        continue

    # Ground truth: movies the user liked in the TEST split.
    user_test = test_df[test_df['user_id'] == user_id]
    relevant_items = set(
        user_test[user_test['rating'] >= MIN_RATING_THRESHOLD]['movie_id'].values)

    if len(relevant_items) == 0:
        skipped_no_relevant += 1
        continue

    # Best effort: report a failing user and carry on with the others.
    try:
        recs = recommend_hybrid(user_id, k=K, alpha='adaptive')
    except Exception as e:
        print(f"Erro no usu√°rio {user_id}: {e}")
        continue

    # Precision@K and Recall@K
    hits = len(set(recs) & relevant_items)
    precision = hits / K
    precisions_hybrid.append(precision)

    recall = hits / len(relevant_items)
    recalls_hybrid.append(recall)

    # NDCG@K with binary gains: DCG of the list as recommended, divided by
    # the DCG of an ideal list that ranks min(K, #relevant) relevant movies
    # first. Users without any hit contribute 0 instead of being dropped, so
    # all three metrics are averaged over the same set of users.
    gains = np.array([1.0 if mid in relevant_items else 0.0 for mid in recs[:K]])
    dcg = float(np.sum(gains * dcg_discounts[:len(gains)]))
    ideal_dcg = float(np.sum(dcg_discounts[:min(K, len(relevant_items))]))
    ndcgs_hybrid.append(dcg / ideal_dcg)

print(f"\nüìä Sistema H√≠brido - M√©tricas:")
print(
    f"   - Precision@{K}: {np.mean(precisions_hybrid):.3f} ¬± {np.std(precisions_hybrid):.3f}")
print(
    f"   - Recall@{K}: {np.mean(recalls_hybrid):.3f} ¬± {np.std(recalls_hybrid):.3f}")
print(
    f"   - NDCG@{K}: {np.mean(ndcgs_hybrid):.3f} ¬± {np.std(ndcgs_hybrid):.3f}")
print(f"\n   ‚ÑπÔ∏è  Usu√°rios avaliados: {len(precisions_hybrid)}")
print(f"   ‚ö†Ô∏è  Pulados (n√£o no train): {skipped_not_in_train}")
print(f"   ‚ö†Ô∏è  Pulados (sem relevantes): {skipped_no_relevant}")

# Persist (mean, std) of each metric.
metrics_hybrid = {
    'precision': (np.mean(precisions_hybrid), np.std(precisions_hybrid)),
    'recall': (np.mean(recalls_hybrid), np.std(recalls_hybrid)),
    'ndcg': (np.mean(ndcgs_hybrid), np.std(ndcgs_hybrid))
}

with open(CACHE_DIR / 'metrics_hybrid.pkl', 'wb') as f:
    pickle.dump(metrics_hybrid, f)

print(f"\n‚úÖ M√©tricas salvas em: {CACHE_DIR / 'metrics_hybrid.pkl'}")

In [None]:
# Cell 13: Cold-start analysis (Precision@K vs. number of train ratings)

print("üîÑ Analisando performance em diferentes n√≠veis de experi√™ncia do usu√°rio...\n")

# Inclusive (min, max) bounds on a user's number of train ratings.
rating_buckets = {
    '1-5': (1, 5),
    '6-10': (6, 10),
    '11-20': (11, 20),
    '21-50': (21, 50),
    '50+': (51, np.inf)
}

# method name -> bucket name -> list of per-user Precision@K values
results_by_bucket = {
    method_name: defaultdict(list)
    for method_name in ('Colaborativo', 'Conte√∫do', 'H√≠brido')
}

recommenders = (
    ('Colaborativo', recommend_collaborative),
    ('Conte√∫do', recommend_content),
    ('H√≠brido', recommend_hybrid),
)

for user_id in tqdm(test_users[:500], desc="An√°lise Cold-Start"):  # smaller sample
    user_test = test_df[test_df['user_id'] == user_id]
    liked_mask = user_test['rating'] >= MIN_RATING_THRESHOLD
    relevant_items = set(user_test[liked_mask]['movie_id'].values)

    if not relevant_items:
        continue

    # Find the bucket matching the user's train-set rating count; users with
    # no train ratings match none and are skipped.
    num_ratings = len(train_df[train_df['user_id'] == user_id])
    bucket_name = next(
        (name for name, (min_r, max_r) in rating_buckets.items()
         if min_r <= num_ratings <= max_r),
        None,
    )
    if bucket_name is None:
        continue

    # Score the same user with each of the three recommenders.
    for method_name, method_func in recommenders:
        recs = method_func(user_id, k=K)
        precision = len(set(recs) & relevant_items) / K
        results_by_bucket[method_name][bucket_name].append(precision)

# Mean precision per method and bucket; empty buckets are reported as 0.
avg_results = {
    method: {
        bucket: (np.mean(per_bucket[bucket])
                 if len(per_bucket.get(bucket, ())) > 0 else 0)
        for bucket in rating_buckets
    }
    for method, per_bucket in results_by_bucket.items()
}

# Grouped bar chart: one group per bucket, one bar per method.
fig, ax = plt.subplots(figsize=(12, 6))

x_pos = np.arange(len(rating_buckets))
width = 0.25

palette = (
    ('Colaborativo', 'steelblue'),
    ('Conte√∫do', 'coral'),
    ('H√≠brido', 'mediumseagreen'),
)
for offset, (method, color) in enumerate(palette):
    heights = [avg_results[method][bucket] for bucket in rating_buckets]
    ax.bar(x_pos + offset*width, heights, width,
           label=method, color=color, alpha=0.8)

ax.set_xlabel('N¬∫ de Ratings do Usu√°rio', fontsize=12)
ax.set_ylabel(f'Precision@{K}', fontsize=12)
ax.set_title('An√°lise de Cold-Start: Performance vs Experi√™ncia do Usu√°rio',
             fontsize=14, fontweight='bold')
ax.set_xticks(x_pos + width)
ax.set_xticklabels(rating_buckets.keys())
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'cold_start_analysis.png',
            dpi=150, bbox_inches='tight')
plt.show()

print(
    f"\n‚úÖ An√°lise de cold-start salva em: {RESULTS_DIR / 'cold_start_analysis.png'}")

### Fase 5: Visualizações Comparativas

In [None]:
# Cell 14: t-SNE - 2-D visualization of the movie embeddings, colored by genre

print("üîÑ Aplicando T-SNE nos embeddings (pode demorar ~3-5 min)...\n")

tsne_cache = CACHE_DIR / 'tsne_2d.pkl'

if tsne_cache.exists():
    print("üìÇ Carregando T-SNE do cache...")
    # NOTE(review): pickle executes arbitrary code on load; this is only
    # acceptable because the cache file is written by this same cell.
    with open(tsne_cache, 'rb') as f:
        data = pickle.load(f)
    embeddings_2d = data['embeddings_2d']
    sample_indices = data['sample_indices']
else:
    # Project only a sample (at most 1000 movies) to keep the run time down.
    sample_size = min(1000, len(movie_embeddings))
    # Draw the sample from a seeded generator so the selected movies are
    # reproducible across runs, like the t-SNE fit itself.
    rng = np.random.default_rng(RANDOM_STATE)
    sample_indices = rng.choice(
        len(movie_embeddings), size=sample_size, replace=False)

    # `max_iter` replaces the former `n_iter` argument (renamed in
    # scikit-learn 1.5).
    tsne = TSNE(n_components=2, perplexity=30, max_iter=1000,
                random_state=RANDOM_STATE, verbose=1)
    embeddings_2d = tsne.fit_transform(movie_embeddings[sample_indices])

    # Persist both the projection and the sampled positions: the positions are
    # needed to look the plotted movies up again in movies_df.
    with open(tsne_cache, 'wb') as f:
        pickle.dump({'embeddings_2d': embeddings_2d,
                     'sample_indices': sample_indices}, f)
    print(f"üíæ T-SNE salvo em cache: {tsne_cache}")

# Primary genre = first entry of the '|'-separated genre string.
# Assumes row i of movies_df corresponds to movie_embeddings[i] - TODO confirm.
sample_movies = movies_df.iloc[sample_indices]
primary_genres = sample_movies['genres'].str.split('|').str[0]

# The five most frequent primary genres get their own color
top_genres = primary_genres.value_counts().head(5).index.tolist()
genre_colors = {
    'Drama': '#FF6B6B',
    'Comedy': '#4ECDC4',
    'Action': '#45B7D1',
    'Thriller': '#FFA07A',
    'Romance': '#DDA0DD',
    'Sci-Fi': '#98D8C8',
    'Horror': '#F7DC6F'
}

# Scatter plot
fig, ax = plt.subplots(figsize=(14, 10))

for genre in top_genres:
    # Convert to a plain boolean array: embeddings_2d is indexed by position,
    # not by the pandas index carried by primary_genres.
    mask = (primary_genres == genre).to_numpy()
    color = genre_colors.get(genre, '#CCCCCC')
    ax.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1],
               c=color, label=genre, alpha=0.6, s=50, edgecolors='black', linewidth=0.3)

# Every remaining genre is drawn in gray
mask_others = (~primary_genres.isin(top_genres)).to_numpy()
ax.scatter(embeddings_2d[mask_others, 0], embeddings_2d[mask_others, 1],
           c='#CCCCCC', label='Outros', alpha=0.3, s=30)

ax.set_title('T-SNE: Embeddings de Filmes Agrupados por G√™nero',
             fontsize=16, fontweight='bold')
ax.set_xlabel('Dimens√£o 1', fontsize=12)
ax.set_ylabel('Dimens√£o 2', fontsize=12)
ax.legend(fontsize=10, loc='best')
ax.grid(True, alpha=0.2)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'tsne_movies_by_genre.png',
            dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ T-SNE salvo em: {RESULTS_DIR / 'tsne_movies_by_genre.png'}")

In [None]:
# Cell 15: Side-by-side bar charts comparing the three approaches

# Load the cached metrics of each method.
# NOTE(review): pickle executes arbitrary code on load; only use cache files
# produced by this notebook.
with open(CACHE_DIR / 'metrics_collaborative.pkl', 'rb') as f:
    metrics_collaborative = pickle.load(f)
with open(CACHE_DIR / 'metrics_content.pkl', 'rb') as f:
    metrics_content = pickle.load(f)
with open(CACHE_DIR / 'metrics_hybrid.pkl', 'rb') as f:
    metrics_hybrid = pickle.load(f)

# Element [0] of each metric entry is plotted as the mean
# (presumably a (mean, std) pair - TODO confirm against the cell that saves it).
methods = ['Colaborativo', 'Conte√∫do', 'H√≠brido']
all_metrics = (metrics_collaborative, metrics_content, metrics_hybrid)
precision_means = [m['precision'][0] for m in all_metrics]
recall_means = [m['recall'][0] for m in all_metrics]
ndcg_means = [m['ndcg'][0] for m in all_metrics]

# One panel per metric, all drawn by the same loop
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
colors = ['steelblue', 'coral', 'mediumseagreen']
panels = [
    (f'Precision@{K}', precision_means),
    (f'Recall@{K}', recall_means),
    (f'NDCG@{K}', ndcg_means),
]

for ax, (panel_title, values) in zip(axes, panels):
    bars = ax.bar(methods, values, color=colors,
                  alpha=0.8, edgecolor='black')
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')

    # Leave 20% headroom above the tallest bar; fall back to 1.0 when every
    # score is zero (or NaN) so the axis never collapses to (0, 0).
    y_top = max(values) * 1.2
    if not y_top > 0:
        y_top = 1.0
    ax.set_ylim(0, y_top)
    ax.grid(axis='y', alpha=0.3)

    # The label offset is relative to the axis height, so labels stay inside
    # the panel even when the scores are very small.
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, val + 0.02 * y_top, f'{val:.3f}',
                ha='center', fontsize=11, fontweight='bold')

plt.suptitle('Compara√ß√£o de Performance: 3 Abordagens de Recomenda√ß√£o',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'metrics_comparison.png',
            dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Compara√ß√£o salva em: {RESULTS_DIR / 'metrics_comparison.png'}")
print("\n" + "="*60)
print("RESUMO DAS M√âTRICAS:")
print("="*60)
for method, p, r, n in zip(methods, precision_means, recall_means, ndcg_means):
    print(f"{method:15} | Precision: {p:.3f} | Recall: {r:.3f} | NDCG: {n:.3f}")
print("="*60)

## PARTE 2: Sistema Interativo com Jupyter Widgets

### Interface de Recomendação Personalizada

In [None]:
# Cell 16: Movie selection interface

# Sorted unique titles feed the dropdowns; the dict maps a title back to its id
all_movie_titles = sorted(movies_df['title'].unique().tolist())
movie_title_to_id = dict(zip(movies_df['title'], movies_df['movie_id']))

# Global state with the user's selections, keyed by selector row index
user_selection_state = {'movies': {}}

print("\n".join([
    "üé¨ SISTEMA INTERATIVO DE RECOMENDA√á√ÉO DE FILMES",
    "="*60,
    "\nüìù Instru√ß√µes:",
    "1. Selecione 5-10 filmes que voc√™ j√° assistiu",
    "2. D√™ uma nota de 1-5 estrelas para cada filme",
    "3. Use os controles abaixo para personalizar as recomenda√ß√µes",
    "="*60 + "\n",
]))


def make_on_change_callback(slider, idx):
    """Return the dropdown observer for selector row `idx`.

    The observer enables `slider` and records the chosen movie when a title is
    picked, and disables it and forgets the row when the empty entry is chosen.
    """
    def on_change(change):
        chosen_title = change['new']
        if chosen_title == '':
            slider.disabled = True
            user_selection_state['movies'].pop(idx, None)
            return

        slider.disabled = False
        # Store the selection together with the slider's current rating
        movie_id = movie_title_to_id[chosen_title]
        user_selection_state['movies'][idx] = {
            'title': chosen_title,
            'movie_id': movie_id,
            'rating': slider.value
        }
    return on_change


def make_rating_callback(dropdown, idx):
    """Return the slider observer for selector row `idx`.

    The observer updates the stored rating, but only while row `idx` has a
    movie selected. `dropdown` is accepted but not used.
    """
    def on_rating_change(change):
        selection = user_selection_state['movies'].get(idx)
        if selection is not None:
            selection['rating'] = change['new']
    return on_rating_change


# Build ten (dropdown, rating slider) rows
movie_widgets = []

for i in range(10):
    # Movie picker; the empty first option means "nothing selected"
    movie_dropdown = widgets.Dropdown(
        description=f'Filme {i+1}:',
        options=[''] + all_movie_titles,
        value='',
        style={'description_width': '70px'},
        layout=widgets.Layout(width='500px')
    )

    # Rating slider, disabled until a movie is selected in this row
    rating_slider = widgets.IntSlider(
        description='Nota:',
        value=4,
        min=1,
        max=5,
        step=1,
        disabled=True,
        style={'description_width': '50px'},
        layout=widgets.Layout(width='300px')
    )

    movie_dropdown.observe(
        make_on_change_callback(rating_slider, i), names='value')
    rating_slider.observe(
        make_rating_callback(movie_dropdown, i), names='value')

    movie_widgets.append(widgets.HBox([movie_dropdown, rating_slider]))

# Stack all selector rows vertically inside a bordered box
movie_selection_box = widgets.VBox(
    movie_widgets,
    layout=widgets.Layout(border='2px solid #ccc',
                          padding='10px', margin='10px')
)

display(widgets.HTML("<h3>üìΩÔ∏è Selecione os Filmes que Voc√™ J√° Assistiu:</h3>"))
display(movie_selection_box)

In [None]:
# Cell 17: Recommender system controls

# Which recommender to run
method_selector = widgets.RadioButtons(
    description='M√©todo:',
    options=['Colaborativo', 'Conte√∫do', 'H√≠brido'],
    value='H√≠brido',
    style={'description_width': '80px'}
)

# Manual alpha (collaborative weight), only meaningful for the hybrid method.
# It starts disabled because the adaptive checkbox below starts checked.
alpha_slider = widgets.FloatSlider(
    description='Œ± (Colab):',
    value=0.7,
    min=0.0,
    max=1.0,
    step=0.05,
    readout_format='.2f',
    disabled=True,
    layout=widgets.Layout(width='400px'),
    style={'description_width': '80px'}
)

alpha_label = widgets.HTML(
    value="<i>Œ± = 0: s√≥ conte√∫do | Œ± = 1: s√≥ colaborativo | Œ± = adaptativo (recomendado)</i>"
)

# Toggle between the adaptive alpha and the manual slider
adaptive_alpha_checkbox = widgets.Checkbox(
    description='Usar Œ± adaptativo',
    value=True,
    indent=False
)


def on_adaptive_change(change):
    """Disable the manual alpha slider whenever adaptive alpha is checked."""
    alpha_slider.disabled = change['new']


adaptive_alpha_checkbox.observe(on_adaptive_change, names='value')

# Number of recommendations to return
k_slider = widgets.IntSlider(
    description='Top-K:',
    value=10,
    min=5,
    max=20,
    step=1,
    layout=widgets.Layout(width='400px'),
    style={'description_width': '80px'}
)

# Group all controls in one bordered panel
controls_box = widgets.VBox(
    [
        widgets.HTML("<h3>‚öôÔ∏è Configura√ß√µes de Recomenda√ß√£o:</h3>"),
        method_selector,
        widgets.HTML("<br><b>Par√¢metros do H√≠brido:</b>"),
        adaptive_alpha_checkbox,
        alpha_slider,
        alpha_label,
        widgets.HTML("<br><b>N√∫mero de Recomenda√ß√µes:</b>"),
        k_slider,
    ],
    layout=widgets.Layout(border='2px solid #ccc', padding='10px', margin='10px')
)

display(controls_box)

In [None]:
# Cell 18: Interactive recommendation generation

# Helper functions for interactive recommendation
def create_temp_user_profile():
    """Build the temporary user's ratings from the current widget selections.

    Returns:
        dict mapping movie_id -> rating, read from
        ``user_selection_state['movies']``. When the same movie is selected in
        several rows, the row visited last provides the rating.
    """
    selections = user_selection_state['movies'].values()
    return {entry['movie_id']: entry['rating'] for entry in selections}


def recommend_interactive(method, k, alpha_val):
    """Recommend movies for the current widget selections.

    Args:
        method: name of the recommender; 'Colaborativo' and 'Conte√∫do' select
            the collaborative and content recommenders, any other value
            selects the hybrid one.
        k: number of recommendations requested.
        alpha_val: collaborative weight, forwarded to the hybrid recommender
            only.

    Returns:
        Whatever the selected recommender returns (movie ids and their
        scores), or two empty lists when nothing is selected.
    """
    user_ratings = create_temp_user_profile()
    if not user_ratings:
        return [], []

    if method == 'Colaborativo':
        return recommend_collab_interactive(user_ratings, k)
    if method == 'Conte√∫do':
        return recommend_content_interactive(user_ratings, k)
    # Every other method name falls through to the hybrid recommender.
    return recommend_hybrid_interactive(user_ratings, k, alpha_val)


def recommend_collab_interactive(user_ratings, k):
    """Item-based collaborative recommendations for a temporary user.

    Each rated movie votes for its stored neighbours with a weight of
    ``rating * similarity``; votes are summed per candidate movie.

    Args:
        user_ratings: dict mapping movie_id -> rating.
        k: maximum number of recommendations.

    Returns:
        (movie_ids, scores): two parallel lists ordered by descending score.
        Movies the user already rated are never recommended.
    """
    scores_by_movie = defaultdict(float)

    for rated_id, rating in user_ratings.items():
        # Rated movies unknown to the similarity index contribute nothing.
        if rated_id not in movie_to_idx:
            continue
        rated_idx = movie_to_idx[rated_id]
        if rated_idx not in item_similarity_topk:
            continue

        neighbours = item_similarity_topk[rated_idx]
        for nb_idx, nb_sim in zip(neighbours['indices'], neighbours['scores']):
            candidate_id = idx_to_movie[nb_idx]
            if candidate_id in user_ratings:
                continue
            scores_by_movie[candidate_id] += rating * nb_sim

    ranked = sorted(scores_by_movie.items(),
                    key=lambda pair: pair[1], reverse=True)[:k]
    return [mid for mid, _ in ranked], [score for _, score in ranked]


def recommend_content_interactive(user_ratings, k):
    """Content-based recommendations for a temporary user.

    Ranks every movie embedding by cosine similarity to the profile vector
    returned by ``build_user_profile_content``.

    Args:
        user_ratings: dict mapping movie_id -> rating.
        k: number of recommendations.

    Returns:
        (movie_ids, similarities): list of recommended movie ids and the array
        of their similarity scores, best first.
    """
    profile_vector = build_user_profile_content(user_ratings)
    similarities = cosine_similarity([profile_vector], movie_embeddings)[0]

    # Push already-rated movies to the very bottom of the ranking
    for rated_id in user_ratings:
        if rated_id in movie_id_to_emb_idx:
            similarities[movie_id_to_emb_idx[rated_id]] = -np.inf

    descending = np.argsort(similarities)[::-1]
    top_k_indices = descending[:k]

    # Assumes embedding position i corresponds to row i of movies_df
    # - TODO confirm against the cell that builds the embeddings.
    recommended_movie_ids = []
    for position in top_k_indices:
        recommended_movie_ids.append(movies_df.iloc[position]['movie_id'])

    return recommended_movie_ids, similarities[top_k_indices]


def recommend_hybrid_interactive(user_ratings, k, alpha_val):
    """Hybrid recommendations for a temporary user.

    Takes the top 50 candidates of the collaborative and of the content
    recommender, min-max normalizes each score list separately and blends them
    as ``alpha_val * collaborative + (1 - alpha_val) * content``. A candidate
    missing from one list scores 0 on that side.

    Args:
        user_ratings: dict mapping movie_id -> rating.
        k: number of recommendations to return.
        alpha_val: weight of the collaborative score.

    Returns:
        (movie_ids, scores): two parallel lists ordered by descending blended
        score.
    """
    def _min_max(raw_scores):
        # Scale scores into [0, 1]; a constant list maps to all ones and an
        # empty list to an empty array.
        if len(raw_scores) == 0:
            return np.array([])
        values = np.array(raw_scores)
        low, high = values.min(), values.max()
        if high > low:
            return (values - low) / (high - low)
        return np.ones_like(values)

    collab_ids, collab_scores = recommend_collab_interactive(user_ratings, k=50)
    content_ids, content_scores = recommend_content_interactive(
        user_ratings, k=50)

    collab_norm = dict(zip(collab_ids, _min_max(collab_scores)))
    content_norm = dict(zip(content_ids, _min_max(content_scores)))

    # Blend over the union of both candidate lists
    combined_scores = {}
    for movie_id in set(collab_ids) | set(content_ids):
        score_c = collab_norm.get(movie_id, 0)
        score_ct = content_norm.get(movie_id, 0)
        combined_scores[movie_id] = alpha_val * \
            score_c + (1 - alpha_val) * score_ct

    ranked = sorted(combined_scores.items(),
                    key=lambda pair: pair[1], reverse=True)[:k]
    return [mid for mid, _ in ranked], [score for _, score in ranked]


def format_recommendations_html(recs, scores, method_name):
    """Render recommendations as an HTML table.

    Args:
        recs: iterable of movie ids, best first; each id must exist in
            ``movies_df['movie_id']``.
        scores: iterable of numeric scores, parallel to ``recs``.
        method_name: recommender name shown in the heading.

    Returns:
        str: an ``<h2>`` heading followed by a table with one row per
        recommendation (rank, title, genres, score with three decimals).

    Raises:
        IndexError: if a movie id is not present in ``movies_df``.
    """
    # Local import: the file's top-level import block does not provide it.
    from html import escape

    def text(value):
        # Titles, genres and the method name are data, not markup: escape the
        # characters that would otherwise be parsed as HTML.
        return escape(str(value), quote=False)

    parts = [
        f"<h2>üéØ Recomenda√ß√µes - M√©todo: {text(method_name)}</h2>",
        "<table style='border-collapse: collapse; width: 100%;'>",
        "<tr style='background-color: #f0f0f0; font-weight: bold;'>",
        "<th style='border: 1px solid #ddd; padding: 8px;'>Rank</th>",
        "<th style='border: 1px solid #ddd; padding: 8px;'>T√≠tulo</th>",
        "<th style='border: 1px solid #ddd; padding: 8px;'>G√™neros</th>",
        "<th style='border: 1px solid #ddd; padding: 8px;'>Score</th>",
        "</tr>",
    ]

    for i, (movie_id, score) in enumerate(zip(recs, scores), 1):
        movie_info = movies_df[movies_df['movie_id'] == movie_id].iloc[0]
        title = text(movie_info['title'])
        genres = text(movie_info['genres'])
        # Alternate the row background for readability
        row_color = '#ffffff' if i % 2 == 0 else '#f9f9f9'

        parts.extend([
            f"<tr style='background-color: {row_color};'>",
            f"<td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{i}</td>",
            f"<td style='border: 1px solid #ddd; padding: 8px;'><b>{title}</b></td>",
            f"<td style='border: 1px solid #ddd; padding: 8px;'><i>{genres}</i></td>",
            f"<td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{score:.3f}</td>",
            "</tr>",
        ])

    parts.append("</table>")
    return "".join(parts)


# Output area that receives the messages and the rendered recommendation table
output_recs = widgets.Output()


def on_generate_click(b):
    """Button handler: validate the selection and show the recommendations.

    Everything is written into ``output_recs``, which is cleared first. At
    least three selector rows must be filled. With the hybrid method and
    adaptive alpha enabled, alpha is ``min(0.9, 0.3 + 0.6 * n / 10)`` for ``n``
    filled rows.
    """
    with output_recs:
        output_recs.clear_output()

        # Validate the selection
        num_selected = len(user_selection_state['movies'])
        if num_selected < 3:
            print("‚ö†Ô∏è Por favor, selecione pelo menos 3 filmes!")
            return

        # Read the current settings
        method = method_selector.value
        k = k_slider.value
        use_adaptive = adaptive_alpha_checkbox.value
        alpha_val = 'adaptive' if use_adaptive else alpha_slider.value

        # Adaptive alpha is only resolved to a number for the hybrid method;
        # the other methods do not use alpha.
        if use_adaptive and method == 'H√≠brido':
            alpha_val = min(0.9, 0.3 + 0.6 * (num_selected / 10))
            print(
                f"‚ÑπÔ∏è Œ± adaptativo calculado: {alpha_val:.2f} (baseado em {num_selected} filmes)")

        print(f"üîÑ Gerando recomenda√ß√µes com m√©todo '{method}'...\n")
        recs, scores = recommend_interactive(method, k, alpha_val)

        if len(recs) == 0:
            print("‚ùå N√£o foi poss√≠vel gerar recomenda√ß√µes. Tente selecionar mais filmes.")
            return

        display(HTML(format_recommendations_html(recs, scores, method)))


# Button that triggers the generation
generate_btn = widgets.Button(
    description='üé¨ Gerar Recomenda√ß√µes',
    layout=widgets.Layout(width='300px', height='50px'),
    button_style='success'
)
generate_btn.on_click(on_generate_click)

# Show heading, button and output area, in that order
display(widgets.HTML("<h3>üöÄ Gerar Recomenda√ß√µes:</h3>"))
display(generate_btn)
display(output_recs)

In [None]:
# Cell 19: Tabbed comparison (the three methods side by side)

# One output area per method, shown as the pages of a Tab widget
output_tab1 = widgets.Output()
output_tab2 = widgets.Output()
output_tab3 = widgets.Output()

tab = widgets.Tab()
tab.children = [output_tab1, output_tab2, output_tab3]
tab.titles = ['Colaborativo', 'Conte√∫do', 'H√≠brido']


def on_compare_click(b):
    """Button handler: run the three recommenders and fill one tab with each.

    At least three selector rows must be filled. The hybrid method always uses
    the adaptive alpha ``min(0.9, 0.3 + 0.6 * n / 10)`` for ``n`` filled rows.
    """
    num_selected = len(user_selection_state['movies'])
    if num_selected < 3:
        # Write the warning into the tabs: text printed from a widget callback
        # outside an Output context is not reliably shown by the frontend.
        # Clearing also removes results that no longer match the selection.
        for output_widget in (output_tab1, output_tab2, output_tab3):
            with output_widget:
                output_widget.clear_output()
                print("‚ö†Ô∏è Por favor, selecione pelo menos 3 filmes!")
        return

    k = k_slider.value
    alpha_val = min(0.9, 0.3 + 0.6 * (num_selected / 10))  # always adaptive

    # (method name, target tab, callable taking (user_ratings, k))
    methods_configs = [
        ('Colaborativo', output_tab1, recommend_collab_interactive),
        ('Conte√∫do', output_tab2, recommend_content_interactive),
        ('H√≠brido', output_tab3,
         lambda ur, k: recommend_hybrid_interactive(ur, k, alpha_val))
    ]

    user_ratings = create_temp_user_profile()

    for method_name, output_widget, func in methods_configs:
        with output_widget:
            output_widget.clear_output()
            recs, scores = func(user_ratings, k)
            if len(recs) > 0:
                display(HTML(format_recommendations_html(
                    recs, scores, method_name)))
            else:
                print(
                    f"‚ùå N√£o foi poss√≠vel gerar recomenda√ß√µes com {method_name}")


compare_btn = widgets.Button(
    description='üìä Comparar os 3 M√©todos',
    button_style='info',
    layout=widgets.Layout(width='300px', height='50px')
)
compare_btn.on_click(on_compare_click)

display(widgets.HTML("<h3>üìä Compara√ß√£o dos 3 M√©todos:</h3>"))
display(compare_btn)
display(tab)

In [None]:
# Cell 20: Exploring similar movies

# Output area for the details of the movie being explored
output_explore = widgets.Output()

# Starts without options; they are filled in once recommendations are loaded
explore_dropdown = widgets.Dropdown(
    description='Explorar:',
    options=[],
    style={'description_width': '70px'},
    layout=widgets.Layout(width='600px')
)


def get_similar_movies_by_content(movie_id, k=5):
    """Find the movies whose embeddings are closest to a given movie.

    Args:
        movie_id: id of the reference movie.
        k: maximum number of similar movies to return.

    Returns:
        list of (title, genres, similarity) tuples ordered by descending
        cosine similarity; empty when ``movie_id`` has no embedding. The
        reference movie itself is never included.
    """
    if movie_id not in movie_id_to_emb_idx:
        return []

    emb_idx = movie_id_to_emb_idx[movie_id]
    movie_emb = movie_embeddings[emb_idx]

    similarities = cosine_similarity([movie_emb], movie_embeddings)[0]
    similarities[emb_idx] = -np.inf  # rank the movie itself last

    # Never take more than the other movies available, otherwise the masked
    # reference movie would be returned with a similarity of -inf.
    limit = min(k, len(similarities) - 1)
    top_k_indices = np.argsort(similarities)[::-1][:limit]

    similar_movies = []
    for idx in top_k_indices:
        # Assumes embedding position idx corresponds to row idx of movies_df
        # - TODO confirm against the cell that builds the embeddings.
        row = movies_df.iloc[idx]
        similar_movies.append((row['title'], row['genres'], similarities[idx]))

    return similar_movies


def on_explore_change(change):
    """Dropdown observer: show the selected movie and its five nearest movies.

    Does nothing for the empty entry. All text is written into
    ``output_explore``, which is cleared first.
    """
    selected_title = change['new']
    if selected_title == '':
        return

    with output_explore:
        output_explore.clear_output()

        # Resolve the title back to a movie id
        movie_id = movie_title_to_id.get(selected_title)
        if movie_id is None:
            print("‚ùå Filme n√£o encontrado")
            return

        # Header with the selected movie's details
        movie_info = movies_df[movies_df['movie_id'] == movie_id].iloc[0]
        separator = '=' * 60

        print(f"üé¨ Filme Selecionado: {movie_info['title']}")
        print(f"üìÇ G√™neros: {movie_info['genres']}")
        print(f"\n{separator}")
        print("üîç Filmes Similares por Conte√∫do:")
        print(separator + "\n")

        neighbours = get_similar_movies_by_content(movie_id, k=5)
        for rank, (title, genres, score) in enumerate(neighbours, 1):
            print(f"{rank}. {title}")
            print(f"   G√™neros: {genres}")
            print(f"   Similaridade: {score:.3f}\n")


explore_dropdown.observe(on_explore_change, names='value')

# Helper that loads generated recommendations into the exploration dropdown


def update_explore_dropdown(recommended_titles):
    """Replace the exploration dropdown's options.

    Args:
        recommended_titles: iterable of movie titles (list, tuple, generator,
            ...). An empty first entry meaning "nothing selected" is always
            prepended.
    """
    explore_dropdown.options = ['', *recommended_titles]


# Button that loads the current recommendations into the exploration dropdown
update_btn = widgets.Button(
    description='üìã Carregar √öltimas Recomenda√ß√µes',
    button_style='warning',
    layout=widgets.Layout(width='300px')
)


def on_update_explore(b):
    """Button handler: fill the exploration dropdown with recommendations.

    The recommendations are recomputed with the settings currently shown in
    the controls (method, Top-K, alpha), using the same adaptive-alpha formula
    as the generate button, so the dropdown matches the table the user sees.
    Messages are written into ``output_explore``.
    """
    with output_explore:
        output_explore.clear_output()

        user_ratings = create_temp_user_profile()
        if len(user_ratings) < 3:
            print("‚ö†Ô∏è Gere recomenda√ß√µes primeiro!")
            return

        # Same alpha rule as on_generate_click: adaptive alpha grows with the
        # number of filled selector rows, otherwise the slider value is used.
        if adaptive_alpha_checkbox.value:
            num_selected = len(user_selection_state['movies'])
            alpha_val = min(0.9, 0.3 + 0.6 * (num_selected / 10))
        else:
            alpha_val = alpha_slider.value

        recs, _ = recommend_interactive(
            method_selector.value, k_slider.value, alpha_val)
        rec_titles = [movies_df[movies_df['movie_id']
                                == mid].iloc[0]['title'] for mid in recs]
        update_explore_dropdown(rec_titles)
        print(f"‚úÖ {len(rec_titles)} filmes carregados para explora√ß√£o!")


update_btn.on_click(on_update_explore)

display(widgets.HTML("<h3>üîç Explorar Filmes Similares:</h3>"))
display(update_btn)
display(explore_dropdown)
display(output_explore)

In [None]:
# Cell 21: User profile analysis

# Output area that receives the profile charts and the textual advice
output_profile = widgets.Output()


def analyze_user_profile():
    """Plot and summarize the profile implied by the current selections.

    Writes into ``output_profile`` (cleared first): a bar chart of the average
    rating per genre (top 8), a histogram of the ratings, a statistics panel,
    and a textual suggestion of which method and alpha to use. Shows a warning
    instead when no movie is selected.
    """
    user_ratings = create_temp_user_profile()

    with output_profile:
        output_profile.clear_output()

        # Emit the warning inside the Output context: text printed from a
        # widget callback outside of it is not reliably shown by the frontend.
        if len(user_ratings) == 0:
            print("‚ö†Ô∏è Selecione alguns filmes primeiro!")
            return

        # Collect the ratings overall and per genre; a movie counts towards
        # every genre in its '|'-separated genre string.
        genre_ratings = defaultdict(list)
        all_ratings = []

        for movie_id, rating in user_ratings.items():
            all_ratings.append(rating)
            movie_info = movies_df[movies_df['movie_id'] == movie_id].iloc[0]
            genres = movie_info['genres'].split('|')
            for genre in genres:
                genre_ratings[genre].append(rating)

        # Average rating per genre, best first
        genre_avg = {g: np.mean(ratings)
                     for g, ratings in genre_ratings.items()}
        genre_avg_sorted = sorted(
            genre_avg.items(), key=lambda x: x[1], reverse=True)

        # Three panels side by side
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # 1. Favorite genres (top 8 by average rating)
        ax = axes[0]
        top_genres = genre_avg_sorted[:8]
        genres_names = [g for g, _ in top_genres]
        genres_scores = [s for _, s in top_genres]

        colors_palette = plt.cm.viridis(np.linspace(0, 1, len(genres_names)))
        ax.barh(genres_names, genres_scores,
                color=colors_palette, edgecolor='black')
        ax.set_xlabel('Nota M√©dia', fontsize=12)
        ax.set_title('Seus G√™neros Preferidos', fontsize=14, fontweight='bold')
        ax.set_xlim(0, 5.5)
        ax.invert_yaxis()  # best genre at the top
        ax.grid(axis='x', alpha=0.3)

        # 2. Rating distribution; bin edges are centered on the integers 1..5
        ax = axes[1]
        bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
        ax.hist(all_ratings, bins=bins, color='steelblue',
                edgecolor='black', alpha=0.7)
        ax.set_xlabel('Nota', fontsize=12)
        ax.set_ylabel('Frequ√™ncia', fontsize=12)
        ax.set_title('Distribui√ß√£o das Suas Notas',
                     fontsize=14, fontweight='bold')
        ax.set_xticks([1, 2, 3, 4, 5])
        ax.grid(axis='y', alpha=0.3)

        # 3. Statistics summary, rendered as text on an axis-less panel
        ax = axes[2]
        ax.axis('off')

        stats_text = f"""
        ESTAT√çSTICAS DO PERFIL
        {'='*40}
        
        üìä Filmes Avaliados: {len(user_ratings)}
        
        ‚≠ê Nota M√©dia: {np.mean(all_ratings):.2f}
        üìà Nota M√°xima: {int(np.max(all_ratings))}
        üìâ Nota M√≠nima: {int(np.min(all_ratings))}
        üìè Desvio Padr√£o: {np.std(all_ratings):.2f}
        
        üé≠ G√™neros √önicos: {len(genre_ratings)}
        
        ‚ù§Ô∏è  G√™nero Favorito:
        {genre_avg_sorted[0][0]} ({genre_avg_sorted[0][1]:.2f})
        
        üòê G√™nero Menos Favorito:
        {genre_avg_sorted[-1][0]} ({genre_avg_sorted[-1][1]:.2f})
        """

        ax.text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
                family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

        plt.tight_layout()
        plt.show()

        # Suggested alpha: grows with the number of rated movies, capped at 0.9
        num_ratings = len(user_ratings)
        alpha_rec = min(0.9, 0.3 + 0.6 * (num_ratings / 10))

        print("\n" + "="*60)
        print("üí° RECOMENDA√á√ÉO PERSONALIZADA:")
        print("="*60)

        if num_ratings < 5:
            print(f"‚ú® Voc√™ tem poucos filmes avaliados ({num_ratings}).")
            print(
                f"   Recomendamos usar 'Conte√∫do' ou 'H√≠brido' com Œ±={alpha_rec:.2f}")
            print("   Isso favorecer√° recomenda√ß√µes baseadas em g√™neros similares.")
        elif num_ratings < 10:
            print(
                f"üëç Voc√™ tem um n√∫mero moderado de avalia√ß√µes ({num_ratings}).")
            print(f"   O sistema 'H√≠brido' com Œ±={alpha_rec:.2f} √© ideal!")
            print("   Balanceia bem suas prefer√™ncias com padr√µes da comunidade.")
        else:
            print(f"üåü Excelente! Voc√™ tem {num_ratings} avalia√ß√µes.")
            print(
                f"   O sistema 'H√≠brido' com Œ±={alpha_rec:.2f} aproveitar√° bem seus dados!")
            print("   Favorecer√° recomenda√ß√µes colaborativas (sabedoria coletiva).")

        print("="*60)


# Button that triggers the profile analysis; the click event is not needed
analyze_btn = widgets.Button(
    description='üìä Analisar Meu Perfil',
    layout=widgets.Layout(width='300px', height='50px'),
    button_style='primary'
)
analyze_btn.on_click(lambda _button: analyze_user_profile())

# Show heading, button and output area, in that order
display(widgets.HTML("<h3>üë§ An√°lise de Perfil do Usu√°rio:</h3>"))
display(analyze_btn)
display(output_profile)

print("\nüéâ Sistema Interativo Completo! Explore as funcionalidades acima.")