In [None]:
# 3_Modelo_Conteudo.ipynb  
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from scipy.sparse import hstack

In [2]:
# Directory with the preprocessed datasets, relative to this notebook.
DATA = Path("../data/processed")
# Cleaned movie catalogue; per the original author's note it already
# contains one-hot encoded genre columns.
movies = pd.read_csv(DATA.joinpath("movies_clean.csv"))

# Every column that is not one of these metadata fields is treated as a
# genre flag (adjust the list if the column names change).
non_genre_cols = pd.Index(["movieId", "title", "avg_rating", "genres_list"])
genre_cols = movies.columns.difference(non_genre_cols)
# Genre block of the feature matrix, cast to float32.
genre_mat = movies.loc[:, genre_cols].to_numpy().astype(np.float32)

In [3]:
# Text features: TF-IDF weights over the words of each title, with English
# stop words removed.
tfidf = TfidfVectorizer(stop_words="english")
title_tfidf = tfidf.fit_transform(movies["title"])

# Final content matrix: genre columns first, then the title TF-IDF columns,
# stored as CSR.
feature_blocks = (genre_mat, title_tfidf)
content_mat = hstack(feature_blocks).tocsr()


In [4]:
# Pairwise cosine similarity between all rows of the content matrix, kept
# sparse. Original author's estimate: ~800 MB dense vs ~70 MB sparse
# (NOTE(review): figures not verified here).
sim_matrix = cosine_similarity(X=content_mat, Y=None, dense_output=False)

def recommend_movies(title, n=5):
    """Return the ``n`` movies whose content vectors are closest to ``title``.

    Reads the module-level ``movies`` frame and ``sim_matrix`` similarity
    matrix; row ``i`` of ``sim_matrix`` must correspond to row ``i`` of
    ``movies``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``movies["title"]``. If several rows match,
        the first one is used as the source movie.
    n : int, default 5
        Maximum number of recommendations. Values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        A one-column (``title``) frame ordered from most to least similar,
        or a "not found" message string when ``title`` is absent (string
        return kept for backward compatibility with existing callers).
    """
    # Use the row *position* of the source movie, not its index label:
    # both sim_matrix and .iloc are positional, so this stays correct even
    # if `movies` does not carry a default RangeIndex.
    matches = np.flatnonzero((movies["title"] == title).to_numpy())
    if matches.size == 0:
        return f"'{title}' não encontrado."
    src = int(matches[0])

    sim_scores = sim_matrix[src].toarray().ravel()

    # Stable descending sort so ties are resolved deterministically (lower
    # row position first).
    order = np.argsort(-sim_scores, kind="stable")
    # Drop the source movie explicitly instead of assuming it is ranked
    # first: another movie with an identical feature vector ties with it at
    # the top score and could otherwise push it into the results.
    order = order[order != src]
    top = order[: max(int(n), 0)]
    return movies.iloc[top][["title"]]

# Example query: the five titles most similar to Toy Story.
recommend_movies(title="Toy Story (1995)", n=5)


Unnamed: 0,title
2599,Toy Story 2 (1999)
8168,Toy Story 3 (2010)
10310,Toy Story 4 (2019)
9920,Moana (2016)
7232,Shrek the Third (2007)
