In [1]:
# Install the kaggle package (if needed). The %pip magic installs into the
# environment of the running kernel, whereas !pip shells out to whichever
# pip happens to be first on PATH.
# NOTE(review): no visible cell imports or calls kaggle - confirm it is still required.
%pip install kaggle --quiet
import os
import pandas as pd
from pathlib import Path

# Read the movie catalogue from the CSV file in the working directory.
movies = pd.read_csv(filepath_or_buffer='movies.csv')
# Preview the first five rows.
movies.head(n=5)


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Unnamed: 0,rank,movie_id,title,year,link,imbd_votes,imbd_rating,certificate,duration,genre,...,director_id,director_name,writer_id,writer_name,storyline,user_id,user_name,review_id,review_title,review_content
0,1,tt0111161,The Shawshank Redemption,1994,https://www.imdb.com/title/tt0111161,2711075,9.3,R,2h 22m,Drama,...,nm0001104,Frank Darabont,"nm0000175,nm0001104","Stephen King,Frank Darabont","Over the course of several years, two convicts...","ur16161013,ur15311310,ur0265899,ur16117882,ur1...","hitchcockthelegend,Sleepin_Dragon,EyeDunno,ale...","rw2284594,rw6606154,rw1221355,rw1822343,rw1288...","Some birds aren't meant to be caged.,An incred...",The Shawshank Redemption is written and direct...
1,2,tt0068646,The Godfather,1972,https://www.imdb.com/title/tt0068646,1882829,9.2,R,2h 55m,"Crime,Drama",...,nm0000338,Francis Ford Coppola,"nm0701374,nm0000338","Mario Puzo,Francis Ford Coppola",The aging patriarch of an organized crime dyna...,"ur24740649,ur86182727,ur15794099,ur15311310,ur...","CalRhys,andrewburgereviews,gogoschka-1,Sleepin...","rw3038370,rw4756923,rw4059579,rw6568526,rw1897...","The Pinnacle Of Flawless Films!,An offer so go...",'The Godfather' is the pinnacle of flawless fi...
2,3,tt0468569,The Dark Knight,2008,https://www.imdb.com/title/tt0468569,2684051,9.0,PG-13,2h 32m,"Action,Crime,Drama",...,nm0634240,Christopher Nolan,"tt0468569,nm0634300,nm0634240,nm0275286,tt0468569","Writers,Jonathan Nolan,Christopher Nolan,David...",When the menace known as the Joker wreaks havo...,"ur87850731,ur1293485,ur129557514,ur12449122,ur...","MrHeraclius,Smells_Like_Cheese,dseferaj,little...","rw5478826,rw1914442,rw6606026,rw1917099,rw5170...","The Dark Knight,The Batman of our dreams! So m...","Confidently directed, dark, brooding, and pack..."
3,4,tt0071562,The Godfather Part II,1974,https://www.imdb.com/title/tt0071562,1285350,9.0,R,3h 22m,"Crime,Drama",...,nm0000338,Francis Ford Coppola,"nm0000338,nm0701374","Francis Ford Coppola,Mario Puzo",The early life and career of Vito Corleone in ...,"ur0176092,ur0688559,ur92260614,ur0200644,ur117...","Nazi_Fighter_David,tfrizzell,umunir-36959,DanB...","rw0135607,rw0135487,rw5049900,rw0135526,rw0135...",Breathtaking in its scope and tragic grandeur....,"Coppola's masterpiece is rivaled only by ""The ..."
4,5,tt0050083,12 Angry Men,1957,https://www.imdb.com/title/tt0050083,800954,9.0,Approved,1h 36m,"Crime,Drama",...,nm0001486,Sidney Lumet,nm0741627,Reginald Rose,The jury in a New York City murder trial is fr...,"ur1318549,ur0643062,ur0688559,ur20552756,ur945...","uds3,tedg,tfrizzell,TheLittleSongbird,henrique...","rw0060044,rw0060025,rw0060034,rw2262425,rw5448...","The over-used term ""classic movie"" really come...",This once-in-a-generation masterpiece simply h...


In [3]:
# Pré-processamento adaptado para o formato do movies.csv
def clean_text(text):
    """Turn one raw cell value into a string with separators blanked out.

    Values that ``pd.isnull`` reports as missing become the empty string.
    Anything else is converted with ``str`` and every comma, period, hyphen
    and forward slash is replaced by a single space (so consecutive
    separators yield consecutive spaces).
    """
    if pd.isnull(text):
        return ''
    # One translation table performs all four one-character substitutions.
    separators_to_space = str.maketrans({',': ' ', '.': ' ', '-': ' ', '/': ' '})
    return str(text).translate(separators_to_space)

# Columns whose cleaned text is concatenated, in this order, into 'tags'.
tag_source_columns = ['genre', 'cast_name', 'director_name', 'storyline']
cleaned_columns = [movies[name].apply(clean_text) for name in tag_source_columns]

# Join the cleaned columns left to right with a single space between them.
tags = cleaned_columns[0]
for extra in cleaned_columns[1:]:
    tags = tags + ' ' + extra
movies['tags'] = tags

# Preview the title next to the generated tags.
movies[['title', 'tags']].head(5)

Unnamed: 0,title,tags
0,The Shawshank Redemption,Drama Tim Robbins Morgan Freeman Bob Gunton Wi...
1,The Godfather,Crime Drama Marlon Brando Al Pacino James Caan...
2,The Dark Knight,Action Crime Drama Christian Bale Heath Ledger...
3,The Godfather Part II,Crime Drama Al Pacino Robert Duvall Diane Keat...
4,12 Angry Men,Crime Drama Martin Balsam John Fiedler Lee J ...


In [4]:
# Vetorização das tags com TF-IDF e cálculo da similaridade de cosseno
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF model on the tags, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])

# Called with one argument, cosine_similarity compares the matrix with itself,
# giving a square matrix with one row and one column per movie.
cosine_sim = cosine_similarity(tfidf_matrix)

# Example: similarity between the first 5 movies.
cosine_sim[0:5, 0:5]

array([[1.        , 0.03450403, 0.10033476, 0.04640603, 0.02259071],
       [0.03450403, 1.        , 0.05330861, 0.34467717, 0.05479616],
       [0.10033476, 0.05330861, 1.        , 0.06095107, 0.04122436],
       [0.04640603, 0.34467717, 0.06095107, 1.        , 0.0527629 ],
       [0.02259071, 0.05479616, 0.04122436, 0.0527629 , 1.        ]])

In [5]:
# Função de recomendação baseada em similaridade de cosseno
import numpy as np
import pickle

def recommend(title, movies=None, cosine_sim=None, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up; the comparison is case-insensitive and the
        first matching row is used.
    movies : pandas.DataFrame, optional
        Frame with a ``'title'`` column. When omitted, the module-level
        ``movies`` is looked up at call time.
    cosine_sim : 2-D array-like, optional
        Similarity matrix indexed by row position. When omitted, the
        module-level ``cosine_sim`` is looked up at call time.
        NOTE(review): assumes row i corresponds to the i-th row of ``movies``
        - confirm both are built from the same frame.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered by decreasing similarity, never
        including the queried row itself. Empty when ``title`` is not a
        string, is not found, or ``top_n`` is not positive.
    """
    # Resolve the defaults at call time rather than at definition time, so
    # re-running the cells that rebuild `movies` / `cosine_sim` is picked up
    # without having to re-run this definition.
    if movies is None:
        movies = globals()['movies']
    if cosine_sim is None:
        cosine_sim = globals()['cosine_sim']

    if not isinstance(title, str) or top_n <= 0:
        return []

    titles = movies['title']
    is_match = titles.str.lower().eq(title.lower()).fillna(False).to_numpy(dtype=bool)
    # Work with row positions, not index labels: cosine_sim is addressed by
    # position, so this stays correct even if the frame's index is not 0..n-1.
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        return []
    pos = int(positions[0])

    scores = np.asarray(cosine_sim[pos], dtype=float)
    # Stable descending order: equal scores keep ascending row order.
    ranking = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly instead of assuming it sorts first;
    # another row can tie with it at the top score.
    ranking = ranking[ranking != pos][:top_n]
    return titles.iloc[ranking].tolist()

# Example usage. The call is not the last expression of the cell, so its
# result would be silently discarded; print it explicitly.
# NOTE(review): recommend returns [] when the title is absent from the data -
# confirm 'Avatar' actually exists in movies.csv.
print(recommend('Avatar'))

# Save the similarity matrix and the titles for use in Streamlit.
with open('similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)
movies[['title']].to_pickle('movies.pkl')