# 1. Importando

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.metrics.pairwise as pw

# 2. Carregando os dados

In [2]:
# Load the four CSV sources used by the notebook.
# Only Ratings.csv uses ';' as the field separator; the others rely on the pandas default (',').
filmes = pd.read_csv('../data/Filmes.csv')
ratings = pd.read_csv('../data/Ratings.csv', sep=';')
dados = pd.read_csv('../data/Dados.csv')
tags = pd.read_csv('../data/Tags.csv')

In [3]:
# Preview every loaded table under its own heading.
print("TABELA FILMES")
display(filmes)
# The remaining tables are each preceded by a blank-line separator and their heading.
for _cabecalho, _tabela in (
    ("TABELA RATINGS", ratings),
    ("TABELA DADOS", dados),
    ("TABELA TAGS", tags),
):
    print("\n\n")
    print(_cabecalho)
    display(_tabela)

TABELA FILMES


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
9732,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017
9733,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017
9734,193585,Flint,Drama,2017
9735,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018





TABELA RATINGS


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0





TABELA DADOS


Unnamed: 0,Name,Year,Stars,Score,Time,Votes,Total,Tags,Directors_Cast,Discription
0,Pulp Fiction,1994,8.9,94.0,154,1871051,$107.93,\nCrime Drama,Quentin Tarantino John Travolta Uma Thurman Sa...,\nThe lives of two mob hitmen a boxer a gangst...
1,The Amazing Spider-Man 2,2014,6.6,53.0,142,425529,$202.85,\nAction Adventure Fantasy,Marc Webb Andrew Garfield Emma Stone Jamie Fox...,\nWhen New York is put under siege by Oscorp i...
2,The Shawshank Redemption,1994,9.3,80.0,142,2409436,$28.34,\nDrama,Frank Darabont Tim Robbins Morgan Freeman Bob ...,\nTwo imprisoned men bond over a number of yea...
3,Star Wars: Episode IV - A New Hope,1977,8.6,90.0,121,1255464,$322.74,\nAction Adventure Fantasy,George Lucas Mark Hamill Harrison Ford Carrie ...,\nLuke Skywalker joins forces with a Jedi Knig...
4,Back to the Future,1985,8.5,87.0,116,1087878,$210.61,\nAdventure Comedy Sci-Fi,Robert Zemeckis Michael J. Fox Christopher Llo...,\nMarty McFly a 17-year-old high school studen...
...,...,...,...,...,...,...,...,...,...,...
9932,Cell,2016,4.4,38.0,98,26188,,\nAction Adventure Horror,Tod Williams John Cusack Samuel L. Jackson Isa...,\nWhen a mysterious cell phone signal causes a...
9933,Geralds Game,2017,6.5,77.0,103,94100,,\nDrama Horror Thriller,Mike Flanagan Carla Gugino Bruce Greenwood Chi...,\nA couple tries to spice up their marriage in...
9934,Liseys Story,2021,6.0,,,1251,,\nDrama Horror Mystery,Julianne Moore Clive Owen Jennifer Jason Leigh...,
9935,John Wick: Chapter 2,2017,7.5,75.0,122,381938,$92.03,\nAction Crime Thriller,Chad Stahelski Keanu Reeves Riccardo Scamarcio...,\nAfter returning to the criminal underworld t...





TABELA TAGS


Unnamed: 0,movieId,tag
0,1,"Owned,imdb top 250,Pixar,Pixar,time travel,chi..."
1,2,"Robin Williams,time travel,fantasy,based on ch..."
2,3,"funny,best friend,duringcreditsstinger,fishing..."
3,4,"based on novel or book,chick flick,divorce,int..."
4,5,"aging,baby,confidence,contraception,daughter,g..."
...,...,...
45251,208813,might like
45252,208933,"black and white,deal with the devil"
45253,209035,"computer animation,Japan,mass behavior,mass sc..."
45254,209037,"chameleon,computer animation,gluttony,humorous..."


# 3. Pré-processamento dos dados

In [4]:
# Show the dtype of the merge key in both frames, one per line
print(filmes['movieId'].dtype, ratings['movieId'].dtype, sep='\n')

int64
int64


In [5]:
# Pre-processing for approach 1 (collaborative filtering):
# inner join so that every row is one (movie, user, rating) observation.
db = filmes.merge(ratings, on='movieId')

# Pre-processing for approach 2 (content-based filtering).
# The join with `tags` is done on a string key. The conversion is applied to copies
# (via .assign) instead of overwriting filmes['movieId']: the ratings merge above
# needs the original key dtype, so mutating `filmes` in place made this cell fail
# when it was executed a second time in the same kernel.
db2 = (
    filmes
    .assign(movieId=filmes['movieId'].astype(str))
    # Attach the extra metadata by exact title match (left join keeps every movie)
    .merge(dados, left_on='title', right_on='Name', how='left')
    # NOTE(review): tags['movieId'] is normalised to str as well so both keys share a
    # dtype regardless of how the CSV was parsed - confirm against Tags.csv
    .merge(tags.assign(movieId=tags['movieId'].astype(str)), on='movieId', how='left')
)
# Genres arrive as "A|B|C"; turn the separator into spaces so each genre is its own token
db2['genres'] = db2['genres'].str.replace('|', ' ', regex=False)
# Free-text field fed to the vectoriser: genres + directors/cast + description + tags.
# Missing pieces become empty strings so a single NaN does not wipe out the whole text.
db2['infos'] = (
    db2['genres'].fillna('')
    + ' ' + db2['Directors_Cast'].fillna('')
    + ' ' + db2['Discription'].fillna('')
    + ' ' + db2['tag'].fillna('')
)

In [6]:
# Preview both pre-processed frames (approach 1 first, then approach 2)
display(db, db2)

Unnamed: 0,movieId,title,genres,year,userId,rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5
...,...,...,...,...,...,...
100731,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,4.0
100732,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,3.5
100733,193585,Flint,Drama,2017,184,3.5
100734,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,3.5


Unnamed: 0,movieId,title,genres,year,Name,Year,Stars,Score,Time,Votes,Total,Tags,Directors_Cast,Discription,tag,infos
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,Toy Story,1995,8.3,95.0,81,908794,$191.80,\nAnimation Adventure Comedy,John Lasseter Tom Hanks Tim Allen Don Rickles ...,\nA cowboy doll is profoundly threatened and j...,"Owned,imdb top 250,Pixar,Pixar,time travel,chi...",Adventure Animation Children Comedy Fantasy Jo...
1,2,Jumanji,Adventure Children Fantasy,1995,Jumanji,1995,7.0,39.0,104,316485,$100.48,\nAdventure Comedy Family,Joe Johnston Robin Williams Kirsten Dunst Bonn...,\nWhen two kids find and play a magical board ...,"Robin Williams,time travel,fantasy,based on ch...",Adventure Children Fantasy Joe Johnston Robin ...
2,3,Grumpier Old Men,Comedy Romance,1995,,,,,,,,,,,"funny,best friend,duringcreditsstinger,fishing...","Comedy Romance funny,best friend,duringcredi..."
3,4,Waiting to Exhale,Comedy Drama Romance,1995,Waiting to Exhale,1995,6.0,,124,9911,$67.05,\nComedy Drama Romance,Forest Whitaker Whitney Houston Angela Bassett...,,"based on novel or book,chick flick,divorce,int...",Comedy Drama Romance Forest Whitaker Whitney H...
4,5,Father of the Bride Part II,Comedy,1995,Father of the Bride Part II,1995,6.1,49.0,106,35472,$76.59,\nComedy Family Romance,Charles Shyer Steve Martin Diane Keaton Martin...,\nGeorge Banks must deal not only with the pre...,"aging,baby,confidence,contraception,daughter,g...",Comedy Charles Shyer Steve Martin Diane Keaton...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10030,193581,Black Butler: Book of the Atlantic,Action Animation Comedy Fantasy,2017,,,,,,,,,,,,Action Animation Comedy Fantasy
10031,193583,No Game No Life: Zero,Animation Comedy Fantasy,2017,,,,,,,,,,,,Animation Comedy Fantasy
10032,193585,Flint,Drama,2017,,,,,,,,,,,,Drama
10033,193587,Bungo Stray Dogs: Dead Apple,Action Animation,2018,,,,,,,,,,,anime,Action Animation anime


# 4.1 : Abordagem 1: Collaborative Filtering

Vamos identificar usuários parecidos por meio dos filmes e de suas avaliações e recomendar a um deles um filme que o outro avaliou e que ele ainda não tenha assistido. Para isso vamos usar a similaridade dos cossenos: cada filme é transformado em um vetor em que cada posição é a nota dada por um usuário. Ao comparar dois vetores, quanto mais próximos eles estão um do outro, menor é o ângulo entre eles e, consequentemente, maior é o cosseno (OBS: o valor máximo do cosseno é 1).

In [7]:
# Movie x user rating matrix: one row per title, one column per userId.
# Movies a user did not rate are filled with 0.
tabela_filmes = db.pivot_table(index='title', columns='userId', values='rating').fillna(0)
tabela_filmes.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Pairwise cosine similarity between the rows (movies) of the rating matrix
similaridade = cosine_similarity(tabela_filmes)
print(similaridade)  # plain array, no labels yet
# Wrap the array in a DataFrame labelled by title on both axes
rec_df = pd.DataFrame(
    data=similaridade,
    index=tabela_filmes.index,
    columns=tabela_filmes.index,
)
rec_df.head()

[[1.         0.         0.         ... 0.32732684 0.         0.        ]
 [0.         1.         0.70710678 ... 0.         0.         0.        ]
 [0.         0.70710678 1.         ... 0.         0.         0.        ]
 ...
 [0.32732684 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté (Freedom for Us)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight,0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,1.0,0.857493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You,0.0,0.0,0.0,0.857493,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4.2 : Abordagem 2: Content-Based Filtering

Avalia as características de cada filme: mede a similaridade entre filmes olhando para elenco, diretor, etc. e usa análise de linguagem natural para encontrar palavras em comum.

In [9]:
# Vectorise the combined text of each movie with TF-IDF
vec = TfidfVectorizer()
# Every entry is coerced to a string before being handed to the vectoriser
tfidif = vec.fit_transform(db2['infos'].apply(lambda texto: np.str_(texto)))
# Cosine similarity between the TF-IDF rows, labelled by title on both axes
similaridade = cosine_similarity(tfidif)
rec_df2 = pd.DataFrame(data=similaridade, index=db2['title'], columns=db2['title'])
display(rec_df2)

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Heat,Sabrina,Tom and Huck,Sudden Death,...,Gintama: The Movie,anohana: The Flower We Saw That Day - The Movie,Silver Spoon,Love Live! The School Idol Movie,Jon Stewart Has Left the Building,Black Butler: Book of the Atlantic,No Game No Life: Zero,Flint,Bungo Stray Dogs: Dead Apple,Andrew Dice Clay: Dice Rules
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,1.000000,0.048801,0.012254,0.002454,0.014387,0.007494,0.007550,0.002643,0.043585,0.000931,...,0.104419,0.369668,0.029816,0.410376,0.0,0.287439,0.315058,0.000000,0.216688,0.040801
Jumanji,0.048801,1.000000,0.001719,0.052891,0.069424,0.001327,0.001146,0.024341,0.025515,0.008158,...,0.001093,0.000000,0.000000,0.000000,0.0,0.110368,0.119027,0.000000,0.002529,0.000000
Grumpier Old Men,0.012254,0.001719,1.000000,0.009219,0.048977,0.041478,0.041873,0.013453,0.000000,0.003049,...,0.007017,0.000000,0.022063,0.000000,0.0,0.009783,0.010850,0.000000,0.000000,0.030191
Waiting to Exhale,0.002454,0.052891,0.009219,1.000000,0.013580,0.007871,0.008007,0.156400,0.021910,0.005349,...,0.012308,0.021480,0.072466,0.000000,0.0,0.017160,0.019032,0.049467,0.000000,0.052956
Father of the Bride Part II,0.014387,0.069424,0.048977,0.013580,1.000000,0.001193,0.000840,0.013264,0.012747,0.003436,...,0.015815,0.000000,0.049726,0.000000,0.0,0.054984,0.060981,0.000000,0.000000,0.068045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic,0.287439,0.110368,0.009783,0.017160,0.054984,0.053303,0.054220,0.009018,0.000000,0.040871,...,0.283778,0.565645,0.236805,0.627933,0.0,1.000000,0.901660,0.000000,0.482519,0.324046
No Game No Life: Zero,0.315058,0.119027,0.010850,0.019032,0.060981,0.000000,0.000000,0.010001,0.000000,0.000000,...,0.240350,0.627337,0.262632,0.696420,0.0,0.901660,1.000000,0.000000,0.362987,0.359389
Flint,0.000000,0.000000,0.000000,0.049467,0.000000,0.000000,0.000000,0.012997,0.000000,0.000000,...,0.000000,0.434227,0.682618,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
Bungo Stray Dogs: Dead Apple,0.216688,0.002529,0.000000,0.000000,0.000000,0.044244,0.045006,0.000000,0.000000,0.033925,...,0.432035,0.469515,0.000000,0.521219,0.0,0.482519,0.362987,0.000000,1.000000,0.000000


# 5. Recomendando Filmes

In [25]:
# função para recomendar filmes

def _top_similares(matriz, titulo, n=5):
    """Return up to `n` row labels of `matriz` ranked by similarity to `titulo`.

    Args:
        matriz: square similarity DataFrame whose rows and columns are labelled
            by movie title (labels may be repeated).
        titulo: column label of the movie to compare against.
        n: maximum number of labels returned.

    Returns:
        A list of labels, most similar first, never containing `titulo` itself.
    """
    scores = matriz[titulo]
    # A repeated column label makes the selection 2-D (one column per duplicate);
    # keep the best score of each row so the result is a single ranking.
    if scores.ndim == 2:
        scores = scores.max(axis=1)
    # Repeated row labels would list the same title more than once.
    scores = scores.groupby(level=0).max()
    # Remove the queried movie explicitly: with ties at the top it is not
    # guaranteed to be the first entry after sorting.
    scores = scores.drop(labels=titulo, errors='ignore')
    return scores.sort_values(ascending=False, kind='mergesort').head(n).index.to_list()


def recomendar_filmes():
    """Prompt for movie titles and print recommendations until the user enters '0'.

    Each title is normalised with `titled_movie` and looked up in the two
    module-level similarity frames: `rec_df` (approach 1, built from ratings) and
    `rec_df2` (approach 2, built from text). Up to five titles are printed for
    each approach. Titles missing from `rec_df` are reported and the prompt repeats.
    """
    while True:
        filme = input("Digite o nome de um filme para obter recomendações (ou '0' para encerrar): ").strip()
        if filme == '0':
            print("Encerrando...")
            break

        filme = titled_movie(filme)

        # The lookup below selects a column, so membership is checked on the columns
        if filme not in rec_df.columns:
            print("Filme não encontrado. Tente novamente.")
            continue

        print("\nRecomendações - Outros usuários curtiram (Abordagem 1):")
        for titulo in _top_similares(rec_df, filme):
            print(titulo)

        print(f"\nRecomendações - Filmes parecidos com {filme} (Abordagem 2):")
        if filme in rec_df2.columns:
            for titulo in _top_similares(rec_df2, filme):
                print(titulo)
        else:
            # Avoid a KeyError if the two frames were built from different title sets
            print("Nenhum dado de conteúdo disponível para este filme.")

def titled_movie(name):
    """Normalise a user-typed movie name to the capitalisation used for lookups.

    Every space-separated word is title-cased, except the connecting words
    'of' and 'the', which are lower-cased whatever case the user typed. The
    first word of the name is always title-cased, so "the founder" becomes
    "The Founder" instead of "the Founder".

    Args:
        name: raw text typed by the user.

    Returns:
        The normalised name, with the original spacing preserved.
    """
    exceptions = ('of', 'the')
    normalized = []
    first_word_pending = True
    for word in name.split(' '):
        lowered = word.lower()
        if lowered in exceptions and not first_word_pending:
            normalized.append(lowered)
        else:
            normalized.append(word.title())
        # Empty pieces come from repeated/leading spaces and do not count as a word
        if word:
            first_word_pending = False
    return ' '.join(normalized)

# Interactive interface: start the prompt loop (it keeps asking for titles until '0' is entered)
recomendar_filmes()


Recomendações - Outros usuários curtiram (Abordagem 1):
Repo Men
The Founder
Beasts of No Nation
Dragon Ball Z: Super Android 13! (Doragon bôru Z 7: Kyokugen batoru!! San dai sûpâ saiyajin)
Anomalisa

Recomendações - Filmes parecidos com La La Land (Abordagem 2):
Crazy, Stupid, Love.
Gangster Squad
Paper Man
Easy A
'Round Midnight
Encerrando...
