# Recomendações Personalizadas
## Filtragem colaborativa - item-item

Usando dados baseados em itens para encontrar filmes semelhantes com base em como eles foram classificados pelos usuários.

In [93]:
# Importandos as bibliotecas  

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor

In [94]:
# Importando os dados e armazendo em um DataFrame

# Importando os dados 
user_ratings = pd.read_csv("/Users/lucasmartins/Documents/LucasMartins/Projetos/Recomendacao/Recomendacao_nao_personalizada/user_ratings.csv")

# Inspecionando o DataFrame
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [95]:
# Transformando o DataFrame em uma matriz de classificação de usuários onde cada linha representa um usuário e
# cada coluna representa os filmes na plataforma.

# Transformando o DataFrame
user_ratings_pivot = user_ratings.pivot_table(index="title", columns="userId", values="rating")

# Inspecionando a tabela transformada
user_ratings_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [96]:
# Preenchendo os dados ausentes com informações que não devem influenciar os dados da análise.
# Será obtido a pontuação média que cada usuário deu em todas as classificaçoes e, em seguida, 
# utilizaremos essa média para centralizar as pontuações dos usuários em torno de zero, que será
# uma pontuação neutra, minimizando o impacto em seu perfil geral, permitindo a comparação dos usuários.

# Obtendo a classificação média para cada usuário 
avg_ratings = user_ratings_pivot.mean(axis=1)

# Centralizando as avaliações de cada usuário em torno de 0
user_ratings_table_centered = user_ratings_pivot.sub(avg_ratings, axis=0)

# Preenchendo os dados com 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

# Inspecionando a tabela normalizada
user_ratings_table_normed




userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
'Hellboy': The Seeds of Creation (2004),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
'Round Midnight (1986),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
'Salem's Lot (2004),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
'Til There Was You (1997),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,1.136364,0.0,0.0,0.0,0.0,0.636364,0.0,0.000000
xXx (2002),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.770833,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.729167,0.0,-0.770833
xXx: State of the Union (2005),0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,-0.500000
¡Three Amigos! (1986),0.865385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


In [97]:
# Calculando as semelhanças e distâncias entre itens no conjunto de dados.
# Será utilizado a distância cosseno, os dados já foram centralizados em torno
# de zero, os valores do cosseno agora podem variar de -1 a 1, sendo 1 o mais semelhante e -1 o mínimo.
# A similaridade de cosseno compara duas matrizes NumPy.

# Cosseno de similaridade é a medida do ângulo entre dois documentos no espaço métrico de alta dimensão 

# Obtendo as linhas que serão comparadas
sw_IV = user_ratings_table_normed.loc['Star Wars: Episode IV - A New Hope (1977)', :].values.reshape(1, -1)
sw_V = user_ratings_table_normed.loc['Star Wars: Episode V - The Empire Strikes Back (1980)', :].values.reshape(1, -1)

# Transformando em uma matriz NumPy com os valores de pontos
similarity_A = cosine_similarity(sw_IV, sw_V)
print(similarity_A)

[[0.56879723]]


In [98]:
# Fazendo as recomendações, encontrando itens mais semelhantes em geral.
# Para isso, é preciso encontrar as semalhanças entre todos os itens de uma vez.
# Resultando em uma matriz de similaridade entre todos os itens.
# Com essam atriz calculada, será possível fazer recomendações, encontrando os itens que
# foram avaliados mais semelhante ao que um usuário gostou ao selecionar com o qual
# deseja comparar e classificar suas semelhanças.


# Gerando a matriz de similaridade
similarities = cosine_similarity(user_ratings_table_normed)

# Envolvendo as semelhanças em um DataFrame
cosine_similarity_df = pd.DataFrame(similarities, index=user_ratings_table_normed.index, columns=user_ratings_table_normed.index)

# Encontrando os valores de similaridade para um filme específico
cosine_similarity_series = cosine_similarity_df.loc["Star Wars: Episode IV - A New Hope (1977)"]

# Ordenando os valores do maior para o menor
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Inspecionando as semelhanças
ordered_similarities.head(5)




title
Star Wars: Episode IV - A New Hope (1977)                                         1.000000
Star Wars: Episode V - The Empire Strikes Back (1980)                             0.568797
Star Wars: Episode VI - Return of the Jedi (1983)                                 0.533030
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.232904
Indiana Jones and the Last Crusade (1989)                                         0.230792
Name: Star Wars: Episode IV - A New Hope (1977), dtype: float64