### Importação e Tratamento de Dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# MovieLens datasets, downloaded from: https://files.grouplens.org/datasets/movielens/ml-latest.zip
links_df, ratings_df = (
  pd.read_csv(csv_path)
  for csv_path in ('./data/links.csv', './data/ratings.csv')
)

# TMDB id export; can be obtained from: https://developer.themoviedb.org/docs/daily-id-exports
tmdb_df = pd.read_csv('./data/tmdb_ids.csv')

# Rating exports obtained from the Letterboxd platform
pedro_df, sarah_df = (
  pd.read_csv(csv_path)
  for csv_path in ('./data/pedro_ratings.csv', './data/sarah_ratings.csv')
)

In [3]:
# Reshape the TMDB export so its columns line up with the Letterboxd rating exports.
# The frame is rebuilt by reassignment instead of inplace=True, and the drop tolerates
# columns that are already gone, so running this cell a second time does not raise.
tmdb_df = (
  tmdb_df
  .drop(columns=['adult', 'popularity', 'video'], errors='ignore')
  .rename(columns={'id': 'tmdbId', 'original_title': 'Title'})
)

# Attach the MovieLens movieId to each TMDB title. The inner join already keeps only
# the movies that appear in both datasets; the external ids are not needed afterwards.
movies_tmdb = (
  pd.merge(tmdb_df, links_df, on='tmdbId', how='inner')
  .drop(columns=['imdbId', 'tmdbId'])
  # Discard the remaining rows that have no title
  .dropna(subset=['Title'])
)
movies_tmdb

Unnamed: 0,Title,movieId
0,Blondie,176399
1,Ariel,4470
2,Varjoja paratiisissa,61724
3,Four Rooms,18
4,Judgment Night,479
...,...,...
30356,人情紙風船,87033
30357,Ghost,154170
30358,Stories of Lost Souls,54988
30359,La Tête d'un homme,151040


In [4]:
# Persist the cleaned movie dataset as a CSV file, leaving the row index out
movies_tmdb.to_csv(path_or_buf='./data/movies_tmdb.csv', index=False)

In [5]:
# Clean the datasets holding my ratings and Sarah's ratings.
# Columns that are not used by the recommender, and the names the kept columns
# receive so they match the movie dataset ('Title') and the MovieLens ratings ('rating').
LETTERBOXD_UNUSED_COLUMNS = ['Year', 'Letterboxd URI', 'Date']
LETTERBOXD_COLUMN_NAMES = {'Rating': 'rating', 'Name': 'Title'}

# Rebuild both frames by reassignment rather than mutating them in place.
# errors='ignore' plus rename's tolerance for missing keys make this cell safe to re-run:
# on a second run the columns are already dropped/renamed and nothing changes.
pedro_df, sarah_df = [
  ratings.drop(columns=LETTERBOXD_UNUSED_COLUMNS, errors='ignore').rename(columns=LETTERBOXD_COLUMN_NAMES)
  for ratings in (pedro_df, sarah_df)
]

pedro_df

Unnamed: 0,Title,rating
0,Neon Genesis Evangelion: The End of Evangelion,5.0
1,Neon Genesis Evangelion: Death and Rebirth,4.5
2,Kizumonogatari Part 1: Tekketsu,3.5
3,Kizumonogatari Part 3: Reiketsu,4.0
4,Kizumonogatari Part 2: Nekketsu,3.0
...,...,...
542,Resident Evil: Welcome to Raccoon City,1.0
543,Black Panther: Wakanda Forever,3.0
544,Spider-Man: Across the Spider-Verse,4.5
545,Mack & Rita,1.5


In [6]:
# Join each person's ratings with the movie dataset on the title, tag the rows with a
# fixed user id (1 for Pedro, 2 for Sarah) and discard the title once the join is done.
pedro_ratings = (
  movies_tmdb
  .merge(pedro_df, on='Title')
  .assign(userId=1)
  .drop(columns=['Title'])
)

sarah_ratings = (
  movies_tmdb
  .merge(sarah_df, on='Title')
  .assign(userId=2)
  .drop(columns=['Title'])
)

pedro_ratings

Unnamed: 0,movieId,rating,userId
0,260,4.5,1
1,3910,1.5,1
2,6874,4.0,1
3,2329,3.0,1
4,541,3.5,1
...,...,...,...
218,97188,2.0,1
219,99728,3.0,1
220,112175,2.5,1
221,109846,2.5,1


In [7]:
# Minimum number of ratings a movie must have received / a user must have given
# to be kept. Both thresholds are inclusive (>=).
MIN_RATINGS_PER_MOVIE = 2000
MIN_RATINGS_PER_USER = 1000

# Count how many times each movie was rated and how many ratings each user gave
movies_ratings = ratings_df['movieId'].value_counts()
users_ratings = ratings_df['userId'].value_counts()

# Ids of the movies and users that reach their respective threshold
movies_updated = movies_ratings[movies_ratings >= MIN_RATINGS_PER_MOVIE].index
users_updated = users_ratings[users_ratings >= MIN_RATINGS_PER_USER].index

# Keep only the ratings whose movie AND user both passed the filter,
# then drop the timestamp column
filtered_ratings = (
  ratings_df[ratings_df['movieId'].isin(movies_updated) & ratings_df['userId'].isin(users_updated)]
  .drop(columns=['timestamp'])
)
filtered_ratings

Unnamed: 0,userId,movieId,rating
7662,81,2,3.0
7663,81,3,3.5
7664,81,6,4.0
7665,81,10,2.5
7666,81,16,5.0
...,...,...,...
27751534,283195,95167,4.0
27751538,283195,103141,4.0
27751539,283195,103253,1.5
27751542,283195,116797,3.5


In [8]:
# Keep only the movies we rated that are also part of the filtered movie set,
# then collapse repeated movieIds down to their first occurrence.
pedro_ratings = (
  pedro_ratings
  .loc[pedro_ratings['movieId'].isin(movies_updated)]
  .drop_duplicates(subset=['movieId'], keep='first')
)
sarah_ratings = (
  sarah_ratings
  .loc[sarah_ratings['movieId'].isin(movies_updated)]
  .drop_duplicates(subset=['movieId'], keep='first')
)

pedro_ratings

Unnamed: 0,movieId,rating,userId
0,260,4.5,1
1,3910,1.5,1
2,6874,4.0,1
3,2329,3.0,1
4,541,3.5,1
...,...,...,...
210,105844,4.0,1
212,106072,1.0,1
213,122882,4.0,1
220,112175,2.5,1


In [9]:
# User ids 1 and 2 are used for our own ratings. If a MovieLens user with one of
# those ids were still present in filtered_ratings, two different people would be
# merged under the same id (and the later pivot could hit duplicate entries), so
# fail loudly instead of silently mixing the data.
reserved_user_ids = [1, 2]
clashing_ids = filtered_ratings.loc[filtered_ratings['userId'].isin(reserved_user_ids), 'userId'].unique()
if len(clashing_ids) > 0:
  raise ValueError(
    'filtered_ratings already contains the reserved userId(s) {}'.format(sorted(clashing_ids.tolist()))
  )

# Append our ratings to the ratings dataset
merged_ratings = pd.concat([filtered_ratings, pedro_ratings, sarah_ratings], ignore_index=True)

# Store the ids of the movies each of us already rated, to filter the recommendations later
pedro_movies = merged_ratings.loc[merged_ratings['userId'] == 1, 'movieId'].tolist()
sarah_movies = merged_ratings.loc[merged_ratings['userId'] == 2, 'movieId'].tolist()

In [10]:
# Build the Users x Movies matrix used for the recommendation: one row per userId,
# one column per movieId, the rating as the cell value and 0.0 where no rating exists.
users_movies = (
  merged_ratings
  .pivot(index='userId', columns='movieId', values='rating')
  .fillna(0.0)
)
users_movies

movieId,1,2,3,4,5,6,7,9,10,11,...,166635,166643,168250,168252,171763,174055,176371,177593,177765,179819
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81,0.0,3.0,3.5,0.0,0.0,4.0,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,0.0,3.0,0.0,0.0,2.0,3.0,0.0,2.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282403,2.5,2.0,0.0,0.0,0.0,4.5,0.0,2.5,3.5,0.0,...,0.0,0.0,0.0,3.0,0.0,3.5,4.0,0.0,0.0,3.5
282748,3.5,4.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.5,0.0
282808,4.0,4.0,2.5,0.0,3.0,0.0,0.0,0.0,4.5,0.0,...,0.0,0.0,3.0,0.0,2.0,4.5,0.0,3.5,5.0,4.0
283000,4.5,0.0,0.0,0.0,3.5,0.0,3.0,0.0,3.5,3.0,...,0.0,4.5,4.0,4.0,4.0,4.0,3.5,4.0,4.0,4.0


### Abordagem Crua

In [11]:
def matrix_factorization(UM, F=5, steps=10, _lambda=0.0002, beta=0.02, seed=None):
  """Factorize a Users x Movies rating matrix with stochastic gradient descent.

  Only cells with a value greater than 0 are treated as observed ratings; every
  other cell is ignored during training and in the final error.

  Parameters
  ----------
  UM : 2-D array-like
    Users x Movies matrix of ratings.
  F : int
    Number of latent factors.
  steps : int
    Number of full passes over the observed ratings.
  _lambda : float
    Step size of each gradient update (it plays the role of the learning rate).
  beta : float
    Regularization weight applied to both factor matrices.
  seed : int or None
    When given, the factor matrices are initialized from a dedicated
    ``numpy.random.RandomState(seed)`` so the run is reproducible. When None the
    global ``numpy.random`` state is used, exactly as before.

  Returns
  -------
  (UF, MF) : tuple of numpy.ndarray
    ``UF`` has shape (n_users, F) and ``MF`` has shape (n_movies, F);
    ``numpy.dot(UF, MF.T)`` is the predicted rating matrix.

  Raises
  ------
  ValueError
    If ``UM`` is not two-dimensional.
  """
  UM = np.asarray(UM, dtype=float)
  if UM.ndim != 2:
    raise ValueError('UM must be a 2-D Users x Movies matrix, got {} dimension(s)'.format(UM.ndim))
  n_users, n_movies = UM.shape

  # Initialize the Users x Factors and Factors x Movies matrices
  rng = np.random if seed is None else np.random.RandomState(seed)
  UF = rng.rand(n_users, F)
  FM = rng.rand(n_movies, F).T

  # (row, column) positions of the observed ratings, in row-major order. Computing
  # them once avoids re-scanning every unrated cell on each pass.
  observed = np.argwhere(UM > 0)

  for step in range(steps):
    # Percentage is derived from `steps`, so it stays within 0-100 for any step count
    print('Gerando sua lista de recomendações: {}%'.format(100 * step // steps))

    for i, j in observed:
      # Error between the real rating and the current prediction
      eij = UM[i, j] - np.dot(UF[i, :], FM[:, j])

      # Both factor vectors are updated from their values *before* this step:
      # the movie update must not see the user factors that were just changed.
      user_factors_before = UF[i, :].copy()
      UF[i, :] += _lambda * (2 * eij * FM[:, j] - beta * user_factors_before)
      FM[:, j] += _lambda * (2 * eij * user_factors_before - beta * FM[:, j])

  # Total error at the end: squared residual of every observed rating plus the
  # regularization term of the factor vectors involved in that rating
  rows, cols = observed[:, 0], observed[:, 1]
  residuals = UM[rows, cols] - np.sum(UF[rows, :] * FM[:, cols].T, axis=1)
  e = np.sum(residuals ** 2) + (beta / 2) * (np.sum(UF[rows, :] ** 2) + np.sum(FM[:, cols] ** 2))

  print('Erro encontrado ao final: ', e)
  return UF, FM.T

In [12]:
def couple_recommendations(matrix, qty_movies):
  """Recommend movies for the couple (user ids 1 and 2), best first.

  The rating matrix is factorized with ``matrix_factorization``; the predicted
  ratings of users 1 and 2 are averaged per movie, and the ``qty_movies`` movies
  with the highest average that neither of them has rated are selected.

  Relies on the notebook-level ``pedro_movies`` and ``sarah_movies`` (ids of the
  movies already rated) and ``movies_tmdb`` (movieId -> Title lookup).

  Parameters
  ----------
  matrix : pandas.DataFrame
    Users x Movies rating matrix, indexed by userId with movieId columns.
    It must contain rows labelled 1 and 2 (a KeyError is raised otherwise).
  qty_movies : int
    Number of movies to select.

  Returns
  -------
  pandas.Series
    Titles of the selected movies taken from ``movies_tmdb`` (keeping its index
    labels), ordered from the highest to the lowest average predicted rating.
    Selected ids that have no row in ``movies_tmdb`` yield no title.
  """
  UF, FM = matrix_factorization(np.array(matrix))

  # Only the couple's predictions are needed, so multiply just their two factor
  # rows instead of materializing the full Users x Movies prediction matrix.
  couple_factors = pd.DataFrame(UF, index=matrix.index).loc[[1, 2]]
  couple_pred = pd.DataFrame(
    np.dot(couple_factors.to_numpy(), FM.T),
    index=couple_factors.index,
    columns=matrix.columns,
  )

  # Remove, in one pass, the movies that either of us has already rated
  already_rated = set(pedro_movies) | set(sarah_movies)
  unseen = [movie_id for movie_id in couple_pred.columns if movie_id not in already_rated]

  # Average the two predictions per movie and keep the n best movie ids
  avg_pred = couple_pred[unseen].mean(axis=0)
  top_ids = avg_pred.sort_values(ascending=False).index[:qty_movies]

  # Look the titles up and put them back in ranking order: a plain isin() filter
  # would return them in movies_tmdb order and lose the ranking.
  rank = {movie_id: position for position, movie_id in enumerate(top_ids)}
  matches = movies_tmdb[movies_tmdb['movieId'].isin(top_ids)]
  order = matches['movieId'].map(rank).to_numpy().argsort(kind='stable')
  recommendations = matches['Title'].iloc[order]

  return recommendations

In [13]:
recommendations = couple_recommendations(users_movies, 10)
recommendations

Gerando sua lista de recomendações: 0%
Gerando sua lista de recomendações: 10%
Gerando sua lista de recomendações: 20%
Gerando sua lista de recomendações: 30%
Gerando sua lista de recomendações: 40%
Gerando sua lista de recomendações: 50%
Gerando sua lista de recomendações: 60%
Gerando sua lista de recomendações: 70%
Gerando sua lista de recomendações: 80%
Gerando sua lista de recomendações: 90%


### Abordagem com o PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [None]:
# Seed shared by the train/test split and the ALS initialization, so the reported
# RMSE is repeatable across runs on the same input data.
SPARK_SEED = 42

# Create a Spark session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Create a Spark dataframe from the ratings already filtered in pandas
spark_merged_ratings = spark.createDataFrame(merged_ratings)

# Split the dataframe into train (80%) and test (20%)
train, test = spark_merged_ratings.randomSplit([0.8, 0.2], seed=SPARK_SEED)

# Create the recommendation model using Alternating Least Squares.
# coldStartStrategy="drop" discards predictions that would come out as NaN.
als = ALS(
  userCol="userId",
  itemCol="movieId",
  ratingCol="rating",
  coldStartStrategy="drop",
  nonnegative=True,
  seed=SPARK_SEED,
)

# Train the recommendation model on the training dataset
model = als.fit(train)

# After training, compute predictions for the test dataset
predictions = model.transform(test)

# Evaluate the recommendation model with the RMSE metric
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
# Generate 10 recommendations for every user with the trained model and
# bring the result into a pandas dataframe
movie_recommendations = model.recommendForAllUsers(10).toPandas()

# Keep only the recommendations of the user with id 1
my_recommendations = movie_recommendations.loc[
  movie_recommendations['userId'] == 1,
  'recommendations',
]
my_recommendations

In [None]:
# Build a dataframe with the recommendations of the user with id 1.
# The entry is read from movie_recommendations rather than from my_recommendations,
# because my_recommendations is rebound below: reading from it would make this cell
# fail when it is run a second time.
user_1_recs = movie_recommendations.loc[movie_recommendations['userId'] == 1, 'recommendations'].iloc[0]
recommended = pd.DataFrame(user_1_recs)
recommended.columns = ['movieId', 'rating']
my_recommendations = recommended.drop(columns=['rating'])

# Attach the titles of the recommended movies
list_of_recommendations = my_recommendations.merge(movies_tmdb, on='movieId', how='inner')

# Remove the movies I have already rated.
# NOTE(review): this runs after the recommendations were capped per user, so the
# final list can be shorter than the number requested from the model.
list_of_recommendations = list_of_recommendations[~list_of_recommendations['movieId'].isin(pedro_movies)]
list_of_recommendations