<a href="https://colab.research.google.com/github/pedrohenriquecordeiro/KNN-movie-recommendation/blob/master/sistema_recomendacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

movies = pd.read_csv("/content/drive/My Drive/data/movies.csv")
ratings = pd.read_csv("/content/drive/My Drive/data/ratings.csv")

### Detalhes do dataset MOVIES

In [3]:
# set o id
movies = movies.set_index('movieId')
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.describe()

Unnamed: 0,title,genres
count,58098,58098
unique,58020,1643
top,Interrogation (2016),Drama
freq,2,8402


### Detalhes do dataset RATINGS

In [5]:
# set o id
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,27753440.0,27753440.0,27753440.0,27753440.0
mean,141942.0,18488.0,3.530445,1193122000.0
std,81707.4,35102.63,1.066353,216048200.0
min,1.0,1.0,0.5,789652000.0
25%,71176.0,1097.0,3.0,998605300.0
50%,142022.0,2716.0,3.5,1174256000.0
75%,212459.0,7150.0,4.0,1422744000.0
max,283228.0,193886.0,5.0,1537945000.0


In [0]:
# limita o tamanho do dataset
# ratings = ratings.sample(frac = 0.001)
# ratings.describe()

### Cria um novo dataset MOVIES

* numero de votos do cada filme

* media das notas de cada filme

In [0]:
movies['total_ratings'] = ratings["movieId"].value_counts()
movies["mean_ratings"] = ratings.groupby("movieId").mean()["rating"]

Filtra os filmes com mais de 50 votos e ordena a partir da media de votos (0 a 5)

In [9]:
filtered_movies = movies.query("total_ratings >= 50").sort_values("mean_ratings", ascending = False)
filtered_movies.head(8)

Unnamed: 0_level_0,title,genres,total_ratings,mean_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
171011,Planet Earth II (2016),Documentary,853.0,4.486518
159817,Planet Earth (2006),Documentary,1384.0,4.458092
318,"Shawshank Redemption, The (1994)",Crime|Drama,97999.0,4.424188
170705,Band of Brothers (2001),Action|Drama|War,984.0,4.399898
174053,Black Mirror: White Christmas (2014),Drama|Horror|Mystery|Sci-Fi|Thriller,1074.0,4.350559
171495,Cosmos,(no genres listed),157.0,4.343949
172591,The Godfather Trilogy: 1972-1990 (1992),(no genres listed),421.0,4.339667
858,"Godfather, The (1972)",Crime|Drama,60904.0,4.332893


### Define algumas funções auxiliares

In [0]:
import numpy as np
import random

# distância euclidiana
def distance(a,b):
  return np.linalg.norm(a - b)
  
# retorna apenas as notas de um user
def user_rating(user):
  r = ratings.query("userId==%d" % user)
  r = r[["movieId", "rating"]].set_index("movieId")
  return r

# retorna a distância euclidiana entre dois users
def user_distance(user1, user2, movies_common = 10):
  r1 = user_rating(user1)
  r2 = user_rating(user2)
  
  # une as notas dos dois usuários ( teremos os filmes em comum )
  # drop em rows com NaN
  d = r1.join(r2, lsuffix="_1", rsuffix="_2").dropna()
  
  # user com poucos filmes não são listados
  if(len(d) < movies_common):
    return None
  else:
    return [user1,user2,distance(d['rating_1'], d['rating_2'])]

# retorna a distância de user_id para number_users_to_analyze users
def distance_from(user_id, number_users_to_analyze = None):
  all_ = ratings['userId'].unique()
  # randomiza
  np.random.shuffle(all_)
  # retira o user_id
  all_ = all_[all_ != user_id]
  # seleciona apenas uma parte [ fica muito lento com todos os dados]
  all_ = all_[:number_users_to_analyze]
  
  distance = []
  for u_id in all_:
    value_distance = user_distance(user_id, u_id)
    if value_distance != None:
      distance.append(value_distance)  
  
  distance = pd.DataFrame(distance, columns = ["me", "outhers", "distance"])
  return distance

# calcula a distância de um user para todos os outros users ( KNN )
def knn(user_id, k_nearest = None):
  distances = distance_from(user_id,number_users_to_analyze = 100)
  distances = distances.sort_values("distance")
  # ignoro o erro - pois o drop alerta quando eu tento eliminar rows com index que nao existem no dataframe
  distances = distances.set_index("outhers").drop(user_id, errors = 'ignore')
  # por padrao retorna todos, porém posso definir um limite
  return distances.head(k_nearest)

#### Pessoa mais similar

In [11]:
# pega o mais similar de um user
user = 5
similar = knn(user).iloc[0].name
similar

126893

#### Notas do mais similar

In [12]:
# busca as notas do similar
ratings_similar = user_rating(similar)

# elimina as notas dos filmes que o user analisado já assistiu ( usa o index poq o movieId é o indice do dataframe)
ratings_user_index = user_rating(user).index
ratings_similar = ratings_similar.drop(ratings_user_index,errors = 'ignore')
ratings_similar.sort_values('rating',ascending = False).head().join(movies)

Unnamed: 0_level_0,rating,title,genres,total_ratings,mean_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,68469.0,3.886649
1060,5.0,Swingers (1996),Comedy|Drama,10834.0,3.822088
1733,5.0,Afterglow (1997),Drama|Romance,570.0,3.27807
1701,5.0,Deconstructing Harry (1997),Comedy|Drama,3368.0,3.468973
1685,5.0,"I Love You, I Love You Not (1996)",Drama|Romance,97.0,2.953608


### Tem o risco de se recomendar um filme de nicho para o user,portanto não iremos basear a recomendação em apenas uma pessoa similar, mas em várias

In [13]:
# pega até dez mais similares
user = 5
sames = knn(user,k_nearest = 10).index
sames

Int64Index([91561, 155873, 94994, 56382, 3413, 196396, 93627, 19940, 216859,
            128118],
           dtype='int64', name='outhers')

In [24]:
# busca as notas dos similares
ratings_sames = ratings.set_index('userId').loc[sames]
ratings_sames = ratings_sames.groupby('movieId').mean()[['rating']]
# conta o numero de vezes cada filme foi avaliada dentro dos k nearest neighbors
apparitions = ratings_sames.groupby('movieId').count()[['rating']]
ratings_sames = ratings_sames.join(apparitions,lsuffix = '_mean_k_nearest',rsuffix = '_total_k_nearest')
# filtra os filmes para obtermos apenas filmes com um numero de avaliacoes dos k nearest neighbors minima
# escolhi o minimo de 5 avaliacoes
# ratings_sames = ratings_sames.query('rating_total_k_nearest >= 5')
# remove os filmes que o user já assistiu
movies_already_watched = user_rating(user).index
ratings_sames = ratings_sames.drop(movies_already_watched , errors='ignore')
ratings_sames.sort_values('rating_mean_k_nearest',ascending = False).head().join(movies)

Unnamed: 0_level_0,rating_mean_k_nearest,rating_total_k_nearest,title,genres,total_ratings,mean_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
920,5.0,1,Gone with the Wind (1939),Drama|Romance|War,18333.0,3.817788
1221,5.0,1,"Godfather: Part II, The (1974)",Crime|Drama,38875.0,4.263035
1227,5.0,1,Once Upon a Time in America (1984),Crime|Drama,5142.0,3.966064
72720,5.0,1,"Single Man, A (2009)",Drama,1558.0,3.794288
7090,5.0,1,Hero (Ying xiong) (2002),Action|Adventure|Drama,8975.0,3.898942


### Cria funcao que recomenda filmes usando o algoritmo KNN ( k nearest neighbors )

In [0]:
def recommend(user_id , k_nearest):
  # busca as notas dos similares
  ratings_sames = ratings.set_index('userId').loc[sames]
  ratings_sames = ratings_sames.groupby('movieId').mean()[['rating']]
  # conta o numero de vezes cada filme foi avaliada dentro dos k nearest neighbors
  apparitions = ratings_sames.groupby('movieId').count()[['rating']]
  ratings_sames = ratings_sames.join(apparitions,lsuffix = '_mean_k_nearest',rsuffix = '_total_k_nearest')
  # filtra os filmes para obtermos apenas filmes com um numero de avaliacoes dos k nearest neighbors minima
  # escolhi o minimo de 5 avaliacoes
  # ratings_sames = ratings_sames.query('rating_total_k_nearest >= 5')
  # remove os filmes que o user já assistiu
  movies_already_watched = user_rating(user).index
  ratings_sames = ratings_sames.drop(movies_already_watched , errors='ignore')
  ratings_sames = ratings_sames.sort_values('rating_mean_k_nearest',ascending = False).head().join(movies)
  # remove os filmes que o user já assistiu

  return ratings_sames


In [26]:
# recomenda filmes para o user 50, avaliando até 40 similares a ele, ao selecionar 100 dos usuario de todo o universo para analisar
recommend(50,k_nearest = 40)

Unnamed: 0_level_0,rating_mean_k_nearest,rating_total_k_nearest,title,genres,total_ratings,mean_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
920,5.0,1,Gone with the Wind (1939),Drama|Romance|War,18333.0,3.817788
1221,5.0,1,"Godfather: Part II, The (1974)",Crime|Drama,38875.0,4.263035
1227,5.0,1,Once Upon a Time in America (1984),Crime|Drama,5142.0,3.966064
72720,5.0,1,"Single Man, A (2009)",Drama,1558.0,3.794288
7090,5.0,1,Hero (Ying xiong) (2002),Action|Adventure|Drama,8975.0,3.898942


#### o proximo passo é filtrar o filmes com baixa votação dentro dos k nearest neighbour
#### a biblioteca suprise implementa o KNN