In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Import datasets
movies = pd.read_csv("movielens/movies.csv")
ratings = pd.read_csv("movielens/ratings.csv")

# Exploring the data

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


## Eliminar datos innecesarios

In [5]:
ratings.drop(columns = "timestamp")

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
5,1,112,3.5
6,1,151,4.0
7,1,223,4.0
8,1,253,4.0
9,1,260,4.0


## Añadir ID al rating

In [6]:
ratings['Id'] = ratings.index

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,Id
0,1,2,3.5,1112486027,0
1,1,29,3.5,1112484676,1
2,1,32,3.5,1112484819,2
3,1,47,3.5,1112484727,3
4,1,50,3.5,1112484580,4


# Model

## Transformar el dataset

Se busca transformar el conjunto de datos en una matriz M x N donde M será la cantidad de películas y N será la cantidad de usuarios. Para ello se usará la función pivot

In [8]:
from scipy.sparse import csr_matrix

#pivot ratings en características de películas
df_movie_features = ratings.pivot(
    index = 'movieId',
    columns = 'userId',
    values = 'rating',
).fillna(0)

#parseo a una matriz scipy

mat_movie_features = csr_matrix(df_movie_features.values)


In [9]:
df_movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,5.0,4.5
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,3.5,0.0,0.0,0.0,0.0


## Construir el modelo

Debido a que la distancia euclidea resultará muy ineficiente en dimensiones tan grandes como la del problema planteado, para medir las distancias, se empleará la **similitud coseno**. 

In [10]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(n_neighbors=20, algorithm='auto').fit(df_movie_features)

# Recomendaciones

In [None]:
distances, indices = model_knn.kneighbors(df_movie_features)

In [None]:
indices