In [1]:
%load_ext autoreload
%autoreload 2

# En esta notebook se analizan los embeddings y las distancias

In [2]:
import pandas as pd

In [3]:
header = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv('./ml-100k/u.data', sep='\t', names=header)

u_unique = ratings.userId.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}

m_unique = ratings.movieId.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)}

In [4]:
ratings.userId = ratings.userId.apply(lambda x: user2Idx[x])
ratings.movieId = ratings.movieId.apply(lambda x: movie2Idx[x])

In [5]:
n_split = 20000
ratings_train = ratings[n_split:]
ratings_val = ratings[:n_split]
len(ratings_train), len(ratings_val)

(80000, 20000)

In [6]:
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())
n_users_train = int(ratings_train.userId.nunique())
n_movies_train = int(ratings_train.movieId.nunique())
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1650


In [7]:
max_rating = ratings_train['rating'].max()
min_rating = ratings_train['rating'].min()
av_rating = ratings_train['rating'].mean()
max_rating, min_rating, av_rating

(5, 1, 3.52835)

# Definición RED

In [15]:
from keras.layers import Input, Embedding, Flatten, Dropout, Concatenate, Dense, Activation, Lambda
from keras import Model
from keras.optimizers import Adam
from keras.regularizers import l2

In [12]:
# Diferencia: las dimensiones de los Latent factors pueden ser distintos
n_latent_factors_user = 5
n_latent_factors_movie = 8

In [13]:
genre = ['Action', 'Adventure', 'Animation',
              "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']

In [23]:
# genre_input = Input(shape=[len(genre)],name='genre')
# timestamp_input = Input(shape=[1],name='timestamp')
movie_input = Input(shape=[1],name='Item')
movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer = l2(0.001))(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)
# movie_vec = Dropout(0.2)(movie_vec)


user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding', embeddings_regularizer = l2(0.001))(user_input))
# user_vec = Dropout(0.2)(user_vec)


concat = Concatenate(name='Concat')([movie_vec, user_vec, 
                                    #  timestamp_input, genre_input
                                     ])
# concat = Dropout(0.2)(concat)

x = Dense(50,name='FullyConnected-1', activation='relu', kernel_regularizer=l2(0.001))(concat)
#x = Dropout(0.5)(x)
#x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
#x = Dropout(0.5)(x)


## Se pueden sacar las siguientes dos lineas para no forzar a sigmoidea
x = Dense(1, activation='sigmoid',name='Activation')(x)
x = Lambda(lambda z: (max_rating - min_rating) * z + min_rating)(x)
##

model = Model([
    user_input, movie_input,
    # timestamp_input, genre_input
], x)
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Item (InputLayer)           [(None, 1)]                  0         []                            
                                                                                                  
 User (InputLayer)           [(None, 1)]                  0         []                            
                                                                                                  
 Movie-Embedding (Embedding  (None, 1, 8)                 13464     ['Item[0][0]']                
 )                                                                                                
                                                                                                  
 User-Embedding (Embedding)  (None, 1, 5)                 4720      ['User[0][0]']          

In [24]:
import keras.backend as K 
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [25]:
model.load_weights('weights.hdf5')

ValueError: Cannot assign value to variable ' FullyConnected-1/kernel:0': Shape mismatch.The variable shape (13, 50), and the assigned value shape (14, 50) are incompatible.

In [19]:
model.compile('adam', loss= 'mean_squared_error', metrics=[root_mean_squared_error])

In [20]:
model.evaluate([ratings_val.userId, ratings_val.movieId], ratings_val.rating)

ValueError: in user code:

    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/engine/training.py", line 1972, in test_function  *
        return step_function(self, iterator)
    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/engine/training.py", line 1956, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/engine/training.py", line 1944, in run_step  **
        outputs = model.test_step(data)
    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/engine/training.py", line 1850, in test_step
        y_pred = self(x, training=False)
    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/julian/opt/anaconda3/envs/sistemas-de-recomendacion/lib/python3.9/site-packages/keras/src/engine/input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_1" expects 4 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(32, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:1' shape=(32, 1) dtype=int64>]


In [14]:
model.evaluate([ratings_train.userId, ratings_train.movieId], ratings_train.rating)



[0.7168575762152671, 0.8384677067518235]

# Obtengo embeddings

In [15]:
movie_embeddings_layer = model.layers[2]
user_embeddings_layer = model.layers[3]

In [16]:
movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

In [17]:
movie_embeddings_matrix = movie_embeddings_layer.get_weights()[0]
user_embeddings_matrix = user_embeddings_layer.get_weights()[0]
movie_embeddings_matrix.shape, user_embeddings_matrix.shape

((1683, 8), (944, 5))

In [18]:
user_embeddings_matrix

array([[-0.00264374,  0.01769016, -0.00087199,  0.03644114, -0.00256097],
       [ 0.02881821,  0.02293316, -0.12388905, -0.035484  ,  0.04630387],
       [-0.02438038,  0.16275951,  0.02213055,  0.00948885,  0.04875376],
       ...,
       [ 0.1143712 ,  0.03287313, -0.05181557,  0.10909854,  0.05112425],
       [ 0.02281246,  0.04480752, -0.04627338,  0.0833808 ,  0.0855655 ],
       [-0.10972215, -0.05335915,  0.06947697, -0.18492909, -0.02650853]],
      dtype=float32)

In [19]:
movie_embeddings_matrix[1:3]

array([[-0.06243914, -0.01299466, -0.0363231 ,  0.01600152, -0.04616982,
         0.05115818,  0.06778049,  0.02213566],
       [-0.05221412, -0.07284259, -0.05342701,  0.0280623 , -0.06453912,
         0.0606089 ,  0.01014005,  0.02806689]], dtype=float32)

# Los puedo obtener definiendo un modelo nuevo

In [20]:
model_test_emb = Model([movie_input], [movie_embedding])

In [21]:
model_test_emb.predict([2])

array([[[-0.05221412, -0.07284259, -0.05342701,  0.0280623 ,
         -0.06453912,  0.0606089 ,  0.01014005,  0.02806689]]],
      dtype=float32)

# Nearest Neighbors 

In [22]:
from sklearn.neighbors import NearestNeighbors

In [23]:
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine').fit(movie_embeddings_matrix)

In [24]:
neighbors = nbrs.kneighbors([movie_embeddings_matrix[2]])

In [25]:
neighbors

(array([[0.        , 0.00565982, 0.01575363, 0.0197643 , 0.02283114,
         0.02301013, 0.02462351, 0.02732092, 0.03116977, 0.03117776]],
       dtype=float32),
 array([[   2,   32,  755,  781,   67, 1668,  131,  218,  186,  301]]))

In [26]:
movie_embeddings_matrix[688]

array([-0.03038532, -0.02277231, -0.03699017,  0.00243417, -0.03566373,
        0.03891908,  0.03387292, -0.00154995], dtype=float32)

# Ordenar por ratings de peliculas no vistas

In [27]:
# Todas las peliculas que el usuario 1 califico

ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,3,881250949
940,1,529,4,881251863
1133,1,378,4,881251728
1812,1,523,3,881251274
1896,1,432,5,881251793
2374,1,835,5,881252017
6910,1,381,4,881251021
7517,1,330,4,881251820
7842,1,551,5,881251911
10017,1,84,4,881251793


In [28]:
import numpy as np

In [29]:
all_movie_idxs = np.linspace(1, ratings.movieId.max(), n_movies, dtype=int)
print(all_movie_idxs)
user_idxs = np.zeros(n_movies, dtype=int) + 1
print(user_idxs)

[   1    2    3 ... 1680 1681 1682]
[1 1 1 ... 1 1 1]


In [30]:
predictions = model.predict([user_idxs,all_movie_idxs])

In [31]:
predictions.max(), predictions.min()

(4.2572484, 1.2087194)

In [32]:
predictions

array([[3.738341 ],
       [3.844197 ],
       [2.0892625],
       ...,
       [3.3779411],
       [3.2679176],
       [3.4160886]], dtype=float32)

In [33]:
np.argsort(predictions[:,0])[::-1]

array([ 180,  277,  200, ..., 1429,  634,  850])

In [34]:
predictions[1436]

array([3.6547675], dtype=float32)