In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dropout, Dense, Concatenate, Lambda, BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers.legacy import SGD, Adam
import keras.backend as K
from keras.metrics import RootMeanSquaredError
import matplotlib.pyplot as plt
from data.data_frame_loader import DataFrameLoader
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from tensorflow.keras.constraints import max_norm



In [5]:
df_personas, df_trabajadores, df_usuarios, df_peliculas, df_scores = DataFrameLoader.load_all("csv_files/personas.csv","csv_files/trabajadores.csv","csv_files/usuarios.csv","csv_files/peliculas.csv","csv_files/scores.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Release Date"] = pd.to_datetime(df["Release Date"], format="%d-%b-%Y")


In [6]:
u_unique = df_scores.user_id.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}  #Reindexa el ID del usuario

m_unique = df_scores.movie_id.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)} #Reindexa el ID de la pelicula
idx2Movie = {v:k for k,v in movie2Idx.items()}

df_peliculas.head()

Unnamed: 0,id,Name,Release Date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),1995-01-01,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1995-01-01,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),1995-01-01,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1995-01-01,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),1995-01-01,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
def join_df(left, right, left_on, right_on=None):
    if right_on is None: right_on = left_on
    return left.merge(right, how='left', left_on=left_on, right_on=right_on,
                      suffixes=("", "_y"))

df_scores = join_df(df_scores, df_peliculas, "movie_id", "id")

In [8]:
df_scores.head()

Unnamed: 0,id,user_id,movie_id,rating,Date,id_y,Name,Release Date,IMDB URL,unknown,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,196,242,3,1997-12-04 15:55:49,242,Kolya (1996),1997-01-24,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,1,186,302,3,1998-04-04 19:22:22,302,L.A. Confidential (1997),1997-01-01,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,...,0,1,0,0,1,0,0,1,0,0
2,2,22,377,1,1997-11-07 07:18:36,377,Heavyweights (1994),1994-01-01,http://us.imdb.com/M/title-exact?Heavyweights%...,0,...,0,0,0,0,0,0,0,0,0,0
3,3,244,51,2,1997-11-27 05:02:03,51,Legends of the Fall (1994),1994-01-01,http://us.imdb.com/M/title-exact?Legends%20of%...,0,...,0,0,0,0,0,1,0,0,1,1
4,4,166,346,1,1998-02-02 05:33:16,346,Jackie Brown (1997),1997-01-01,http://us.imdb.com/M/title-exact?imdb-title-11...,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df_scores.user_id = df_scores.user_id.apply(lambda x: user2Idx[x])
df_scores.movie_id = df_scores.movie_id.apply(lambda x: movie2Idx[x])

In [24]:
df_scores_dev, df_scores_test = train_test_split(df_scores, test_size=0.15, random_state=42)
df_scores_train, df_scores_val = train_test_split(df_scores_dev, test_size=0.17, random_state=42)

In [12]:
#Agregar Unknow

genre = ['Action', 'Adventure', 'Animation',
              "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']

In [25]:

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [26]:
# Definir las dimensiones de los datos
n_users = int(df_scores.user_id.nunique())
n_movies = int(df_scores.movie_id.nunique())
n_users_train = int(df_scores_train.user_id.nunique())
n_movies_train = int(df_scores_train.movie_id.nunique())
n_genre = len(genre)
print(f"Usuarios: {n_users}\nPeliculas: {n_movies} \nUsuarios Train:{n_users_train} \nPeliculas Train: {n_movies_train}")


Usuarios: 943
Peliculas: 1681 
Usuarios Train:943 
Peliculas Train: 1628


In [38]:
# Extract data from DataFrames and convert to Numpy arrays or tensors
user_ids = df_scores_train.user_id.values
movie_ids = df_scores_train.movie_id.values
genre_data = df_scores_train[genre].values
ratings = df_scores_train.rating.values

user_val_ids = df_scores_val.user_id.values
movie_val_ids = df_scores_val.movie_id.values
genre_val_data = df_scores_val[genre].values
ratings_val = df_scores_val.rating.values

user_test_ids = df_scores_test.user_id.values
movie_test_ids = df_scores_test.movie_id.values
genre_test_data = df_scores_test[genre].values
ratings_test = df_scores_test.rating.values

In [28]:
# Define the checkpointer callback
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True, monitor='val_root_mean_squared_error')

# Define the early_stopping callback
early_stopping = EarlyStopping(monitor='val_root_mean_squared_error', patience=10, verbose=1)

# Define the ReduceLROnPlateau callback
reduce_lr = ReduceLROnPlateau(monitor='val_root_mean_squared_error', factor=0.1, patience=4, min_lr=1e-8, verbose=1)

# Define the objective function to be minimized
def objective(params):
    # Extract hyperparameters from the parameter dictionary
    n_latent_factors_movie = int(params['n_latent_factors_movie'])
    n_latent_factors_user = int(params['n_latent_factors_user'])
    activation = params['activation']
    optimizer_name = params['optimizer']
    learning_rate = params['learning_rate']
    num_hidden_layers = int(params['num_hidden_layers'])
    num_neurons_per_layer = int(params['num_neurons_per_layer'])
    num_epochs = int(params['num_epochs'])
    batch_size = int(params['batch_size'])

    # Define the model architecture based on hyperparameters
    genre_input = Input(shape=[n_genre], name='genre')
    movie_input = Input(shape=[1], name='Item')
    user_input = Input(shape=[1], name='User')

    movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer=l2(0.01))(movie_input)
    movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

    user_embedding = Embedding(n_users + 1, n_latent_factors_user, name='User-Embedding', embeddings_regularizer=l2(0.01))(user_input)
    user_vec = Flatten(name='FlattenUsers')(user_embedding)

    concat = Concatenate(name='Concat')([movie_vec, user_vec, genre_input])

    for _ in range(num_hidden_layers):
        x = Dense(num_neurons_per_layer, activation=activation, kernel_regularizer=l2(0.0001), kernel_constraint=max_norm(2))(concat)
        x = Dropout(0.3)(x)

    output = Dense(1, activation='relu', name='Activation')(x)

    model = Model([user_input, movie_input, genre_input], output)

    # Compile the model with the optimizer and loss
    if optimizer_name == 'adam':
        opt = Adam(learning_rate=learning_rate)
    else:
        opt = SGD(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    

    # Train the model using the specified hyperparameters
    history = model.fit(
        [user_ids, movie_ids, genre_data],
        ratings,
        validation_data=([user_val_ids, movie_val_ids, genre_val_data], ratings_val),
        batch_size=batch_size,
        epochs=num_epochs,
        verbose=2,
        callbacks=[early_stopping, checkpointer, reduce_lr]
    )

    # Calculate the validation loss
    val_loss = history.history['val_root_mean_squared_error'][-1]

    return {'loss': val_loss, 'status': STATUS_OK}

# Define the hyperparameter search space
space = {
    'n_latent_factors_movie': hp.quniform('n_latent_factors_movie', 2, 15, 1),
    'n_latent_factors_user': hp.quniform('n_latent_factors_user', 2, 15, 1),
    'activation': hp.choice('activation', ['relu', 'tanh', 'sigmoid']),
    'optimizer': hp.choice('optimizer', ['adam', 'sgd']),
    'learning_rate': hp.loguniform('learning_rate', -4, -1),
    'num_hidden_layers': hp.quniform('num_hidden_layers', 2, 3, 1),
    'num_neurons_per_layer': hp.quniform('num_neurons_per_layer', 30, 200, 10),
    'num_epochs': hp.quniform('num_epochs', 10, 250, 5),
    'batch_size': hp.quniform('batch_size', 32, 128, 5)
}

# Initialize Trials object
tpe_trials = Trials()

# Run Bayesian optimization
best = fmin(fn=objective,
            space=space,
            trials=tpe_trials,
            algo=tpe.suggest,
            max_evals=20)

print("Best hyperparameters:", best)



Epoch 1/235                                           

                                                      
Epoch 1: val_root_mean_squared_error improved from inf to 3.69000, saving model to weights.hdf5

941/941 - 4s - loss: 13.8670 - root_mean_squared_error: 3.7099 - val_loss: 13.6164 - val_root_mean_squared_error: 3.6900 - lr: 0.1433 - 4s/epoch - 4ms/step

Epoch 2/235                                           

  0%|          | 0/20 [00:04<?, ?trial/s, best loss=?]

  saving_api.save_model(



                                                      
Epoch 2: val_root_mean_squared_error did not improve from 3.69000

941/941 - 3s - loss: 13.7574 - root_mean_squared_error: 3.7085 - val_loss: 13.6237 - val_root_mean_squared_error: 3.6900 - lr: 0.1433 - 3s/epoch - 3ms/step

Epoch 3/235                                           

                                                      
Epoch 3: val_root_mean_squared_error did not improve from 3.69000

941/941 - 2s - loss: 13.7572 - root_mean_squared_error: 3.7085 - val_loss: 13.6221 - val_root_mean_squared_error: 3.6900 - lr: 0.1433 - 2s/epoch - 3ms/step

Epoch 4/235                                           

                                                      
Epoch 4: val_root_mean_squared_error did not improve from 3.69000

941/941 - 3s - loss: 13.7575 - root_mean_squared_error: 3.7085 - val_loss: 13.6204 - val_root_mean_squared_error: 3.6900 - lr: 0.1433 - 3s/epoch - 3ms/step

Epoch 5/235                                        

  saving_api.save_model(



                                                                              
Epoch 2: val_root_mean_squared_error improved from 1.09655 to 1.08750, saving model to weights.hdf5

1568/1568 - 3s - loss: 1.4980 - root_mean_squared_error: 1.0925 - val_loss: 1.4187 - val_root_mean_squared_error: 1.0875 - lr: 0.0396 - 3s/epoch - 2ms/step

Epoch 3/140                                                                   

                                                                              
Epoch 3: val_root_mean_squared_error improved from 1.08750 to 1.08692, saving model to weights.hdf5

1568/1568 - 3s - loss: 1.5117 - root_mean_squared_error: 1.0903 - val_loss: 1.4536 - val_root_mean_squared_error: 1.0869 - lr: 0.0396 - 3s/epoch - 2ms/step

Epoch 4/140                                                                   

                                                                              
Epoch 4: val_root_mean_squared_error did not improve from 1.08692

1568/1568 - 4s - los

  saving_api.save_model(



                                                                                 
Epoch 4: val_root_mean_squared_error did not improve from 0.97221

672/672 - 3s - loss: 1.0120 - root_mean_squared_error: 0.9726 - val_loss: 1.0256 - val_root_mean_squared_error: 0.9803 - lr: 0.0707 - 3s/epoch - 4ms/step

Epoch 5/70                                                                       

                                                                                 
Epoch 5: val_root_mean_squared_error did not improve from 0.97221

672/672 - 2s - loss: 1.0059 - root_mean_squared_error: 0.9698 - val_loss: 1.0340 - val_root_mean_squared_error: 0.9846 - lr: 0.0707 - 2s/epoch - 3ms/step

Epoch 6/70                                                                       

                                                                                 
Epoch 6: val_root_mean_squared_error improved from 0.97221 to 0.97149, saving model to weights.hdf5

672/672 - 2s - loss: 1.0026 - root_mean_squ

  saving_api.save_model(



                                                                                 
Epoch 28: val_root_mean_squared_error improved from 0.93931 to 0.93801, saving model to weights.hdf5

706/706 - 3s - loss: 0.8905 - root_mean_squared_error: 0.9191 - val_loss: 0.9252 - val_root_mean_squared_error: 0.9380 - lr: 0.0017 - 3s/epoch - 4ms/step

Epoch 29/190                                                                     

                                                                                 
Epoch 29: val_root_mean_squared_error improved from 0.93801 to 0.93734, saving model to weights.hdf5

706/706 - 2s - loss: 0.8882 - root_mean_squared_error: 0.9183 - val_loss: 0.9233 - val_root_mean_squared_error: 0.9373 - lr: 0.0017 - 2s/epoch - 3ms/step

Epoch 30/190                                                                     

                                                                                 
Epoch 30: val_root_mean_squared_error improved from 0.93734 to 0.93733, sa

  saving_api.save_model(



                                                                                  
Epoch 42: val_root_mean_squared_error improved from 0.93640 to 0.93624, saving model to weights.hdf5

830/830 - 2s - loss: 0.8840 - root_mean_squared_error: 0.9142 - val_loss: 0.9244 - val_root_mean_squared_error: 0.9362 - lr: 0.0021 - 2s/epoch - 2ms/step

Epoch 43/150                                                                      

                                                                                  
Epoch 43: val_root_mean_squared_error improved from 0.93624 to 0.93545, saving model to weights.hdf5

830/830 - 2s - loss: 0.8862 - root_mean_squared_error: 0.9159 - val_loss: 0.9219 - val_root_mean_squared_error: 0.9354 - lr: 0.0021 - 2s/epoch - 3ms/step

Epoch 44/150                                                                      

                                                                                  
Epoch 44: val_root_mean_squared_error did not improve from 0.93545

8


Best hyperparameters: {'activation': 0, 'batch_size': 95.0, 'learning_rate': 0.02380991042060511, 'n_latent_factors_movie': 3.0, 'n_latent_factors_user': 2.0, 'num_epochs': 45.0, 'num_hidden_layers': 1.0, 'num_neurons_per_layer': 140.0, 'optimizer': 0}


In [30]:
def create_model_with_hyperparameters(n_latent_factors_movie, n_latent_factors_user, activation, optimizer, learning_rate, num_hidden_layers, num_neurons_per_layer):
    genre_input = Input(shape=[n_genre], name='genre')
    movie_input = Input(shape=[1], name='Item')
    user_input = Input(shape=[1], name='User')

    movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer=l2(0.01))(movie_input)
    movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

    user_embedding = Embedding(n_users + 1, n_latent_factors_user, name='User-Embedding', embeddings_regularizer=l2(0.01))(user_input)
    user_vec = Flatten(name='FlattenUsers')(user_embedding)

    concat = Concatenate(name='Concat')([movie_vec, user_vec, genre_input])

    for _ in range(num_hidden_layers):
        x = Dense(num_neurons_per_layer, activation=activation, kernel_regularizer=l2(0.0001), kernel_constraint=max_norm(2))(concat)
        x = Dropout(0.3)(x)

    output = Dense(1, activation='relu', name='Activation')(x)

    model = Model([user_input, movie_input, genre_input], output)

    # Compile the model with the optimizer and loss
    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate)
    else:
        opt = SGD(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='mean_squared_error', metrics=[RootMeanSquaredError()])

    return model

In [42]:
# Extract the best hyperparameters from the `best` dictionary
best_n_latent_factors_movie = int(best['n_latent_factors_movie'])
best_n_latent_factors_user = int(best['n_latent_factors_user'])
best_activation = ['relu', 'tanh', 'sigmoid'][best['activation']]
best_optimizer = ['adam', 'sgd'][best['optimizer']]
best_learning_rate = best['learning_rate']
best_num_hidden_layers = int(best['num_hidden_layers'])
best_num_neurons_per_layer = int(best['num_neurons_per_layer'])
best_num_epochs = int(best['num_epochs'])
best_batch_size = int(best['batch_size'])

# Define the model using the best hyperparameters
best_model = create_model_with_hyperparameters(
    best_n_latent_factors_movie,
    best_n_latent_factors_user,
    best_activation,
    best_optimizer,
    best_learning_rate,
    best_num_hidden_layers,
    best_num_neurons_per_layer
)

# Train the model using the best hyperparameters and your training data
history = best_model.fit(
        [user_ids, movie_ids, genre_data],
        ratings,
        validation_data=([user_val_ids, movie_val_ids, genre_val_data], ratings_val),
        batch_size=best_batch_size,
        epochs=best_num_epochs,
        verbose=2,
        callbacks=[early_stopping, checkpointer, reduce_lr]
    )



Epoch 1/150

Epoch 1: val_root_mean_squared_error did not improve from 0.93532
830/830 - 4s - loss: 1.5919 - root_mean_squared_error: 1.1879 - val_loss: 1.3428 - val_root_mean_squared_error: 1.1025 - lr: 0.0213 - 4s/epoch - 4ms/step
Epoch 2/150

Epoch 2: val_root_mean_squared_error did not improve from 0.93532
830/830 - 3s - loss: 1.3192 - root_mean_squared_error: 1.1061 - val_loss: 1.2558 - val_root_mean_squared_error: 1.0880 - lr: 0.0213 - 3s/epoch - 4ms/step
Epoch 3/150

Epoch 3: val_root_mean_squared_error did not improve from 0.93532
830/830 - 2s - loss: 1.2269 - root_mean_squared_error: 1.0798 - val_loss: 1.1581 - val_root_mean_squared_error: 1.0504 - lr: 0.0213 - 2s/epoch - 3ms/step
Epoch 4/150

Epoch 4: val_root_mean_squared_error did not improve from 0.93532
830/830 - 2s - loss: 1.1223 - root_mean_squared_error: 1.0326 - val_loss: 1.0628 - val_root_mean_squared_error: 1.0023 - lr: 0.0213 - 2s/epoch - 3ms/step
Epoch 5/150

Epoch 5: val_root_mean_squared_error did not improve fr

In [43]:
# Evaluate the model's performance on your test dataset
test_loss = best_model.evaluate([user_test_ids, movie_test_ids, genre_test_data], ratings_test, verbose=0)
print("Test loss:", test_loss)

Test loss: [0.9334442019462585, 0.9450812935829163]


In [48]:
best_model.load_weights('weights.hdf5')
# Evaluate the model's performance on your test dataset
test_loss = best_model.evaluate([user_test_ids, movie_test_ids, genre_test_data], ratings_test, verbose=0)
print("Test loss:", test_loss)

Test loss: [0.9185221791267395, 0.9352539777755737]


# Embeddings

In [49]:
movie_embeddings_layer = best_model.layers[2]
user_embeddings_layer = best_model.layers[3]

In [50]:
movie_embeddings_layer.name, user_embeddings_layer.name

('Movie-Embedding', 'User-Embedding')

In [51]:
movie_embeddings_matrix = movie_embeddings_layer.get_weights()[0]
user_embeddings_matrix = user_embeddings_layer.get_weights()[0]
movie_embeddings_matrix[0], user_embeddings_matrix.shape

(array([ 6.9582646e-09, -4.7056737e-08, -2.7332101e-08, -2.4231946e-08,
         8.3327363e-09,  2.8111511e-08,  4.9143583e-08,  4.6590085e-08,
        -4.6067690e-08], dtype=float32),
 (944, 15))

In [57]:
np.save('best_model/movie_embeddings_matrix.npy', movie_embeddings_matrix)
np.save('best_model/user_embeddings_matrix.npy', user_embeddings_matrix)
np.save('best_model/user2Idx.npy', user2Idx)
np.save('best_model/movie2Idx.npy', movie2Idx)
best_model.save('best_model/model.keras')
