# Imports usados nesse notebook

In [1]:
import numpy as np
from numpy import ma
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import keras
from api.core.v1.recommendation.ml.utils import generate_subset_movies_not_rated, load_data, gen_user_vecs, sq_dist, get_pred_movies, genres

# Carregar os dados

In [2]:

item_set, user_set, y_set, movies_dict, df_movie_rating, df_movie_rating_user_avg = load_data()


INFO:api.core.v1.recommendation.ml.utils:loading and transforming data...
INFO:api.core.v1.recommendation.ml.utils:done loading data.


# Transform dos dados, parametrização e treino do modelo

In [3]:
# configs

num_user_features = user_set.shape[1] - 3  # remover userid, rating count and ave rating durante o treino
num_item_features = item_set.shape[1] - 1  # remover movie id durante o treino

# indices auxiliares
user_vector_start = 3
item_vector_start = 3
user_columns_start = 3
item_columns_start = 1

In [4]:
# transformar os dados

scalerItem = StandardScaler()
scalerItem.fit(item_set)
item_set = scalerItem.transform(item_set)

scalerUser = StandardScaler()
scalerUser.fit(user_set)
user_set = scalerUser.transform(user_set)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_set.reshape(-1, 1))
y_set = scalerTarget.transform(y_set.reshape(-1, 1))


In [5]:
# separar em treino e teste

item_train, item_test = train_test_split(item_set, train_size=0.70, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_set, train_size=0.70, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_set, train_size=0.70, shuffle=True, random_state=1)

In [6]:
# definicao das NNs

# criar os inputs
input_user = keras.layers.Input(shape=(num_user_features,))
input_item = keras.layers.Input(shape=(num_item_features,))


user_NN = keras.models.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(32, activation='linear'),
])

item_NN = keras.models.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(32, activation='linear'),
])


# direcionar para as NNs
vu = user_NN(input_user)
vu = keras.layers.LayerNormalization(axis=1)(vu)

vm = item_NN(input_item)
vm = keras.layers.LayerNormalization(axis=1)(vm)

output = keras.layers.Dot(axes=1)([vu, vm])

# especificar o input e o output do modelo
model = keras.Model([input_user, input_item], output)

# hiperparametros
cost_fn = keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.1)

model.compile(optimizer=opt,
            loss=cost_fn)




In [8]:
model.summary()

In [7]:
model.fit([user_train[:, user_columns_start:], item_train[:, item_columns_start:]], y_train, epochs=30)

Epoch 1/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 30.1212
Epoch 2/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5111
Epoch 3/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5281
Epoch 4/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4955
Epoch 5/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4928
Epoch 6/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5161
Epoch 7/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5052
Epoch 8/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4930
Epoch 9/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5052
Epoch 10/30
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - l

<keras.src.callbacks.history.History at 0x216cd413460>

In [9]:
model.evaluate([user_test[:, user_columns_start:], item_test[:, item_columns_start:]], y_test)

[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 692us/step - loss: 0.4997


0.5144850611686707

# Predict de um item baseado em um novo user

In [8]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

new_user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

In [11]:
item_vecs = generate_subset_movies_not_rated(new_user_id, df_movie_rating, df_movie_rating_user_avg, min_imdb_rating=3.0)

# gerar o vetor de users para ser do mesmo tamanho do item_vecs
user_vecs = gen_user_vecs(new_user_vec, len(item_vecs))

# transformar os valores
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# predição
y_p = model.predict([suser_vecs[:, user_columns_start:], sitem_vecs[:, item_columns_start:]])
y_pu = scalerTarget.inverse_transform(y_p)


sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]


df_new_user = get_pred_movies(sorted_ypu, sorted_items, movies_dict, maxcount = 50)
df_new_user.head()

[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 822us/step


Unnamed: 0,y_p,movie id,rating ave,title,genres
0,4.4,3471,2.4,Close Encounters of the Third Kind (1977),Adventure|Drama|Sci-Fi
1,4.4,1306,2.5,Until the End of the World (Bis ans Ende der W...,Adventure|Drama|Sci-Fi
2,4.4,4370,2.5,A.I. Artificial Intelligence (2001),Adventure|Drama|Sci-Fi
3,4.4,924,2.3,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
4,4.3,8591,2.2,"Philadelphia Experiment, The (1984)",Adventure|Drama|Sci-Fi


In [16]:
df_new_user[["title", "genres"]].to_dict()

{'title': {0: 'Close Encounters of the Third Kind (1977)',
  1: 'Until the End of the World (Bis ans Ende der Welt) (1991)',
  2: 'A.I. Artificial Intelligence (2001)',
  3: '2001: A Space Odyssey (1968)',
  4: 'Philadelphia Experiment, The (1984)',
  5: 'Star Trek: Generations (1994)',
  6: 'WarGames (1983)',
  7: 'Millennium (1989)',
  8: "Man Escaped, A (Un  condamné à mort s'est échappé ou Le vent souffle où il veut) (1956)",
  9: 'Lawrence of Arabia (1962)',
  10: 'Fountain, The (2006)',
  11: 'Almost Heroes (1998)',
  12: 'Beautiful Creatures (2013)',
  13: 'Walkabout (1971)',
  14: 'Nothing But Trouble (1991)',
  15: 'Like Water for Chocolate (Como agua para chocolate) (1992)',
  16: 'Gattaca (1997)',
  17: 'Beastly (2011)',
  18: 'About Time (2013)',
  19: 'Captains Courageous (1937)',
  20: 'Dersu Uzala (1975)',
  21: 'Double Life of Veronique, The (Double Vie de Véronique, La) (1991)',
  22: 'Let the Right One In (Låt den rätte komma in) (2008)',
  23: 'Big Fish (2003)',
  24

# Predict de um item baseado em um user existente

In [40]:
user_id = 52
# agrupa os filmes avaliados com os generos
df_movie_rating_user_avg_genres = df_movie_rating.groupby(["rater_id", "title"] + genres)["rating"].mean()
df_movie_rating_user_avg_genres = df_movie_rating_user_avg_genres.loc[user_id].reset_index()
rating_count = df_movie_rating_user_avg_genres.value_counts().sum()
rating_ave = df_movie_rating_user_avg_genres["rating"].mean()
user_vec = [user_id, rating_count, rating_ave] + [df_movie_rating_user_avg_genres[df_movie_rating_user_avg_genres[genre] == 1]["rating"].mean() for genre in genres]

item_vecs = generate_subset_movies_not_rated(user_id, df_movie_rating, df_movie_rating_user_avg, min_imdb_rating=3.0)

# gerar o vetor de users para ser do mesmo tamanho do item_vecs
user_vecs = gen_user_vecs(user_vec, len(item_vecs))

# transformar os valores
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# predição
y_p = model.predict([suser_vecs[:, user_columns_start:], sitem_vecs[:, item_columns_start:]])
y_pu = scalerTarget.inverse_transform(y_p)


sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]


df = get_pred_movies(sorted_ypu, sorted_items, movies_dict, maxcount = 50)
df

[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


Unnamed: 0,y_p,movie id,rating ave,title,genres
0,4.1,97938,2.3,Life of Pi (2012),Adventure|Drama|IMAX
1,4.1,80839,2.3,Secretariat (2010),Adventure|Drama
2,4.1,105197,2.4,Nebraska (2013),Adventure|Drama
3,4.1,1615,2.3,"Edge, The (1997)",Adventure|Drama
4,4.1,155,2.3,Beyond Rangoon (1995),Adventure|Drama|War
5,4.1,35015,2.2,Duma (2005),Adventure|Drama
6,4.1,1027,2.4,Robin Hood: Prince of Thieves (1991),Adventure|Drama
7,4.1,5839,2.5,My Father's Glory (La gloire de mon père) (1990),Adventure|Drama
8,4.1,4534,2.4,Return to Snowy River (a.k.a. The Man From Sno...,Adventure|Drama|Western
9,4.1,110127,2.6,Noah (2014),Adventure|Drama|IMAX


# Predict de um item baseado em outro item

In [41]:

# gerar o feature vector do subset des filmes
input_item_m = keras.layers.Input(shape=(num_item_features,))
vm_m = item_NN(input_item_m)
vm_m = keras.layers.LayerNormalization(axis=1)(vm_m)
model_m = keras.Model(input_item_m, vm_m)

scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,item_columns_start:])

count = 50  # quantidade de filmes para mostrar
dim = len(vms)
dist = np.zeros((dim,dim))

# calcular proximidade entre os feature vectors usando square distance
for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])

m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(np.ceil(abs(item_train[i,0])))
    movie2_id = int(np.ceil(abs(item_train[min_idx,0])))
    disp.append( [movies_dict[movie1_id]['title'], movies_dict[movie1_id]['genres'],
                  movies_dict[movie2_id]['title'], movies_dict[movie1_id]['genres']]
               )
print(disp)

[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 726us/step
[['movie1', 'genres', 'movie2', 'genres'], ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'], ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'], ['Grumpier Old Men (1995)', 'Comedy|Romance', 'Toy Story (1995)', 'Comedy|Romance'], ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'], ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'], ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy', 'Jumanji (1995)', 'Adventure|Animation|Children|Comedy|Fantasy'], ['Jumanji (1995)', 'Adventure|Children|Fantasy', 'Toy Story (1995)', 'Adventure|Children|Fantasy'], ['Jumanji (1995)'