In [74]:
import urllib.request

import math
import numpy as np

from keras.models import Model
from keras.layers import Embedding, Flatten, Input, Dense, Concatenate, Dot

import pandas as pd
from sklearn.model_selection import train_test_split
import random

def process_dataframe_according_to_item_and_user(df, n_items, n_users):
    # Convertir a tipos de datos adecuados
    df['rating'] = df['rating'].astype(int)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

    # Filtrar por ítems que han sido valorados al menos n_items veces
    item_counts = df['item'].value_counts()
    valid_items = item_counts[item_counts > n_items].index
    df_filtered = df[df.item.isin(valid_items)]

    # Filtrar por usuarios que han comprado al menos n_users veces algún ítem válido
    user_counts = df_filtered['user'].value_counts()
    valid_users = user_counts[user_counts > n_users].index
    df_filtered = df_filtered[df_filtered.user.isin(valid_users)]

    # Ordenar por timestamp
    df_filtered.sort_values(by='timestamp', inplace=True)

    return df_filtered

df_sports = pd.read_csv('./DATOS/datasets_clase/All_Beauty.csv', names = ['item','user','rating','timestamp'])
df_sports = process_dataframe_according_to_item_and_user(df_sports, 5, 5)
df_sports_dict = df_sports.to_dict(orient='list')
parser_sports_usuarios = {user: i for i, user in enumerate(df_sports['user'].unique())}
parser_sports_items = {item: i for i, item in enumerate(df_sports['item'].unique())}
len(parser_sports_items), len(parser_sports_usuarios)



(658, 355)

In [75]:
"""
    Hacemos la comparacion con NCF solo y los dos modelos combinados, se combina el modelo ya que usamos la entrada del modelo recomendador para predecir solamente aquellos que hayamos recomendado
    Es interesante para saber que tan buenas han sido las recomendaciones que hemos ofrecido al usuario y con ello logramos sacar las métricas necesarias para esta práctica
"""

'\n    Hacemos la comparacion con NCF solo y los dos modelos combinados, se combina el modelo ya que usamos la entrada del modelo recomendador para predecir solamente aquellos que hayamos recomendado\n    Es interesante para saber que tan buenas han sido las recomendaciones que hemos ofrecido al usuario y con ello logramos sacar las métricas necesarias para esta práctica\n'

In [76]:
df_sports['user'], user_codes = pd.factorize(df_sports['user'])
df_sports['item'], item_codes = pd.factorize(df_sports['item'])

NUM_ITEMS = len(item_codes)
NUM_USERS = len(user_codes)

ratings_todos = [[None for _ in range(NUM_ITEMS)] for _ in range(NUM_USERS)]


for i, u, rating in df_sports[['item', 'user', 'rating']].itertuples(index=False):
    ratings_todos[u][i] = rating

In [77]:

df_todos = pd.DataFrame(ratings_todos)

# Reordenamos las columnas de forma aleatoria
df_todos = df_todos.sample(frac=1, axis=1, random_state=42)


# Partimos el dataset para lograr que ratings y test_ratings tengan el mismo tamaño
# En lo unico en que difieren es en la asignación de None, en entrenameinto la parte de test está asignado con None y en test justamente lo contrario
train_data, test_data = train_test_split(df_todos, test_size=0.5)

test_data_copy = pd.concat([train_data.copy(), test_data.copy()])
test_data_copy[:] = None

num_users_for_test = len(test_data)
train_data = pd.concat([train_data, test_data.copy()])


train_data.iloc[num_users_for_test, NUM_ITEMS//2:] = None
try:
    train_data.iloc[num_users_for_test:, :NUM_ITEMS//2] = test_data.iloc[:, :NUM_ITEMS//2]
except:
    train_data.iloc[num_users_for_test-1:, :NUM_ITEMS//2] = test_data.iloc[:, :NUM_ITEMS//2]

test_data_copy.iloc[:, NUM_ITEMS//2:] = test_data.iloc[:, NUM_ITEMS//2:]


# Convertir los DataFrames en listas de listas con valores NaN reemplazados por None
ratings = train_data.where(pd.notna(train_data), None).values.tolist()
test_ratings = test_data_copy.where(pd.notna(test_data_copy), None).values.tolist()

# Teniamos valores nan y none mezclados y para mantenerlo en el mismo formato nos decantamos con nones
ratings = [[valor if valor in (1, 2, 3, 4, 5) else None for valor in lista] for lista in ratings]
test_ratings = [[valor if valor in (1, 2, 3, 4, 5) else None for valor in lista] for lista in test_ratings]

In [78]:
X_train = [np.array([], dtype=int), np.array([], dtype=int)]
y_train = np.array([], dtype=int)

for u in range(len(ratings)):
  for i in range(len(ratings[u])):
    if ratings[u][i] != None:
        X_train[0] = np.append(X_train[0], int(u))
        X_train[1] = np.append(X_train[1], int(i))
        y_train = np.append(y_train, int(ratings[u][i]))

In [79]:
X_test = [np.array([], dtype=int), np.array([], dtype=int)]
y_test = np.array([], dtype=int)

for u in range(len(test_ratings)):
  for i in range(len(test_ratings[u])):
    if test_ratings[u][i] != None:
        X_test[0] = np.append(X_test[0], int(u))
        X_test[1] = np.append(X_test[1], int(i))
        y_test = np.append(y_test, int(test_ratings[u][i]))

In [80]:
pred_list = np.loadtxt('pred_list.out') # PROVIENE DE LAS RECOMENDACIONES HECHAS DEL MODELO DLFS-Rec
lista =  pred_list.argsort().argsort()[:, :20]

In [81]:
pivot_table = df_sports.pivot_table(index='user', columns='item', values='rating')

In [82]:
latent_dim = 5
epochs = 50

In [83]:
user_input = Input(shape=[1])
user_embedding = Embedding(NUM_USERS, latent_dim)(user_input)
user_vec = Flatten()(user_embedding)

item_input = Input(shape=[1])
item_embedding = Embedding(NUM_ITEMS, latent_dim)(item_input)
item_vec = Flatten()(item_embedding)

concat = Concatenate(axis=1)([user_vec, item_vec])
d1 = Dense(20, activation='relu')(concat)
d2 = Dense(10, activation='relu')(d1)
output = Dense(1, activation='relu')(d2)

MLP = Model([user_input, item_input], output)

In [84]:
from keras.utils import to_categorical

# Convertir etiquetas a one-hot encoding
y_train_one_hot = to_categorical(y_train, num_classes=6) -1

In [85]:
MLP.compile(optimizer='adam', metrics=['mae'], loss='mean_squared_error')
MLP.summary()
MLP.fit(X_train, y_train, epochs=epochs, verbose=1)

Epoch 1/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 807us/step - loss: 20.9226 - mae: 4.4918 
Epoch 2/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 704us/step - loss: 17.1283 - mae: 4.0425
Epoch 3/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 581us/step - loss: 6.9380 - mae: 2.4422
Epoch 4/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 557us/step - loss: 1.3371 - mae: 0.9477
Epoch 5/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step - loss: 0.6524 - mae: 0.6196
Epoch 6/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 547us/step - loss: 0.4568 - mae: 0.4700
Epoch 7/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 526us/step - loss: 0.3798 - mae: 0.4054
Epoch 8/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 546us/step - loss: 0.2931 - mae: 0.3393
Epoch 9/50
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56

<keras.src.callbacks.history.History at 0x7fc4dd1efad0>

In [86]:
y_pred = MLP.predict(X_test)
y_pred = np.round(y_pred).astype(int)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [87]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score, f1_score, ndcg_score

# Métricas de clasificación
print("Precision Score:", precision_score(y_test, y_pred.round(), average='weighted'))
print("Recall Score:", recall_score(y_test, y_pred.round(), average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred.round(), average='weighted'))


Precision Score: 0.9246242345025533
Recall Score: 0.9087048832271762
F1 Score: 0.9136383896217254


  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
# Calcular NDCG para cada clase y promediar los resultados
y_pred_classes = np.argmax(y_pred, axis=1)
# Calcular NDCG para cada clase y promediar los resultados
ndcg_scores = []
for class_label in range(6):
    y_test_class = (y_test == class_label).astype(int)
    y_pred_class = (y_pred_classes == class_label).astype(int)
    ndcg_class = ndcg_score([y_test_class], [y_pred_class])
    ndcg_scores.append(ndcg_class)

# Calcular el promedio de los NDCG scores para todas las clases
average_ndcg = np.mean(ndcg_scores)

print("Average NDCG Score:", average_ndcg)

Average NDCG Score: 0.4497545149631164


In [89]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score, f1_score, ndcg_score
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))


0.09766454352441614
0.11040339702760085


In [90]:
X_train = [np.array([], dtype=int), np.array([], dtype=int)]
y_train = np.array([], dtype=int)
X_test = [np.array([], dtype=int), np.array([], dtype=int)]
y_test = np.array([], dtype=int)

for index, row in pivot_table.iterrows():
    for indice, columna in enumerate(row):
        if columna in (1.0, 2.0, 3.0, 4.0, 5.0):
            X_train[0] = np.append(X_train[0], index)
            X_train[1] = np.append(X_train[1], indice)
            y_train = np.append(y_train, columna)

In [91]:


for u in range(len(lista)):
  for i in range(len(lista[u])):
    if pivot_table.iloc[u%NUM_USERS, i%NUM_ITEMS] in (1.0, 2.0, 3.0, 4.0, 5.0):
            # Verificar si el dato de prueba ya está en los datos de entrenamiento
            if (u % NUM_USERS in X_train[0]) and (i % NUM_ITEMS in X_train[1]):
                # Si ya está en los datos de entrenamiento, lo eliminamos de allí
                idx_train = (X_train[0] == u % NUM_USERS) & (X_train[1] == i % NUM_ITEMS)
                X_train[0] = np.delete(X_train[0], np.where(idx_train))
                X_train[1] = np.delete(X_train[1], np.where(idx_train))
                y_train = np.delete(y_train, np.where(idx_train))
            # Agregamos los datos de prueba
            X_test[0] = np.append(X_test[0], int(u % NUM_USERS))
            X_test[1] = np.append(X_test[1], int(i % NUM_ITEMS))
            y_test = np.append(y_test, int(pivot_table.iloc[u % NUM_USERS, i % NUM_ITEMS]))


In [92]:
latent_dim = 5
epochs = 50

In [93]:
user_input = Input(shape=[1])
user_embedding = Embedding(NUM_USERS, latent_dim)(user_input)
user_vec = Flatten()(user_embedding)

item_input = Input(shape=[1])
item_embedding = Embedding(NUM_ITEMS, latent_dim)(item_input)
item_vec = Flatten()(item_embedding)

concat = Concatenate(axis=1)([user_vec, item_vec])
d1 = Dense(20, activation='relu')(concat)
d2 = Dense(10, activation='relu')(d1)
output = Dense(1, activation='relu')(d2)

MLP = Model([user_input, item_input], output)

In [94]:
MLP.compile(optimizer='adam', metrics=['mae'], loss='mean_squared_error')
MLP.summary()
MLP.fit(X_train, y_train, epochs=epochs, verbose=1)

Epoch 1/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 641us/step - loss: 20.4698 - mae: 4.4359 
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 643us/step - loss: 14.7938 - mae: 3.7378
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 624us/step - loss: 3.9850 - mae: 1.7571
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 625us/step - loss: 1.1235 - mae: 0.8793
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - loss: 0.5603 - mae: 0.5671
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step - loss: 0.4281 - mae: 0.4528
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 760us/step - loss: 0.3147 - mae: 0.3780
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - loss: 0.3072 - mae: 0.3693
Epoch 9/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72

<keras.src.callbacks.history.History at 0x7fc4d7e2e410>

In [95]:
import numpy as np

y_pred = MLP.predict(X_test)
y_pred = np.round(y_pred).astype(int)

y_pred

[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step


array([[4],
       [4],
       [3],
       ...,
       [3],
       [3],
       [3]])

In [96]:
# Métricas de clasificación
print("Precision Score:", precision_score(y_test, y_pred.round(), average='weighted'))
print("Recall Score:", recall_score(y_test, y_pred.round(), average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred.round(), average='weighted'))


Precision Score: 0.9116690952785133
Recall Score: 0.04792416860834569
F1 Score: 0.053354425637053435


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
len(y_test)

18884

In [98]:
# Calcular NDCG para cada clase y promediar los resultados
y_pred_classes = np.argmax(y_pred, axis=1)
# Calcular NDCG para cada clase y promediar los resultados
ndcg_scores = []
for class_label in range(6):
    y_test_class = (y_test == class_label).astype(int)
    y_pred_class = (y_pred_classes == class_label).astype(int)
    ndcg_class = ndcg_score([y_test_class], [y_pred_class])
    ndcg_scores.append(ndcg_class)

# Calcular el promedio de los NDCG scores para todas las clases
average_ndcg = np.mean(ndcg_scores)

print("Average NDCG Score:", average_ndcg)

Average NDCG Score: 0.5160472492405047


In [99]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score, f1_score, ndcg_score
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))


1.7438572336369413
3.4229506460495656
