# Paso 1: Importar Librerías de Python


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that location below.
# NOTE(review): `drive` is not imported in any visible cell — presumably
# `from google.colab import drive` has to run first; confirm and add it.
drive.mount('/content/drive')

# Paso 2: Lectura del Dataset

In [None]:
# Load the Amazon reviews CSV from the mounted Google Drive into `df`.
dataset_path = '/content/drive/MyDrive/EspecializacionA&DS/Monografia/2doSemestre/DataFinal_Amazon.csv'
df = pd.read_csv(dataset_path)

# Paso 3: Reducción de Datos


In [None]:
# Take a 10% sample of the dataset, if needed (currently disabled):
#df_sample = df.sample(frac=0.1, random_state=42)

# Paso 4: Ingeniería de Características (Feature Engineering)

In [None]:
# Parse 'reviewDate' once and store it back as a datetime column.
review_dates = pd.to_datetime(df['reviewDate'])
df['reviewDate'] = review_dates

# Derive one new column per calendar component of the review date.
for date_part in ('year', 'month', 'day', 'weekday'):
    df[date_part] = getattr(review_dates.dt, date_part)

# Preview the parsed date next to the derived columns.
print(df[['reviewDate', 'year', 'month', 'day', 'weekday']].head())

# Paso 5: Creación de Características


In [None]:
# Example feature: length (in characters) of each review text.
# Missing reviews are filled with '' first so they get length 0; the previous
# `len(str(x))` converted NaN to the string 'nan' and reported a length of 3.
# `astype(str)` keeps the old behaviour for any non-string, non-missing value.
df['reviewText_length'] = df['reviewText'].fillna('').astype(str).str.len()

# Paso 6: Limpieza/Transformación de Datos


In [None]:
# Remove duplicated rows (all columns are compared) and rebind `df` to the result.
df = df.drop_duplicates()

# Paso 7: Análisis Exploratorio de Datos (EDA)

In [None]:
# Visualize the distribution of the ratings (`overall`) — currently disabled:
#sns.countplot(data=df, x='overall')
#plt.title('Distribution of Overall Ratings')
#plt.show()


# Paso 8: Resumen Estadístico


In [None]:
# Statistical summary of the DataFrame — currently disabled:
#print(df.describe())

# Paso 9: Análisis Univariado


In [None]:
# Histogram of prices — currently disabled:
#sns.histplot(df['price'])
#plt.title('Price Distribution')
#plt.show()


# Paso 10: Transformación de Datos

In [None]:
# Standardize the price column: subtract its mean and divide by its
# standard deviation, storing the result in 'price_normalized'.
price = df['price']
df['price_normalized'] = price.sub(price.mean()).div(price.std())

# Paso 11: Análisis Bivariado


In [None]:
# Relationship between `overall` and `price` — currently disabled:
#sns.scatterplot(data=df, x='price', y='overall')
#plt.title('Price vs Overall Rating')
#plt.show()


# Paso 12: Análisis Multivariado


In [None]:
# Relationship between `overall`, `price` and `reviewText_length` — currently disabled:
#sns.pairplot(df[['overall', 'price', 'reviewText_length']])
#plt.show()


# Paso 13: Imputar Valores Faltantes

In [None]:
# Replace missing values in 'price' with the median (currently disabled)
#df['price'].fillna(df['price'].median(), inplace=True)

# If text values are missing, replace them with an empty string (currently disabled)
#df['reviewText'].fillna('', inplace=True)
#df['summary'].fillna('', inplace=True)


# Modelo 1 de Prueba

Modelo de filtrado colaborativo utilizando Keras:

In [None]:
# Use `data` as the working name for the modelling cells. This binds a second
# name to the same DataFrame (no copy), so columns added to `data` in the
# following cells are also visible through `df`.
data = df

In [None]:
from sklearn.preprocessing import LabelEncoder

# One dedicated encoder per identifier column, kept around so the integer
# codes can be mapped back to the original ids later.
reviewerID_encoder = LabelEncoder()
asin_encoder = LabelEncoder()

# Fit each encoder on its column and store the integer codes next to it.
for id_column, id_encoder in (('reviewerID', reviewerID_encoder), ('asin', asin_encoder)):
    data[f'{id_column}_encoded'] = id_encoder.fit_transform(data[id_column])

# Show the first rows to check the original ids against their codes.
print(data[['reviewerID', 'reviewerID_encoded', 'asin', 'asin_encoded']].head())


       reviewerID  reviewerID_encoded        asin  asin_encoded
0  A1BB77SEBQT8VX              121313  B00007GDFV            39
1  A1BB77SEBQT8VX              121313  B00007GDFV            39
2   AHWOW7D1ABO9C             1265535  B00007GDFV            39
3   AHWOW7D1ABO9C             1265535  B00007GDFV            39
4   AKS3GULZE0HFC             1296109  B00007GDFV            39


In [None]:
import numpy as np
from keras.models import Model
from keras.layers import Embedding, Input, Dot, Flatten, Dense
from keras.utils import set_random_seed

# Seed the Python, NumPy and backend random generators before any layer is
# created, so the randomly initialised embeddings are the same on every run.
set_random_seed(42)

# Number of distinct users and items; used as the embedding vocabulary sizes.
n_users = len(np.unique(data['reviewerID_encoded']))
n_items = len(np.unique(data['asin_encoded']))

# Size of each user / item embedding vector.
embedding_dim = 10

# Inputs: one encoded user id and one encoded item id per sample.
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))

# Embeddings: look up one `embedding_dim`-sized vector per id.
user_embedding = Embedding(n_users, embedding_dim)(user_input)
item_embedding = Embedding(n_items, embedding_dim)(item_input)

# The predicted rating is the dot product of the two embedding vectors
# (taken over the embedding axis), flattened to one value per sample.
merged = Dot(axes=2)([user_embedding, item_embedding])
merged = Flatten()(merged)

# Model: (user id, item id) -> predicted rating, trained with MSE loss and
# reporting MAE as an additional metric.
model = Model(inputs=[user_input, item_input], outputs=merged)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])




Niveles del parámetro `verbose` de `model.fit`:

verbose=0: No muestra ninguna barra de progreso ni métricas.

verbose=1: Muestra una barra de progreso y actualiza las métricas después de cada lote.

verbose=2: Muestra las métricas después de cada época, pero no muestra la barra de progreso.

In [None]:
# Training on the full dataset: pull the encoded user ids, encoded item ids
# and the target ratings out of the DataFrame as arrays.
user_ids, item_ids, ratings = (
    data[column_name].values
    for column_name in ('reviewerID_encoded', 'asin_encoded', 'overall')
)
model.fit(x=[user_ids, item_ids], y=ratings, epochs=3, batch_size=4096, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f805d9a1a50>

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict a rating for every (user, item) pair that was used for training.
# The batch size is passed explicitly (same value as the preceding `fit`):
# without it `predict` falls back to its small default batch size, which
# means far more batches than necessary over the whole dataset.
predictions = model.predict([user_ids, item_ids], batch_size=4096)




In [None]:
# Flatten the model output to 1-D so it lines up element-wise with `ratings`.
predicted_ratings = predictions.flatten()

# Error metrics of the predictions against the true ratings.
mse = mean_squared_error(ratings, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(ratings, predictions)
# MAPE (Mean Absolute Percentage Error), expressed as a percentage.
mape = np.mean(np.abs((ratings - predicted_ratings) / ratings)) * 100

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")


MSE: 15.9169
RMSE: 3.9896
MAE: 3.7271
MAPE: 86.76%


In [None]:
# Reference: https://www.aprendemachinelearning.com/sistemas-de-recomendacion/

In [None]:
# One more epoch over the full dataset, this time with a batch size of 2048.
# `model` is not rebuilt here, so this continues from its current weights.
ratings = data['overall'].values
user_ids = data['reviewerID_encoded'].values
item_ids = data['asin_encoded'].values
model.fit(x=[user_ids, item_ids], y=ratings, batch_size=2048, epochs=1, verbose=1)



<keras.src.callbacks.History at 0x7f805ef5c1f0>

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Re-predict the ratings of the full dataset after the extra training epoch.
# The batch size matches the preceding `fit` instead of relying on the small
# `predict` default, which would need far more batches over the whole dataset.
predictions = model.predict([user_ids, item_ids], batch_size=2048)




In [None]:
# Compute all error metrics first, then report them together.
mse = mean_squared_error(ratings, predictions)   # mean squared error
rmse = np.sqrt(mse)                               # root of the MSE
mae = mean_absolute_error(ratings, predictions)  # mean absolute error

# MAPE (Mean Absolute Percentage Error): relative error per rating, in percent.
relative_errors = (ratings - predictions.flatten()) / ratings
mape = np.mean(np.abs(relative_errors)) * 100

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")


MSE: 12.6531
RMSE: 3.5571
MAE: 3.1998
MAPE: 73.87%


In [None]:
from sklearn.model_selection import train_test_split

# Arrays with the encoded ids and the target ratings.
user_ids, item_ids, ratings = (
    data[column_name].values
    for column_name in ('reviewerID_encoded', 'asin_encoded', 'overall')
)

# 70/30 train/test split; the three arrays are split together so the rows
# stay aligned, and the fixed random_state makes the split repeatable.
split_arrays = train_test_split(user_ids, item_ids, ratings, test_size=0.3, random_state=42)
user_ids_train, user_ids_test = split_arrays[0], split_arrays[1]
item_ids_train, item_ids_test = split_arrays[2], split_arrays[3]
ratings_train, ratings_test = split_arrays[4], split_arrays[5]




In [None]:
from keras.models import clone_model

# `model` was already fitted on the full dataset in earlier cells, so it has
# seen the rows that now form the test split. Rebuild it with freshly
# initialised weights (same architecture, same compile settings) so that the
# test metrics computed afterwards are not contaminated by that exposure.
model = clone_model(model)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model on the training split only.
model.fit([user_ids_train, item_ids_train], ratings_train, epochs=5, batch_size=2048, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f805ef320b0>

In [None]:
# Score the model on the test split; `evaluate` returns the loss followed by
# the compiled metric, which are unpacked into `loss` and `mae`.
test_scores = model.evaluate(x=[user_ids_test, item_ids_test], y=ratings_test, batch_size=2048)
loss, mae = test_scores
print(f'Test Loss: {loss:.4f}')
print(f'Test MAE: {mae:.4f}')

Test Loss: 7.0447
Test MAE: 2.1847


In [None]:
# Predict the ratings of the test split. The batch size is passed explicitly,
# matching the `evaluate` call on the same data, instead of relying on the
# much smaller `predict` default.
predictions = model.predict([user_ids_test, item_ids_test], batch_size=2048)



In [None]:
# Flatten the model output to 1-D so it lines up element-wise with `ratings_test`.
predicted_test_ratings = predictions.flatten()

# Error metrics on the test split.
mse = mean_squared_error(ratings_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(ratings_test, predictions)
# MAPE (Mean Absolute Percentage Error), expressed as a percentage.
mape = np.mean(np.abs((ratings_test - predicted_test_ratings) / ratings_test)) * 100

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")


MSE: 7.0447
RMSE: 2.6542
MAE: 2.1847
MAPE: 52.60%
