### TC4034.10 Análisis de grandes volúmenes de datos

#### Equipo 13


* Hansel Zapiain Rodríguez (A00469031)
* Miguel Guillermo Galindo Orozco (A01793695)
* Francisco José Arellano Montes (A01794283)

**9.5 Avance de proyecto 4: Sistema de Recomendación**
   
Junio 2024

**Objetivos**


El desarrollo de esta actividad contribuye al cumplimiento de los objetivos del Módulo 5:

Aplicar algoritmos de machine learning a big data enfocado al modelado predictivo y toma de decisiones basada en datos.
Identificar la intersección entre Big Data e Inteligencia Artificial.
Reconocer la aplicación de algoritmos de machine learning en el análisis de Big Data.

**Instrucciones**


En esta entrega es necesario realizar un reporte donde se enlisten los siguientes aspectos:

Implementación final de sistemas de recomendación. Integra la evidencia en GitHub de los algoritmos desarrollados en los avances 4.2 y/o 6.2.
Evaluación integral del desempeño de los modelos utilizando varias métricas. Recuerda integrar la evidencia en el repositorio GitHub del equipo.
Documentación del código base y algoritmos implementados. Entregar en el documento Word/PDF en Canvas.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
import itertools
import matplotlib.pyplot as plt
import os
import sys
import findspark
import implicit
import cudf

from datasets import load_dataset

from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, udf, expr, row_number, collect_set
from pyspark.sql.types import FloatType, IntegerType

from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer, VectorAssembler, Normalizer
from pyspark.ml.linalg import Vectors, DenseVector
from pyspark.ml.evaluation import RegressionEvaluator

print("All libraries imported successfully.")

In [None]:
# Load the 'raw_review_All_Beauty' configuration of the Amazon Reviews 2023
# dataset from the Hugging Face hub.
# NOTE(review): download_mode='force_redownload' presumably re-fetches the
# data on every run instead of reusing the local cache - confirm this is
# intended. trust_remote_code=True allows the dataset's own loading script
# to run.
dataset = load_dataset('McAuley-Lab/Amazon-Reviews-2023', 'raw_review_All_Beauty', download_mode='force_redownload', trust_remote_code=True)

In [None]:
# Materialise the 'full' split as a pandas DataFrame and preview the first rows.
df_amz = dataset['full'].to_pandas()
df_amz.head()

In [None]:
# Copy the pandas DataFrame into a cuDF DataFrame.
gdf = cudf.DataFrame.from_pandas(df_amz)

In [None]:
# Convert the cuDF DataFrame back to pandas.
# NOTE(review): no cuDF operation is visible between the conversion to cuDF
# and this conversion back - confirm the round trip is actually needed.
pdf_result = gdf.to_pandas()

## Pyspark Setup

In [None]:
# Make the local Spark installation importable from this Python process,
# then show the Spark location that findspark resolved.
findspark.init()
findspark.find()

In [None]:
# Path to the log4j configuration file; it is interpolated into the driver
# and executor JVM options when the SparkSession is built.
log4j_properties = '/app/config/log4j.properties'

In [None]:
# Build (or reuse) the SparkSession shared by the rest of the notebook.
# The chain is parenthesised so each setting can carry its own comment.
spark = (
    SparkSession.builder
    .appName('Amazon Reviews Recommender with GPU')
    # Arrow-based transfer for pandas <-> Spark conversions.
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    # Memory sizing.
    .config('spark.driver.memory', '16g')
    .config('spark.executor.memory', '16g')
    .config('spark.sql.shuffle.partitions', '200')
    .config('spark.memory.fraction', '0.8')
    .config('spark.memory.storageFraction', '0.5')
    # Run locally, using every available core.
    .config('spark.master', 'local[*]')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    # Same log4j file for driver and executors.
    .config('spark.driver.extraJavaOptions', f'-Dlog4j.configuration=file:{log4j_properties}')
    .config('spark.executor.extraJavaOptions', f'-Dlog4j.configuration=file:{log4j_properties}')
    .config('spark.kryoserializer.buffer.max', '2024m')
    # Compress broadcast variables, shuffle output and shuffle spills.
    .config('spark.broadcast.compress', 'true')
    .config('spark.shuffle.compress', 'true')
    .config('spark.shuffle.spill.compress', 'true')
    # -1 turns automatic broadcast joins off.
    .config('spark.sql.autoBroadcastJoinThreshold', -1)
    .getOrCreate()
)
print('Spark Session Created:', spark)

In [None]:
# Grab the active SparkContext (creating one if needed) and limit its log
# output to the WARN level.
sc = SparkContext.getOrCreate()
sc.setLogLevel("WARN")

In [None]:
# Display the SparkSession object as the cell output.
spark

In [None]:
# Draw a reproducible 10% sample (without replacement) of the reviews.
# Despite its name, df_amz_spark is still a pandas DataFrame at this point.
df_amz_spark = pdf_result.sample(frac = 0.10, replace = False, random_state = 1234)
df_amz_spark.head()

In [None]:
# Create a Spark DataFrame from the user, item and rating columns of the
# pandas sample.
spark_df = spark.createDataFrame(df_amz_spark[['user_id', 'asin', 'rating']])

In [None]:
# Keep the three interaction columns under their existing names.
spark_df = spark_df.select('user_id', 'asin', 'rating')

# Map the string user and item identifiers to numeric index columns, which
# are the columns later handed to ALS.
user_indexer = StringIndexer(inputCol='user_id', outputCol='userIndex')
item_indexer = StringIndexer(inputCol='asin', outputCol='itemIndex')

# Fit and apply each indexer in turn; the item indexer runs on the frame
# that already carries 'userIndex'.
user_index_model = user_indexer.fit(spark_df)
indexed_df = user_index_model.transform(spark_df)
item_index_model = item_indexer.fit(indexed_df)
indexed_df = item_index_model.transform(indexed_df)

In [None]:
# Print a sample of the untruncated user ids.
indexed_df.select('user_id').show(truncate = False)

In [None]:
# Number of interaction rows in the indexed sample.
indexed_df.count()

In [None]:
# Show the column names and types, including the two index columns added
# by the StringIndexers.
indexed_df.printSchema()

## ALS (PySpark)

In [None]:
# Split the interactions roughly 80/20 into training and test sets.
# A fixed seed keeps the split - and therefore every metric computed from
# it - reproducible across runs, matching the seeded pandas sample above.
(training_df, test_df) = indexed_df.randomSplit([0.8, 0.2], seed=1234)

# Both frames are reused several times (fit, RMSE, precision/recall), so
# mark them for caching.
training_df.cache()
test_df.cache()

In [None]:
# Configure the ALS recommender on the indexed user/item columns.
als = ALS(
    userCol='userIndex',
    itemCol='itemIndex',
    ratingCol='rating',
    # Drop rows whose prediction is NaN so evaluation metrics stay defined.
    coldStartStrategy='drop',
    # Constrain the learned factors to be non-negative.
    nonnegative=True,
)

# Fit the model on the training split only.
model = als.fit(training_df)

In [None]:
# Lookup table from numeric itemIndex back to the original ASIN, one row
# per distinct (asin, itemIndex) pair.
item_mapping_df = indexed_df.select('asin', 'itemIndex').distinct()

In [None]:
def get_top_n_recommendations_als_spark(user_id, model, item_mapping_df, n):
    """Return the top-``n`` recommended ASINs for one user, best first.

    Relies on the module-level ``indexed_df`` (to translate ``user_id`` into
    its ``userIndex``) and the module-level ``spark`` session.

    Args:
        user_id: Original (string) user identifier.
        model: Fitted ALS model exposing ``recommendForUserSubset``.
        item_mapping_df: Spark DataFrame with ``itemIndex`` and ``asin``
            columns, used to translate item indices back to ASINs.
        n: Maximum number of recommendations to request.

    Returns:
        List of ASIN strings ordered by descending predicted score. The list
        is empty when the model yields no recommendations for the user.

    Raises:
        ValueError: If ``user_id`` does not appear in ``indexed_df``.
    """
    user_index_row = (
        indexed_df.filter(col('user_id') == user_id).select('userIndex').first()
    )
    if user_index_row is None:
        raise ValueError(f"User ID {user_id} not found.")
    user_index = user_index_row['userIndex']

    user_df = spark.createDataFrame([(user_index,)], ['userIndex'])
    user_recommendations = model.recommendForUserSubset(user_df, n)

    # Explode the (itemIndex, rating) structs and carry the predicted score
    # through the join, so the final list can be sorted explicitly: a join
    # does not guarantee that the model's ranking order survives.
    ranked_rows = (
        user_recommendations
        .select(F.explode('recommendations').alias('rec'))
        .select(
            col('rec.itemIndex').alias('itemIndex'),
            col('rec.rating').alias('_als_score'),
        )
        .join(item_mapping_df, 'itemIndex')
        .orderBy(F.desc('_als_score'))
        .select('asin')
        .collect()
    )

    return [row.asin for row in ranked_rows]

In [None]:
def evaluate_rmse(model, test_df):
    """Return the root-mean-square error of ``model`` on ``test_df``.

    Args:
        model: Fitted model whose ``transform`` adds a ``prediction`` column.
        test_df: Spark DataFrame holding the true values in ``rating``.

    Returns:
        The RMSE between the ``rating`` and ``prediction`` columns.
    """
    scored_df = model.transform(test_df)
    return RegressionEvaluator(
        metricName='rmse',
        labelCol='rating',
        predictionCol='prediction',
    ).evaluate(scored_df)

In [None]:
def precision_recall_for_user(user_id, actual_items_dict, recommended_items_dict):
    """Compute precision and recall of the recommendations for one user.

    Args:
        user_id: Key used to look the user up in both dictionaries.
        actual_items_dict: Maps a user id to the collection of items the
            user actually interacted with.
        recommended_items_dict: Maps a user id to the collection of items
            recommended to that user.

    Returns:
        A ``(precision, recall)`` tuple of floats. Precision is 0.0 when
        nothing was recommended; recall is 0.0 when the user has no actual
        items. A user missing from a dictionary counts as an empty
        collection.
    """
    # Coerce to sets: values aggregated with Spark's collect_set reach the
    # driver as Python lists, and ``list & set`` raises TypeError.
    actual_items = set(actual_items_dict.get(user_id, ()))
    recommended_items = set(recommended_items_dict.get(user_id, ()))

    num_hits = len(actual_items & recommended_items)

    precision = num_hits / len(recommended_items) if recommended_items else 0.0
    recall = num_hits / len(actual_items) if actual_items else 0.0

    return precision, recall

In [None]:
def evaluate_precision_recall(model, test_df, item_mapping_df, n):
    """Average per-user precision@n and recall@n over the users in ``test_df``.

    Args:
        model: Fitted ALS model, forwarded to
            ``get_top_n_recommendations_als_spark``.
        test_df: Spark DataFrame with ``user_id`` and ``asin`` columns; the
            items listed for a user are treated as that user's relevant items.
        item_mapping_df: Spark DataFrame mapping ``itemIndex`` to ``asin``.
        n: Number of recommendations requested per user.

    Returns:
        ``(avg_precision, avg_recall)``, or ``(0.0, 0.0)`` when ``test_df``
        contains no users.
    """
    actual_rows = (
        test_df.groupBy('user_id')
        .agg(collect_set('asin').alias('actual_items'))
        .collect()
    )
    # collect_set values reach the driver as Python lists; convert them to
    # sets so they can be intersected with the recommended sets.
    actual_items_dict = {
        row['user_id']: set(row['actual_items']) for row in actual_rows
    }

    # The grouped result already holds one row per distinct user, so no
    # separate distinct() job is needed.
    user_ids = list(actual_items_dict)
    if not user_ids:
        # Nothing to average; avoid dividing by zero.
        return 0.0, 0.0

    # NOTE: this triggers Spark jobs once per user, which is slow when the
    # test set has many users.
    recommended_items_dict = {
        user_id: set(get_top_n_recommendations_als_spark(user_id, model, item_mapping_df, n))
        for user_id in user_ids
    }

    results = [
        precision_recall_for_user(user_id, actual_items_dict, recommended_items_dict)
        for user_id in user_ids
    ]

    total_users = len(results)
    avg_precision = sum(precision for precision, _ in results) / total_users
    avg_recall = sum(recall for _, recall in results) / total_users

    return avg_precision, avg_recall

In [None]:
# Request the top-n recommendations for one hard-coded user.
# NOTE(review): the lookup raises ValueError if this user id is not present
# in indexed_df - confirm it is part of the 10% sample.
n = 10
user_id = 'AGI5JHRJ2PSK5BURGV762KIG2Y5A'
top_n_recommendations = get_top_n_recommendations_als_spark(user_id, model, item_mapping_df, n)
print(f'Top {n} recommendations for user {user_id}: {top_n_recommendations}')

In [None]:
# Rating-prediction error of the ALS model on the held-out test split.
rmse = evaluate_rmse(model, test_df)
print(f'Root-mean-square error = {rmse}')

In [None]:
# Average precision and recall over the test users, reusing the n set
# earlier in the notebook.
avg_precision, avg_recall = evaluate_precision_recall(model, test_df, item_mapping_df, n)
print(f'Average Precision = {avg_precision}')
print(f'Average Recall = {avg_recall}')