# <center> <img src="./img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Modelo de Recomendación** </center>

---
**Alumnos**: David Abraham Naranjo Salgado, Benjamin Zarate y Angel Cortes

In [17]:
# Initialise findspark before any `pyspark` import in the cells below.
# NOTE(review): presumably this puts the local Spark installation on sys.path
# so those imports resolve — confirm for this environment.
import findspark
findspark.init()

#### Creación de la conexión con el cluster de Spark


In [18]:
import os

from pyspark.sql import SparkSession

# Address of the Spark master. The default is the value originally hard-coded
# here; set the SPARK_MASTER_URL environment variable to target another master
# without editing the notebook.
# NOTE(review): the default host looks like a Docker container id, which
# typically changes when the container is recreated — confirm.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://873bad4e62fe:7077")

# Build (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .appName("MLSpark-Recommender-Systems")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Set the number of Spark SQL shuffle partitions for this session to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

## Librerías

In [33]:
from team_name.spark_utils import SparkUtils
from pyspark.sql.functions import from_json, explode, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, ArrayType

# Preparación de Datos

In [34]:
# Location of the Parquet dataset to load.
parquet_path = "/home/jovyan/data"

# Read the dataset and preview its first 20 rows (long cells truncated).
df = spark.read.format("parquet").load(parquet_path)
df.show(n=20, truncate=True)

+----+--------------------+--------------------+---------+------+--------------------+-------------+--------------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|           value_str|
+----+--------------------+--------------------+---------+------+--------------------+-------------+--------------------+
|NULL|[7B 22 75 73 65 7...|kafka-spark-produ...|        0| 12683|2025-05-11 01:21:...|            0|{"userId": 1069, ...|
|NULL|[7B 22 75 73 65 7...|kafka-spark-produ...|        0| 12684|2025-05-11 01:21:...|            0|{"userId": 1069, ...|
|NULL|[7B 22 75 73 65 7...|kafka-spark-produ...|        0| 12685|2025-05-11 01:21:...|            0|{"userId": 1069, ...|
|NULL|[7B 22 75 73 65 7...|kafka-spark-produ...|        0| 12686|2025-05-11 01:21:...|            0|{"userId": 1069, ...|
|NULL|[7B 22 75 73 65 7...|kafka-spark-produ...|        0| 12687|2025-05-11 01:21:...|            0|{"userId": 1069, ...|
|NULL|[7B 22 75 73 65 7.

In [35]:
# Display the `value_str` column in full (no truncation) to inspect the raw JSON.
(
    df
    .select(col("value_str"))
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------+
|value_str                                                                                                                                         |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
|{"userId": 1069, "movieId": 965, "movieTitle": "The Azure Island", "genre": "Action", "rating": 8, "timestamp": 1746926499488}                    |
|{"userId": 1069, "movieId": 1121, "movieTitle": "Warlock From Titan", "genre": "Documentary", "rating": 9, "timestamp": 1746926499491}            |
|{"userId": 1069, "movieId": 1439, "movieTitle": "The Last Desert Of Blackwood Manor", "genre": "Comedy", "rating": 10, "timestamp": 1746926499494}|
|{"userId": 1069, "movieId": 1108, "movieTitle": "Prince Of The Golden Scrolls", "genre": "Fantasy", "rati

## Seleccionar columnas clave y limpiar 

In [36]:
# Fields to extract from each JSON message. Other fields present in the
# payload (e.g. "genre", "timestamp") are not listed and so are not extracted.
schema = SparkUtils.generate_schema(
    [
        ("userId", "integer"),
        ("movieId", "integer"),
        ("movieTitle", "string"),
        ("rating", "float"),
    ]
)

# Parse `value_str` with an array-of-struct schema and explode it into one row
# per entry.
# NOTE(review): the sample rows show a single JSON object per message, not an
# array; this relies on from_json accepting a lone object for an array schema
# (the displayed output suggests it does) — confirm.
df_parsed = df.withColumn("json_array", from_json(col("value_str"), ArrayType(schema)))
ratings_df = (
    df_parsed
    .withColumn("entry", explode("json_array"))
    .select("entry.*")
    # Cleaning step: discard rows missing any of the columns the ALS model is
    # configured to read (user, item, rating).
    .dropna(subset=["userId", "movieId", "rating"])
)
ratings_df.show(5)

+------+-------+--------------------+------+
|userId|movieId|          movieTitle|rating|
+------+-------+--------------------+------+
|  1069|    965|    The Azure Island|   8.0|
|  1069|   1121|  Warlock From Titan|   9.0|
|  1069|   1439|The Last Desert O...|  10.0|
|  1069|   1108|Prince Of The Gol...|   9.0|
|  1069|    635|        Silent Order|   1.0|
+------+-------+--------------------+------+
only showing top 5 rows



## Dividir en entrenamiento y prueba

In [37]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook on
# the same input data yields the same split and therefore comparable metrics.
training, test = ratings_df.randomSplit([0.8, 0.2], seed=42)

# Configure ALS model

In [38]:
# ALS collaborative-filtering estimator over (userId, movieId, rating).
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # NOTE(review): "drop" is expected to remove predictions for users/items
    # unseen during training, so the evaluation metric is not NaN — confirm.
    coldStartStrategy="drop",
    nonnegative=True,
    # Fixed seed so model training is reproducible between runs.
    seed=42,
)

# TRAINING

In [39]:
# Fit the ALS estimator on the training split.
model = als.fit(dataset=training)

# PREDICTIONS

In [40]:
# Top-5 movie recommendations for every user in the fitted model.
recommendations = model.recommendForAllUsers(numItems=5)

# Preview five users with the full (untruncated) recommendation lists.
recommendations.show(n=5, truncate=False)



+------+------------------------------------------------------------------------------------------+
|userId|recommendations                                                                           |
+------+------------------------------------------------------------------------------------------+
|0     |[{710, 10.261556}, {1488, 10.212853}, {1231, 9.979702}, {593, 9.9467535}, {287, 9.852462}]|
|1     |[{250, 10.571926}, {1530, 9.85277}, {793, 9.715438}, {589, 9.623798}, {1987, 9.615686}]   |
|2     |[{250, 8.705419}, {589, 8.440251}, {47, 8.160182}, {793, 8.14972}, {1463, 7.905716}]      |
|3     |[{825, 10.197342}, {720, 9.877963}, {613, 9.795914}, {1270, 9.778091}, {1943, 9.524079}]  |
|4     |[{1349, 9.704831}, {1943, 9.321889}, {996, 9.030924}, {1565, 8.829642}, {1510, 8.651744}] |
+------+------------------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

## Recomendaciones Plus

In [41]:
# Flatten the per-user recommendation arrays: one row per (user, movie) pair
# carrying the predicted rating.
exploded_recs = recommendations.withColumn("rec", explode("recommendations"))
recommendations_exp = exploded_recs.select(
    "userId",
    col("rec.movieId").alias("movieId"),
    col("rec.rating").alias("predicted_rating"),
)

# One title per movieId, used to label the recommendations.
unique_titles = ratings_df.select("movieId", "movieTitle").dropDuplicates(["movieId"])

# Left join so every recommendation is kept even if no title is found.
recs_final = recommendations_exp.join(unique_titles, "movieId", "left")
recs_final.show(n=10, truncate=False)

                                                                                

+-------+------+----------------+-------------------------------+
|movieId|userId|predicted_rating|movieTitle                     |
+-------+------+----------------+-------------------------------+
|710    |0     |10.261556       |Beyond Xylos                   |
|1488   |0     |10.212853       |The River'S Gambit             |
|1231   |0     |9.979702        |Ancient Beginning              |
|593    |0     |9.9467535       |The Steel Knight               |
|287    |0     |9.852462        |Chronicles Of Andromeda        |
|250    |1     |10.571926       |Curse Of The Silent Wolf       |
|1530   |1     |9.85277         |Legend Of The Secrets          |
|793    |1     |9.715438        |Warlock From Cygnus X-1        |
|589    |1     |9.623798        |The Last Prophecy Of Cygnus X-1|
|1987   |1     |9.615686        |The Martian Asteroid           |
+-------+------+----------------+-------------------------------+
only showing top 10 rows



## Predictions for the test set

In [42]:
# Score the held-out test split with the fitted model and preview the result.
predictions = model.transform(test)
predictions.show(n=20, truncate=False)

+------+-------+-------------------------------+------+----------+
|userId|movieId|movieTitle                     |rating|prediction|
+------+-------+-------------------------------+------+----------+
|32    |1229   |Pact: A Twisted Game           |1.0   |3.320629  |
|32    |1818   |Cosmic Chaos                   |5.0   |2.4053178 |
|68    |409    |The Fatal Case Of The Pact     |2.0   |1.9934825 |
|69    |1082   |The Hidden Shadow              |8.0   |4.160407  |
|131   |1241   |The Dimensional Comet          |1.0   |0.4048561 |
|131   |1827   |Asteroid From Serpent'S Coil   |1.0   |2.9301553 |
|131   |1891   |Serpent'S Coil: Talisman Rising|4.0   |4.8730907 |
|1070  |147    |The Last Ocean Of Raven'S Peak |6.0   |0.25301555|
|1070  |1435   |The Phantom River              |4.0   |2.2409725 |
|1070  |1643   |The Galactic Serpent           |9.0   |2.3113923 |
|1088  |17     |Titan: Curse Rising            |7.0   |0.7081719 |
|1115  |603    |When The Guardian Rise         |8.0   |1.98616

# EVALUATE MODEL

In [43]:
# RMSE evaluator comparing the `prediction` column against the observed `rating`.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

# Compute the RMSE over the test-set predictions and report it to 4 decimals.
rmse = evaluator.evaluate(predictions)
print("\n[INFO] RMSE del modelo:", round(rmse, 4))


[INFO] RMSE del modelo: 4.2162


## Exportar a CSV para Power BI

In [None]:
import os

# Export the titled recommendations to a CSV file in the current working
# directory by collecting them into a pandas DataFrame on the driver.
csv_path = os.path.join(os.getcwd(), "final_recommendations.csv")

try:
    print("Convirtiendo a pandas...")
    pandas_df = recs_final.toPandas()
    print(f"DataFrame convertido a pandas con {len(pandas_df)} filas")

    # Write with pandas (no index column).
    pandas_df.to_csv(csv_path, index=False)
    print(f"Archivo guardado con pandas en: {csv_path}")

    # Confirm the file is actually on disk and report its size.
    if os.path.exists(csv_path):
        print(f"¡Éxito! Archivo creado con tamaño: {os.path.getsize(csv_path)} bytes")
    else:
        print("Error: El archivo no se creó")
except Exception as e:
    print(f"Error durante la exportación: {str(e)}")
    # Re-raise so a failed export stops the run with a full traceback instead
    # of looking like a successfully completed cell.
    raise

Guardando como parquet...


                                                                                

Leyendo parquet...
Error: [UNABLE_TO_INFER_SCHEMA] Unable to infer schema for Parquet. It must be specified manually.


In [None]:
# Stop through the SparkSession rather than the bare SparkContext, so the
# session itself is shut down together with its underlying context.
spark.stop()