# **Hacemos import de las librerias & instalamos las dependencias de pyspark**

In [13]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import split

In [2]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Waiting for headers] [Wai[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu

# **Inicializamos nuestro SparkSession**

In [3]:
# Initialize SparkSession
spark = SparkSession.builder \
            .appName("ALS_Example") \
            .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# **Cargamos el .txt a un DF**

In [5]:
movies_df = spark.read.text("sample_movielens_ratings.txt")

In [7]:
movies_df.show()

+--------------------+
|               value|
+--------------------+
| 0::2::3::1424380312|
| 0::3::1::1424380312|
| 0::5::2::1424380312|
| 0::9::4::1424380312|
|0::11::1::1424380312|
|0::12::2::1424380312|
|0::15::1::1424380312|
|0::17::1::1424380312|
|0::19::1::1424380312|
|0::21::1::1424380312|
|0::23::1::1424380312|
|0::26::3::1424380312|
|0::27::1::1424380312|
|0::28::1::1424380312|
|0::29::1::1424380312|
|0::30::1::1424380312|
|0::31::1::1424380312|
|0::34::1::1424380312|
|0::37::1::1424380312|
|0::41::2::1424380312|
+--------------------+
only showing top 20 rows



# **Generamos las nuevas columnas a partir de los datos del txt. Podemos utilizar los "::" para separar las columnas**

In [10]:
movies_df = movies_df \
.withColumn("userId", split(movies_df["value"], "::").getItem(0)) \
.withColumn("movieId", split(movies_df["value"], "::").getItem(1)) \
.withColumn("rating", split(movies_df["value"], "::").getItem(2)) \
.withColumn("timestamp", split(movies_df["value"], "::").getItem(3))

**Podemos eliminar value, ya no se necesita**

In [23]:
movies_df = movies_df.drop("value")

In [24]:
movies_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
|     0|     23|     1|1424380312|
|     0|     26|     3|1424380312|
|     0|     27|     1|1424380312|
|     0|     28|     1|1424380312|
|     0|     29|     1|1424380312|
|     0|     30|     1|1424380312|
|     0|     31|     1|1424380312|
|     0|     34|     1|1424380312|
|     0|     37|     1|1424380312|
|     0|     41|     2|1424380312|
+------+-------+------+----------+
only showing top 20 rows



# **Necesitamos castear los valores de las columnas a INT**

In [25]:
for col_name in ["userId", "movieId", "rating"]:
    movies_df = movies_df.withColumn(col_name, movies_df[col_name].cast("int"))
movies_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
|     0|     23|     1|1424380312|
|     0|     26|     3|1424380312|
|     0|     27|     1|1424380312|
|     0|     28|     1|1424380312|
|     0|     29|     1|1424380312|
|     0|     30|     1|1424380312|
|     0|     31|     1|1424380312|
|     0|     34|     1|1424380312|
|     0|     37|     1|1424380312|
|     0|     41|     2|1424380312|
+------+-------+------+----------+
only showing top 20 rows



# **Entrenamos el modelo ALS**

In [26]:
als = ALS(
  userCol="userId",
  itemCol="movieId",
  ratingCol="rating",
  maxIter=10,
  regParam=0.1,
  rank=5, # Controls the dimensionality of the latent vector space for
  # users and items.
  coldStartStrategy="drop" # Avoids NaN predictions
)

# Train the model
model = als.fit(movies_df)

# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=3)
user_recommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.4490774},...|
|    10|[{93, 2.8908803},...|
|     0|[{92, 2.6682348},...|
|     1|[{22, 2.7923386},...|
|    21|[{29, 4.2831173},...|
|    11|[{32, 4.638387}, ...|
|    12|[{46, 5.5808306},...|
|    22|[{75, 4.412687}, ...|
|     2|[{93, 4.2631207},...|
|    13|[{29, 2.5850673},...|
|     3|[{51, 3.9958472},...|
|    23|[{46, 5.636417}, ...|
|     4|[{29, 3.2266839},...|
|    24|[{52, 4.3879285},...|
|    14|[{29, 4.66413}, {...|
|     5|[{46, 4.283429}, ...|
|    15|[{46, 3.2000859},...|
|    25|[{93, 2.9285555},...|
|    26|[{22, 4.9435816},...|
|     6|[{93, 3.0702496},...|
+------+--------------------+
only showing top 20 rows



In [28]:
# Generate predictions on the test set
predictions = model.transform(movies_df)
predictions.show(truncate=False)
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
metricName="rmse", labelCol="rating", predictionCol="prediction")
# Calculate RMSE
rmse = evaluator.evaluate(predictions)

+------+-------+------+----------+----------+
|userId|movieId|rating|timestamp |prediction|
+------+-------+------+----------+----------+
|26    |31     |1     |1424380312|0.50771856|
|27    |31     |1     |1424380312|1.2951624 |
|12    |31     |4     |1424380312|2.4628594 |
|13    |31     |1     |1424380312|1.5726469 |
|5     |31     |1     |1424380312|1.559071  |
|19    |31     |1     |1424380312|0.7830377 |
|4     |31     |1     |1424380312|1.7635086 |
|8     |31     |3     |1424380312|2.3926353 |
|7     |31     |3     |1424380312|1.9721277 |
|25    |31     |2     |1424380312|1.9625527 |
|24    |31     |1     |1424380312|1.6316519 |
|29    |31     |1     |1424380312|1.5836794 |
|14    |31     |3     |1424380312|2.153307  |
|0     |31     |1     |1424380312|1.3173116 |
|18    |31     |1     |1424380312|0.9731301 |
|28    |85     |1     |1424380312|0.96118134|
|26    |85     |1     |1424380312|1.7180077 |
|12    |85     |1     |1424380312|0.8278136 |
|1     |85     |3     |1424380312|