# **Hacemos import de las librerias & instalamos las dependencias de pyspark**

In [3]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import split

In [4]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Waiting for headers] [1 InRelease 2,586 B/129 kB 2%] [Connected to cloud.r-[0m                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [1 InRelease 38.8 kB/129 kB 30%] [Connected to cloud.r-project.org (108.139.[0m                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [3 InRelease 15.6 kB/128 kB 12%] [1 InRelease 38.8 kB/129 kB 30%] [Connected[0m                                                                               Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRel

# **Inicializamos nuestro SparkSession**

In [5]:
# Initialize SparkSession
spark = SparkSession.builder \
            .appName("ALS_Example") \
            .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# **Cargamos el .txt a un DF**

In [7]:
movies_df = spark.read.text("sample_movielens_ratings.txt")

In [8]:
movies_df.show()

+--------------------+
|               value|
+--------------------+
| 0::2::3::1424380312|
| 0::3::1::1424380312|
| 0::5::2::1424380312|
| 0::9::4::1424380312|
|0::11::1::1424380312|
|0::12::2::1424380312|
|0::15::1::1424380312|
|0::17::1::1424380312|
|0::19::1::1424380312|
|0::21::1::1424380312|
|0::23::1::1424380312|
|0::26::3::1424380312|
|0::27::1::1424380312|
|0::28::1::1424380312|
|0::29::1::1424380312|
|0::30::1::1424380312|
|0::31::1::1424380312|
|0::34::1::1424380312|
|0::37::1::1424380312|
|0::41::2::1424380312|
+--------------------+
only showing top 20 rows



# **Generamos las nuevas columnas a partir de los datos del txt. Podemos utilizar los "::" para separar las columnas**

In [9]:
movies_df = movies_df \
.withColumn("userId", split(movies_df["value"], "::").getItem(0)) \
.withColumn("movieId", split(movies_df["value"], "::").getItem(1)) \
.withColumn("rating", split(movies_df["value"], "::").getItem(2)) \
.withColumn("timestamp", split(movies_df["value"], "::").getItem(3))

**Podemos eliminar value, ya no se necesita**

In [10]:
movies_df = movies_df.drop("value")

In [11]:
movies_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
|     0|     23|     1|1424380312|
|     0|     26|     3|1424380312|
|     0|     27|     1|1424380312|
|     0|     28|     1|1424380312|
|     0|     29|     1|1424380312|
|     0|     30|     1|1424380312|
|     0|     31|     1|1424380312|
|     0|     34|     1|1424380312|
|     0|     37|     1|1424380312|
|     0|     41|     2|1424380312|
+------+-------+------+----------+
only showing top 20 rows



# **Necesitamos castear los valores de las columnas a INT**

In [12]:
for col_name in ["userId", "movieId", "rating"]:
    movies_df = movies_df.withColumn(col_name, movies_df[col_name].cast("int"))
movies_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|     3|1424380312|
|     0|      3|     1|1424380312|
|     0|      5|     2|1424380312|
|     0|      9|     4|1424380312|
|     0|     11|     1|1424380312|
|     0|     12|     2|1424380312|
|     0|     15|     1|1424380312|
|     0|     17|     1|1424380312|
|     0|     19|     1|1424380312|
|     0|     21|     1|1424380312|
|     0|     23|     1|1424380312|
|     0|     26|     3|1424380312|
|     0|     27|     1|1424380312|
|     0|     28|     1|1424380312|
|     0|     29|     1|1424380312|
|     0|     30|     1|1424380312|
|     0|     31|     1|1424380312|
|     0|     34|     1|1424380312|
|     0|     37|     1|1424380312|
|     0|     41|     2|1424380312|
+------+-------+------+----------+
only showing top 20 rows



# **Entrenamos el modelo ALS**

In [13]:
als = ALS(
  userCol="userId",
  itemCol="movieId",
  ratingCol="rating",
  maxIter=10,
  regParam=0.1,
  rank=5, # Controls the dimensionality of the latent vector space for
  # users and items.
  coldStartStrategy="drop" # Avoids NaN predictions
)

# Train the model
model = als.fit(movies_df)

# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=3)
user_recommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.4837117},...|
|    10|[{92, 2.8214915},...|
|     0|[{92, 2.6260037},...|
|     1|[{22, 2.8326561},...|
|    21|[{29, 4.2626824},...|
|    11|[{32, 4.972262}, ...|
|    12|[{46, 5.7948217},...|
|    22|[{75, 4.4489026},...|
|     2|[{83, 4.2652955},...|
|    13|[{93, 2.6517215},...|
|     3|[{30, 4.1485295},...|
|    23|[{46, 5.5981064},...|
|     4|[{29, 3.2051628},...|
|    24|[{52, 4.451611}, ...|
|    14|[{29, 4.6316557},...|
|     5|[{46, 4.2209864},...|
|    15|[{46, 3.4676778},...|
|    25|[{46, 3.1369336},...|
|    26|[{22, 4.7906337},...|
|     6|[{29, 3.155407}, ...|
+------+--------------------+
only showing top 20 rows



In [14]:
# Generate predictions on the test set
predictions = model.transform(movies_df)
predictions.show(truncate=False)
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
metricName="rmse", labelCol="rating", predictionCol="prediction")
# Calculate RMSE
rmse = evaluator.evaluate(predictions)

+------+-------+------+----------+----------+
|userId|movieId|rating|timestamp |prediction|
+------+-------+------+----------+----------+
|28    |0      |3     |1424380312|2.4594305 |
|28    |1      |1     |1424380312|1.5545201 |
|28    |2      |4     |1424380312|3.4125185 |
|28    |3      |1     |1424380312|0.77193713|
|28    |6      |1     |1424380312|0.7088782 |
|28    |7      |1     |1424380312|1.5160283 |
|28    |12     |5     |1424380312|3.1711469 |
|28    |13     |2     |1424380312|1.5199196 |
|28    |14     |1     |1424380312|1.1682084 |
|28    |15     |1     |1424380312|1.2067792 |
|28    |17     |1     |1424380312|1.1654925 |
|28    |19     |3     |1424380312|2.416662  |
|28    |20     |1     |1424380312|1.5896671 |
|28    |23     |3     |1424380312|2.2407577 |
|28    |24     |3     |1424380312|2.43106   |
|28    |27     |1     |1424380312|0.6416116 |
|28    |29     |1     |1424380312|1.1929783 |
|28    |33     |1     |1424380312|1.5706449 |
|28    |34     |1     |1424380312|

In [15]:
print(rmse)

0.5721643414752522
