In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the ratings file read later in
# this notebook is addressed through this mount point.
drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!sudo apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!java -version

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [W                                                                               Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [W                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Waiting for headers] [C                                                                               Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://pp

In [6]:
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!ls | grep spark-3.4.1-bin-hadoop3.tgz
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!ls | grep spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!ls

spark-3.4.1-bin-hadoop3.tgz
spark-3.4.1-bin-hadoop3.tgz
spark-3.4.1-bin-hadoop3.tgz.1
drive	     spark-3.4.1-bin-hadoop3	  spark-3.4.1-bin-hadoop3.tgz.1
sample_data  spark-3.4.1-bin-hadoop3.tgz


In [7]:
!pip install -q findspark
!pip install -q pyspark
!pip install -q py4j

In [8]:
import os
import findspark

# Tell findspark where the JDK and the unpacked Spark distribution live, then
# let it put Spark's Python bindings on sys.path.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.4.1-bin-hadoop3",
})
findspark.init()

# Imported after findspark.init() so the import resolves against SPARK_HOME.
from pyspark.sql import SparkSession

# Build (or reuse) the session with 4 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("Sistema de Recomendación con ALS en Colab")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [9]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, FloatType
from pyspark.sql.functions import split, col

# Path to file in Google Drive
data_path = "/content/drive/MyDrive/ColabNotebooks/datasets/sample_movielens_ratings.txt"

# Read the file as plain text: one row per line, in a single "value" column.
raw_data = spark.read.text(data_path)

# Each line is "::"-delimited; split it into an array column named "fields".
ratings_split = raw_data.select(
    split(col("value"), "::").alias("fields")
)

# Target schema. Field order matches the position of each value in a line,
# and this is the single source of truth for column names and types.
schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", FloatType(), True),
    StructField("timestamp", LongType(), True)
])

# Build the typed DataFrame from the schema instead of repeating every
# name/type by hand: the i-th array element is cast to the i-th field's type
# and given that field's name.
ratings_df = ratings_split.select(*[
    col("fields").getItem(position).cast(field.dataType).alias(field.name)
    for position, field in enumerate(schema.fields)
])

ratings_df.show(5)
ratings_df.printSchema()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: long (nullable = true)



In [10]:
from pyspark.ml.recommendation import ALS

# Fixed seed so the train/test split, the ALS factor initialisation and
# therefore the reported RMSE are the same on every run.
SEED = 42

# 80/20 train/test split.
(training, test) = ratings_df.randomSplit([0.8, 0.2], seed=SEED)

# ALS model configuration
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    rank=5,
    # Drop rows whose prediction is NaN (users/items not seen in training)
    # so they do not turn the evaluation metric into NaN.
    coldStartStrategy="drop",
    nonnegative=True,
    seed=SEED
)

# ALS model training
model = als.fit(training)

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Top-3 recommended movies for every user, printed without truncating the
# recommendations column.
user_recommendations = model.recommendForAllUsers(3)
user_recommendations.show(truncate=False)

# Score the held-out test set and preview actual vs. predicted ratings.
predictions = model.transform(test)
predictions.select(["userId", "movieId", "rating", "prediction"]).show(5)

# Evaluator comparing the "rating" label against the "prediction" column
# using root-mean-square error.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Compute and report the RMSE on the test predictions.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

+------+---------------------------------------------------+
|userId|recommendations                                    |
+------+---------------------------------------------------+
|20    |[{90, 3.4165003}, {30, 3.32402}, {75, 3.0366833}]  |
|10    |[{92, 3.2579625}, {25, 3.2390852}, {93, 3.0996919}]|
|0     |[{25, 2.939826}, {92, 2.8942208}, {2, 2.784425}]   |
|1     |[{90, 3.4286823}, {32, 3.089238}, {94, 2.9519753}] |
|11    |[{32, 4.4135604}, {23, 4.3380527}, {30, 4.286111}] |
|21    |[{29, 3.829727}, {76, 3.786664}, {62, 3.6973033}]  |
|12    |[{46, 4.8411913}, {17, 3.9734108}, {90, 3.9050016}]|
|2     |[{93, 4.578247}, {47, 4.399918}, {25, 4.1694927}]  |
|22    |[{30, 4.9182096}, {69, 4.5331235}, {75, 4.3989725}]|
|13    |[{76, 3.0477524}, {93, 2.9302475}, {2, 2.6660068}] |
|23    |[{32, 4.9149413}, {90, 4.6317244}, {46, 4.5786777}]|
|3     |[{30, 4.398157}, {69, 4.2241826}, {75, 3.7531836}] |
|14    |[{29, 3.7284355}, {76, 3.6389103}, {52, 3.5584488}]|
|4     |[{92, 3.2126217}

In [12]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()