In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f


# Get (or create) the SparkSession used by the rest of the notebook,
# registered under the application name "Chapter4-3".
spark = (
    SparkSession.builder
    .appName("Chapter4-3")
    .getOrCreate()
)

In [1]:
# Load the MovieLens-Small ratings CSV with an explicit schema and keep the
# result cached for reuse.
# `timestamp` is declared in the schema but discarded right after loading; if
# the rating time is needed, replace the drop with
# .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp"))).
ratings = (
    spark.read
    .options(sep=",", header=True, quote='"')
    .schema("userId INT, movieId INT, rating DOUBLE, timestamp INT")
    .csv("../../data-sets/MovieLens-Small/ml-latest-small/ratings.csv")
    .drop("timestamp")
    .cache()
)

The ALS class has this signature:

```python
class pyspark.ml.recommendation.ALS(
    rank=10,
    maxIter=10,
    regParam=0.1,
    numUserBlocks=10,
    numItemBlocks=10,
    implicitPrefs=False,
    alpha=1.0,
    userCol="user",
    itemCol="item",
    seed=None,
    ratingCol="rating",
    nonnegative=False,
    checkpointInterval=10,
    intermediateStorageLevel="MEMORY_AND_DISK",
    finalStorageLevel="MEMORY_AND_DISK",
    coldStartStrategy="nan",
)
```

In [2]:
from pyspark.ml.recommendation import ALS

In [3]:
# Fit an ALS recommender on the ratings DataFrame, mapping the MovieLens
# column names onto the estimator's user / item / rating parameters. All other
# hyper-parameters keep their defaults (rank=10, maxIter=10, regParam=0.1, ...).
# The seed is pinned so that re-running the notebook from a fresh kernel
# reproduces the same fit instead of a new random one each time.
model = (
    ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        seed=42,
    ).fit(ratings)
)

In [4]:
# Score the same ratings the model was fitted on; transform() adds a
# `prediction` column next to each observed rating. Print the first ten rows
# without truncating cell contents.
predictions = model.transform(ratings)
predictions.show(n=10, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|191   |148    |5.0   |4.9256597 |
|133   |471    |4.0   |3.2044077 |
|597   |471    |2.0   |3.7744582 |
|385   |471    |4.0   |3.1436024 |
|436   |471    |3.0   |3.3761165 |
|602   |471    |4.0   |3.7356088 |
|91    |471    |1.0   |2.5796928 |
|409   |471    |3.0   |3.8035347 |
|372   |471    |3.0   |3.1774716 |
|599   |471    |2.5   |2.9243267 |
+------+-------+------+----------+
only showing top 10 rows



In [5]:
# Preview five rows of the fitted user factors (columns: id, features).
model.userFactors.show(n=5)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.57681924, -0....|
| 20|[0.19191976, -0.5...|
| 30|[-0.13157046, 0.0...|
| 40|[-0.19657418, -0....|
| 50|[0.067084774, -0....|
+---+--------------------+
only showing top 5 rows



In [6]:
# Preview five rows of the fitted item factors (columns: id, features).
model.itemFactors.show(n=5)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.58131766, 0.0...|
| 20|[0.17235413, 0.14...|
| 30|[0.62102294, 0.16...|
| 40|[-0.45563155, -0....|
| 50|[-3.3535386E-4, -...|
+---+--------------------+
only showing top 5 rows

