In [1]:
%output --no-stdout

In [2]:
@file:Repository("https://binrepo.target.com/artifactory/gradle")
@file:Repository("https://binrepo.target.com/artifactory/maven-central")
@file:Repository("https://binrepo.target.com/artifactory/jcenter")
@file:Repository("https://binrepo.target.com/artifactory/jitpack-maven")
@file:Repository("https://binrepo.target.com/artifactory/kotlin-maven")
@file:Repository("https://binrepo.target.com/artifactory/apache-maven")
@file:Repository("https://binrepo.target.com/artifactory/jitpack")
%use spark

In [3]:
%output --reset-to-defaults
@file:DependsOn("org.jetbrains.kotlinx.spark:kotlin-spark-api-3.0.0_2.12:1.0.0-preview1")

In [4]:
import org.jetbrains.kotlinx.spark.api.*
import org.apache.spark.sql.functions.*
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.recommendation.ALSModel

In [5]:
val ratingFile = "data/ml-latest-small/ratings.csv"
val movieFile = "data/ml-latest-small/movies.csv"
val linkFile = "data/ml-latest-small/links.csv"
val tagFile = "data/ml-latest-small/tags.csv"

In [6]:
val spark = SparkSession
.builder()
.master("local[*]")
.appName("Recommender").orCreate

In [7]:
val ratings = spark
        .read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(ratingFile)
        .select("userId", "movieId", "rating")

In [8]:
val movies = spark
        .read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(movieFile)
        .withColumn("release_year", regexp_extract(col("title"), "\\s?\\((\\d{4})\\)", 1))
        .withColumn("title", regexp_replace(col("title"), "\\s?\\((\\d)\\)", ""))
        .withColumn("genres", split(col("genres"), "\\|"))


In [9]:
movies.show(5)

+-------+--------------------+--------------------+------------+
|movieId|               title|              genres|release_year|
+-------+--------------------+--------------------+------------+
|      1|    Toy Story (1995)|[Adventure, Anima...|        1995|
|      2|      Jumanji (1995)|[Adventure, Child...|        1995|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|        1995|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|        1995|
|      5|Father of the Bri...|            [Comedy]|        1995|
+-------+--------------------+--------------------+------------+
only showing top 5 rows



In [10]:
val als = ALS()
    .setUserCol("userId")
    .setRatingCol("rating")
    .setItemCol("movieId")
    .setRank(1)
    .setMaxIter(20)
    .setRegParam(0.05)

In [11]:
val (trainingData, validationData) = ratings.randomSplit(doubleArrayOf(80.0, 20.0))

In [12]:
val model = als.fit(trainingData)

In [13]:
val evaluator = RegressionEvaluator()
    .setMetricName("rmse")
    .setLabelCol("rating")
    .setPredictionCol("prediction")

In [14]:
val predictions = model.transform(validationData).na().drop()

In [15]:
println("rmse for model $model: ${evaluator.evaluate(predictions)}")

rmse for model als_15127ddc2f29: 0.8849702711740748


In [16]:
predictions.show(5)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   597|    471|   2.0| 4.0000463|
|   436|    471|   3.0| 3.4843132|
|   176|    471|   5.0| 4.2127733|
|   136|    471|   4.0| 3.7279785|
|   469|    471|   5.0| 3.5710912|
+------+-------+------+----------+
only showing top 5 rows



In [29]:
val USER_ID = 133

In [None]:
val recAllUsers = model.recommendForAllUsers(5).cache()

In [None]:
recAllUsers.show(20, false)

In [None]:
val recommendationsForUser = recAllUsers.filter("userId == $USER_ID")
    .withColumn("rec", explode(col("recommendations")))
    .select(col("userId"), col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
    .join(movies, "movieId")
    .orderBy(desc("rating"))
    .select("userId", "movieId", "title", "release_year")

In [None]:
recommendationsForUser.show()

In [35]:
val subset = ratings.filter("userId == $USER_ID").select("userId")
val recSubset = model.recommendForUserSubset(subset, 10)
recSubset
    .withColumn("rec", explode(col("recommendations")))
    .select(col("userId"), col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
    .join(movies, "movieId")
    .orderBy(col("rating").desc())
    .select("userId", "movieId", "title", "release_year").show(10, false)

+------+-------+------------------------------------------------------------------------+------------+
|userId|movieId|title                                                                   |release_year|
+------+-------+------------------------------------------------------------------------+------------+
|133   |5919   |Android (1982)                                                          |1982        |
|133   |5181   |Hangar 18 (1980)                                                        |1980        |
|133   |6835   |Alien Contamination (1980)                                              |1980        |
|133   |5746   |Galaxy of Terror (Quest) (1981)                                         |1981        |
|133   |7899   |Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)|1975        |
|133   |136850 |Villain (1971)                                                          |1971        |
|133   |3567   |Bossa Nova (2000)                                        

In [104]:
val userIds = ratings.select("userId").distinct().takeAsList(5).map { it[0].toString().toInt() }

In [124]:
val subset = ratings.filter(col("userId").isInCollection(userIds))
val recSubset = model.recommendForUserSubset(subset, 5)

In [37]:
recSubset.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   133|[[6835, 6.3902907...|
+------+--------------------+



In [140]:
fun recommender(ratings: Dataset<Row>, movies: Dataset<Row>, users: List<Int>, model: ALSModel ): Dataset<Row> {
    val subset = ratings.filter(col("userId").isInCollection(users))
    val recSubset = model.recommendForUserSubset(subset, 5).withColumn("rec", explode(col("recommendations")))
    return recSubset
    .select(col("userId"), col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
    .join(movies, "movieId")
    .orderBy(col("rating").desc())
    .select("userId", "movieId", "title", "release_year")
}

In [143]:
recommender(ratings, movies, userIds, model).orderBy(col("userId").desc()).show()

+------+-------+--------------------+------------+
|userId|movieId|               title|release_year|
+------+-------+--------------------+------------+
|   496|  40491|Match Factory Gir...|        1990|
|   496|   5746|Galaxy of Terror ...|        1981|
|   496| 136850|      Villain (1971)|        1971|
|   496|   5764|       Looker (1981)|        1981|
|   496| 156605|            Paterson|            |
|   471|   5746|Galaxy of Terror ...|        1981|
|   471| 156605|            Paterson|            |
|   471|   5764|       Looker (1981)|        1981|
|   471| 136850|      Villain (1971)|        1971|
|   471|  40491|Match Factory Gir...|        1990|
|   463|   5764|       Looker (1981)|        1981|
|   463|  40491|Match Factory Gir...|        1990|
|   463| 136850|      Villain (1971)|        1971|
|   463|   5746|Galaxy of Terror ...|        1981|
|   463| 156605|            Paterson|            |
|   243| 136850|      Villain (1971)|        1971|
|   243|   5764|       Looker (

In [149]:
val userId = 150
val moviesToBeRated = ratings
    .filter("userId != $userId")
    .select("movieId").distinct()
    .withColumn("userId", org.jetbrains.kotlinx.spark.api.lit(userId))

In [150]:
val userMoviePredictions = model.transform(moviesToBeRated)

In [151]:
moviesToBeRated.show()

+-------+------+
|movieId|userId|
+-------+------+
|   1580|   150|
|   2366|   150|
|   3175|   150|
|   1088|   150|
|  32460|   150|
|  44022|   150|
|  96488|   150|
|   1238|   150|
|   1342|   150|
|   1591|   150|
|   1645|   150|
|   4519|   150|
|   2142|   150|
|    471|   150|
|   3997|   150|
|    833|   150|
|   3918|   150|
|   7982|   150|
|   1959|   150|
|  68135|   150|
+-------+------+
only showing top 20 rows



In [152]:
userMoviePredictions.show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|    148|   150|       NaN|
|    471|   150|  3.621155|
|    496|   150| 4.8066044|
|    833|   150| 2.1831114|
|   1088|   150|  3.383536|
|   1238|   150| 4.2309546|
|   1342|   150|  2.945063|
|   1580|   150| 3.4715548|
|   1591|   150|  2.869746|
|   1645|   150| 3.4382815|
|   1829|   150| 2.8945067|
|   1959|   150| 3.6811256|
|   2122|   150|  2.601224|
|   2142|   150| 2.8842258|
|   2366|   150| 3.6085236|
|   2659|   150|  1.811464|
|   2866|   150| 3.5963805|
|   3175|   150|  3.564415|
|   3794|   150|  2.394155|
|   3918|   150|  3.063353|
+-------+------+----------+
only showing top 20 rows



In [156]:
val recommendForUser = userMoviePredictions.na().drop().orderBy(col("prediction").desc()).limit(5).join(movies, "movieId")
.select(col("userId"), col("movieId"), col("title"), col("release_year"), col("prediction").alias("rating"))
    

In [158]:
recommendForUser.show(5, false)

+------+-------+-------------------------------------------------------+------------+--------+
|userId|movieId|title                                                  |release_year|rating  |
+------+-------+-------------------------------------------------------+------------+--------+
|150   |5746   |Galaxy of Terror (Quest) (1981)                        |1981        |8.373725|
|150   |5764   |Looker (1981)                                          |1981        |7.536352|
|150   |40491  |Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990)|1990        |7.452832|
|150   |136850 |Villain (1971)                                         |1971        |7.304771|
|150   |143367 |Silence (2016)                                         |2016        |6.707549|
+------+-------+-------------------------------------------------------+------------+--------+



In [25]:
fun recommendMovies(model: ALSModel, ratings: Dataset<Row>, movies: Dataset<Row>, user: Int, nbRecommendations: Int): Dataset<Row?>? {
    val dataSet = ratings.select("movieId").distinct().withColumn("userId", org.jetbrains.kotlinx.spark.api.lit(user))
    
    val moviesAlreadyRated = ratings.filter("userId == $user").select(col("movieId"), col("userId"))
    
    val notRated = dataSet.exceptAll(moviesAlreadyRated) 
    
    val predictions = model.transform(notRated)
            .na()
            .drop()
            .orderBy(col("prediction").desc())
            .limit(nbRecommendations)
            .select(col("movieId"), col("prediction"))
    
    val recommendations = predictions.join(movies, "movieId").select(col("movieId"), col("title"), col("genres"), col("prediction"))
    
    return recommendations
}

In [27]:
recommendMovies(model, ratings, movies, 133, 10)?.show()

+-------+--------------------+--------------------+----------+
|movieId|               title|              genres|prediction|
+-------+--------------------+--------------------+----------+
|   3567|   Bossa Nova (2000)|[Comedy, Drama, R...|  5.308017|
|   5181|    Hangar 18 (1980)|[Action, Sci-Fi, ...| 6.3902907|
|   5746|Galaxy of Terror ...|[Action, Horror, ...| 6.3902907|
|   5919|      Android (1982)|            [Sci-Fi]| 6.3902907|
|   6835|Alien Contaminati...|[Action, Horror, ...| 6.3902907|
|   7899|Master of the Fly...|            [Action]| 5.7512617|
|   8477|    Jetée, La (1962)|   [Romance, Sci-Fi]|  4.793838|
|  26810|Bad Boy Bubby (1993)|             [Drama]|  5.036862|
| 136850|      Villain (1971)|[Crime, Drama, Th...|  5.661717|
| 142422|The Night Before ...|            [Comedy]|  5.230884|
+-------+--------------------+--------------------+----------+



In [13]:
val dataSet = ratings.select("movieId").distinct().withColumn("userId", org.jetbrains.kotlinx.spark.api.lit(133))

In [15]:
dataSet.show(1)

+-------+------+
|movieId|userId|
+-------+------+
|   1580|   133|
+-------+------+
only showing top 1 row



In [17]:
val moviesAlreadyRated = ratings.filter("userId == 133").select(col("movieId"), col("userId"))

In [19]:
moviesAlreadyRated.show(1)

+-------+------+
|movieId|userId|
+-------+------+
|     32|   133|
+-------+------+
only showing top 1 row



In [20]:
val notRated = dataSet.exceptAll(moviesAlreadyRated)

In [21]:
notRated.show(5)

+-------+------+
|movieId|userId|
+-------+------+
|   4624|   133|
|  33132|   133|
|   6300|   133|
|  33679|   133|
| 158254|   133|
+-------+------+
only showing top 5 rows



In [22]:
val predictions = model.transform(notRated)
            .na()
            .drop()
            .orderBy(col("prediction").desc())
            .limit(10)
            .select(col("movieId"), col("prediction"))

In [23]:
predictions.show(5)

+-------+----------+
|movieId|prediction|
+-------+----------+
|   6835| 6.3902907|
|   5919| 6.3902907|
|   5181| 6.3902907|
|   5746| 6.3902907|
|   7899| 5.7512617|
+-------+----------+
only showing top 5 rows



In [24]:
predictions.join(movies, "movieId").select(col("movieId"), col("title"), col("genres"), col("prediction")).show()

+-------+--------------------+--------------------+----------+
|movieId|               title|              genres|prediction|
+-------+--------------------+--------------------+----------+
|   3567|   Bossa Nova (2000)|[Comedy, Drama, R...|  5.308017|
|   5181|    Hangar 18 (1980)|[Action, Sci-Fi, ...| 6.3902907|
|   5746|Galaxy of Terror ...|[Action, Horror, ...| 6.3902907|
|   5919|      Android (1982)|            [Sci-Fi]| 6.3902907|
|   6835|Alien Contaminati...|[Action, Horror, ...| 6.3902907|
|   7899|Master of the Fly...|            [Action]| 5.7512617|
|   8477|    Jetée, La (1962)|   [Romance, Sci-Fi]|  4.793838|
|  26810|Bad Boy Bubby (1993)|             [Drama]|  5.036862|
| 136850|      Villain (1971)|[Crime, Drama, Th...|  5.661717|
| 142422|The Night Before ...|            [Comedy]|  5.230884|
+-------+--------------------+--------------------+----------+

