In [1]:
%output --no-stdout

In [2]:
@file:Repository("https://binrepo.target.com/artifactory/gradle")
@file:Repository("https://binrepo.target.com/artifactory/maven-central")
@file:Repository("https://binrepo.target.com/artifactory/jcenter")
@file:Repository("https://binrepo.target.com/artifactory/jitpack-maven")
@file:Repository("https://binrepo.target.com/artifactory/kotlin-maven")
@file:Repository("https://binrepo.target.com/artifactory/apache-maven")
@file:Repository("https://binrepo.target.com/artifactory/jitpack")
%use spark

In [3]:
%output --reset-to-defaults
@file:DependsOn("org.jetbrains.kotlinx.spark:kotlin-spark-api-3.0.0_2.12:1.0.0-preview1")

In [4]:
import org.jetbrains.kotlinx.spark.api.*
import org.apache.spark.sql.functions.*
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.recommendation.ALSModel

In [5]:
val ratingFile = "data/ml-latest-small/ratings.csv"
val movieFile = "data/ml-latest-small/movies.csv"
val linkFile = "data/ml-latest-small/links.csv"
val tagFile = "data/ml-latest-small/tags.csv"

In [6]:
val spark = SparkSession
.builder()
.master("local[*]")
.appName("Recommender").orCreate

In [99]:
val ratings = spark
        .read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(ratingFile)

In [100]:
val movies = spark
        .read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(movieFile)
        .withColumn("release_year", regexp_extract(col("title"), "\\s?\\((\\d{4})\\)", 1))
        .withColumn("title", regexp_replace(col("title"), "\\s?\\((\\d)\\)", ""))
        .withColumn("genres", split(col("genres"), "\\|"))

In [101]:
val (training, test) = ratings.randomSplit(doubleArrayOf(0.8, 0.2))

In [102]:
val als = ALS()
    .setMaxIter(5)
    .setRegParam(0.01)
    .setUserCol("userId")
    .setItemCol("movieId")
    .setRatingCol("rating")

In [103]:
val model = als.fit(training)

In [104]:
model.setColdStartStrategy("drop")

als_c340727be9a9

In [105]:
val predictions = model.transform(test)

In [106]:
val evaluator = RegressionEvaluator()
    .setMetricName("rmse")
    .setLabelCol("rating")
    .setPredictionCol("prediction")

In [107]:
val rmse = evaluator.evaluate(predictions)

In [108]:
println("root-mean-square error: $rmse")

root-mean-square error: 1.0854890520658376


In [60]:
val userRecs = model.recommendForAllUsers(10)

In [61]:
val movieRecs = model.recommendForAllItems(10)

In [62]:
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[25850, 8.084744...|
|   463|[[4649, 6.995293]...|
|   496|[[932, 10.834126]...|
|   148|[[50601, 6.198351...|
|   540|[[8633, 7.160297]...|
|   392|[[2126, 13.096552...|
|   243|[[2867, 12.202643...|
|    31|[[742, 7.3031893]...|
|   516|[[232, 9.474438],...|
|   580|[[3067, 6.6261635...|
|   251|[[8633, 12.470896...|
|   451|[[25850, 8.382637...|
|    85|[[1254, 6.286975]...|
|   137|[[417, 6.7616487]...|
|    65|[[932, 6.8192816]...|
|   458|[[1254, 8.257918]...|
|   481|[[5080, 9.849588]...|
|    53|[[535, 10.929663]...|
|   255|[[70946, 9.547336...|
|   588|[[3618, 6.7405443...|
+------+--------------------+
only showing top 20 rows



In [63]:
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[531, 5.2984786]...|
|   4900|[[598, 7.714996],...|
|   5300|[[461, 8.937195],...|
|   6620|[[531, 8.404721],...|
|   7340|[[259, 6.389276],...|
|  32460|[[278, 6.8668966]...|
|  54190|[[598, 7.388963],...|
|    471|[[531, 9.030875],...|
|   1591|[[126, 7.994179],...|
|   1342|[[77, 8.232385], ...|
|   2122|[[399, 10.527672]...|
|   2142|[[259, 9.31655], ...|
|   7982|[[258, 11.222481]...|
|  44022|[[598, 8.375917],...|
| 141422|[[295, 4.0579953]...|
| 144522|[[120, 4.384484],...|
|    833|[[259, 7.3443036]...|
|   5803|[[392, 8.139914],...|
|   7993|[[393, 5.033613],...|
|  76143|[[549, 7.6078835]...|
+-------+--------------------+
only showing top 20 rows



In [25]:
userRecs
    .withColumn("rec", explode(col("recommendations")))
    .select(col("userId"), col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
    .join(movies, "movieId")
    .orderBy(col("rating").desc())
    .select("userId", "movieId", "title", "release_year").show(10, false)

+------+-------+-----------------------------------------------+------------+
|userId|movieId|title                                          |release_year|
+------+-------+-----------------------------------------------+------------+
|485   |5080   |Slackers (2002)                                |2002        |
|461   |4941   |Flash Gordon (1980)                            |1980        |
|485   |135861 |Ted 2 (2015)                                   |2015        |
|461   |89118  |Skin I Live In, The (La piel que habito) (2011)|2011        |
|461   |1866   |Big Hit, The (1998)                            |1998        |
|461   |46062  |High School Musical (2006)                     |2006        |
|81    |86320  |Melancholia (2011)                             |2011        |
|461   |25771  |Andalusian Dog, An (Chien andalou, Un) (1929)  |1929        |
|461   |26258  |Topo, El (1970)                                |1970        |
|485   |2122   |Children of the Corn (1984)                    |

In [109]:
val users = ratings.filter("userId == 99999").select(als.getUserCol()).distinct()

In [110]:
val userSubsetRecs = model.recommendForUserSubset(users, 10)

In [111]:
userSubsetRecs
    .withColumn("rec", explode(col("recommendations")))
    .select(col("userId"), col("rec.movieId").alias("movieId"), col("rec.rating").alias("rating"))
    .join(movies, "movieId")
    .orderBy(col("rating").desc())
    .select("userId", "movieId", "title", "release_year").orderBy(col("userId")).show(30, false)

+------+-------+-------------------------------+------------+
|userId|movieId|title                          |release_year|
+------+-------+-------------------------------+------------+
|99999 |4109   |Flowers in the Attic (1987)    |1987        |
|99999 |3685   |Prizzi's Honor (1985)          |1985        |
|99999 |94677  |Dictator, The (2012)           |2012        |
|99999 |117590 |Horrible Bosses 2 (2014)       |2014        |
|99999 |7018   |Presumed Innocent (1990)       |1990        |
|99999 |5666   |Rules of Attraction, The (2002)|2002        |
|99999 |801    |Harriet the Spy (1996)         |1996        |
|99999 |104879 |Prisoners (2013)               |2013        |
|99999 |4974   |Not Another Teen Movie (2001)  |2001        |
|99999 |56145  |Mist, The (2007)               |2007        |
+------+-------+-------------------------------+------------+



In [98]:
movies.filter(col("title").like("Big%")).show()

+-------+--------------------+--------------------+------------+
|movieId|               title|              genres|release_year|
+-------+--------------------+--------------------+------------+
|     54|Big Green, The (1...|  [Children, Comedy]|        1995|
|     75|    Big Bully (1996)|     [Comedy, Drama]|        1996|
|    994|    Big Night (1996)|     [Comedy, Drama]|        1996|
|   1216|Big Blue, The (Gr...|[Adventure, Drama...|        1988|
|   1284|Big Sleep, The (1...|[Crime, Film-Noir...|        1946|
|   1732|Big Lebowski, The...|     [Comedy, Crime]|        1998|
|   1827| Big One, The (1997)|[Comedy, Document...|        1997|
|   1866| Big Hit, The (1998)|[Action, Comedy, ...|        1998|
|   2352|Big Chill, The (1...|     [Comedy, Drama]|        1983|
|   2694|    Big Daddy (1999)|            [Comedy]|        1999|
|   2797|          Big (1988)|[Comedy, Drama, F...|        1988|
|   3240|Big Tease, The (1...|            [Comedy]|        1999|
|   3368|Big Country, The

In [36]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows

