In [1]:
// Read the ratings file as an RDD of text lines, then print the first line
// to inspect the record layout.
val rawData = sc.textFile(path = "ml-100k/u.data")
val firstLine = rawData.first()
println(firstLine)

196	242	3	881250949


In [2]:
// Keep only the first three tab-separated fields of each line.
val rawRatings = rawData.map(_.split("\t").take(3))
// An Array's default toString prints only its JVM reference, so join the
// fields to show the actual values.
println(rawRatings.first().mkString(", "))

196, 242, 3


In [3]:
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

// Convert each three-field string record into an MLlib Rating
// (user and movie ids as Int, rating as Double).
val ratings = rawRatings.map { fields =>
    val Array(user, movie, rating) = fields
    Rating(user.toInt, movie.toInt, rating.toDouble)
}

In [4]:
// Print the first parsed Rating as a sanity check.
val firstRating = ratings.first()
println(firstRating)

Rating(196,242,3.0)


In [5]:
// ALS hyperparameters, named so the training call is self-describing.
val rank = 50          // number of latent factors
val numIterations = 10 // ALS iterations
val lambda = 0.01      // regularization parameter

// Train the matrix-factorization model on the parsed ratings.
val model = ALS.train(ratings, rank, numIterations, lambda)
// Prints the RDD's description, not its contents.
println(model.userFeatures)

users MapPartitionsRDD[210] at mapValues at ALS.scala:255


In [6]:
// Number of user factor vectors in the trained model.
println(model.userFeatures.count())

943


In [7]:
// Number of product (movie) factor vectors in the trained model.
println(model.productFeatures.count())

1682


In [8]:
// Predicted rating of user 789 for product 123.
val predictedRating = model.predict(user = 789, product = 123)
println(predictedRating)

2.727786107746448


In [9]:
// User to recommend for, and how many recommendations to request.
val userId = 789
val K = 10
val topKRecs = model.recommendProducts(user = userId, num = K)

// One recommendation per line.
val recommendationLines = topKRecs.mkString("\n")
println(recommendationLines)

Rating(789,87,5.532445795990224)
Rating(789,56,5.3841005086294444)
Rating(789,715,5.135608782660923)
Rating(789,246,5.052365822874689)
Rating(789,127,5.021809954747658)
Rating(789,518,4.991299670310026)
Rating(789,13,4.981358707538808)
Rating(789,475,4.975747453008007)
Rating(789,100,4.963449395933944)
Rating(789,428,4.963430699869444)


In [10]:
// Build an id -> title map from the pipe-delimited item file:
// field 0 is the numeric movie id, field 1 is the title.
val movies = sc.textFile("ml-100k/u.item")
val titles = movies
    .map { line =>
        val fields = line.split("\\|")
        (fields(0).toInt, fields(1))
    }
    .collectAsMap()
println(titles(123))

Frighteners, The (1996)


In [11]:
// All ratings given by the user the recommendations were generated for.
// Uses userId rather than repeating the literal so both stay in sync.
val moviesForUser = ratings.keyBy(_.user).lookup(userId)
// How many movies this user has rated.
println(moviesForUser.size)

33


In [12]:
// The user's K highest-rated movies as (title, rating) pairs, highest first.
// Uses K (not a literal) so this list has the same length as the top-K
// recommendations it is compared with.
moviesForUser
    .sortBy(-_.rating)
    .take(K)
    .map(rating => (titles(rating.product), rating.rating))
    .foreach(println)

(Godfather, The (1972),5.0)
(Trainspotting (1996),5.0)
(Dead Man Walking (1995),5.0)
(Star Wars (1977),5.0)
(Swingers (1996),5.0)
(Leaving Las Vegas (1995),5.0)
(Bound (1996),5.0)
(Fargo (1996),5.0)
(Last Supper, The (1995),5.0)
(Private Parts (1997),4.0)


In [13]:
// Show the recommendations as (title, predicted rating) pairs.
topKRecs
    .map { case Rating(_, product, score) => (titles(product), score) }
    .foreach(println)

(Searching for Bobby Fischer (1993),5.532445795990224)
(Pulp Fiction (1994),5.3841005086294444)
(To Die For (1995),5.135608782660923)
(Chasing Amy (1997),5.052365822874689)
(Godfather, The (1972),5.021809954747658)
(Miller's Crossing (1990),4.991299670310026)
(Mighty Aphrodite (1995),4.981358707538808)
(Trainspotting (1996),4.975747453008007)
(Fargo (1996),4.963449395933944)
(Harold and Maude (1971),4.963430699869444)


In [14]:
// Cosine similarity approach
import org.jblas.DoubleMatrix
// Small example: build a jblas vector from a plain Array[Double] and print it.
val exampleValues = Array(1.0, 2.0, 3.0)
val aMatrix = new DoubleMatrix(exampleValues)
println(aMatrix)

[1.000000; 2.000000; 3.000000]


In [16]:
/**
 * Cosine similarity between two vectors: their dot product divided by the
 * product of their Euclidean (L2) norms.
 *
 * @param vec1 first vector
 * @param vec2 second vector
 * @return the cosine similarity, or 0.0 when either vector has zero norm
 *         (the ratio would otherwise be 0/0 = NaN)
 */
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
    val normProduct = vec1.norm2() * vec2.norm2()
    // Guard the zero-norm case so NaN never reaches callers that rank by similarity.
    if (normProduct == 0.0) 0.0
    else vec1.dot(vec2) / normProduct
}

// Fetch the factor array of one item and wrap it in a jblas vector.
val itemId = 567
val itemFactor = model.productFeatures.lookup(itemId).head
val itemVector = new DoubleMatrix(itemFactor)

// Sanity check: similarity of the item's vector with itself.
val selfSimilarity = cosineSimilarity(itemVector, itemVector)
println(selfSimilarity)

1.0000000000000002


In [19]:
// Similarity of every item's factor vector to the chosen item's vector,
// as (item id, similarity) pairs.
val sims = model.productFeatures.map { case (id, factor) =>
    (id, cosineSimilarity(new DoubleMatrix(factor), itemVector))
}

// The K pairs with the highest similarity.
val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double](_._2))

// top(K) already bounds the result, so print it as-is instead of
// re-truncating to a hard-coded 10.
println(sortedSims.mkString("\n"))

(567,1.0000000000000002)
(352,0.699628893722735)
(219,0.6937744401062669)
(436,0.6917255055466252)
(475,0.6827195734151007)
(335,0.6798762337271753)
(343,0.6690790746432513)
(1083,0.6686899502228364)
(813,0.6678519651274419)
(1007,0.6624126366680484)


In [20]:
// Title of the item whose neighbours are being listed.
val itemTitle = titles(itemId)
println(itemTitle)

Wes Craven's New Nightmare (1994)


In [21]:
// Request one extra result because the query item itself is expected to be
// among the most similar items.
val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double](_._2))

// Drop the query item by id (rather than assuming it is ranked first), keep
// K neighbours, and print them as (title, similarity) pairs, one per line.
println(
    sortedSims2
        .filter { case (id, _) => id != itemId }
        .take(K)
        .map { case (id, sim) => (titles(id), sim) }
        .mkString("\n")
)

(Spice World (1997),0.699628893722735)
(Nightmare on Elm Street, A (1984),0.6937744401062669)
(American Werewolf in London, An (1981),0.6917255055466252)
(Trainspotting (1996),0.6827195734151007)
(How to Be a Player (1997),0.6798762337271753)
(Alien: Resurrection (1997),0.6690790746432513)
(Albino Alligator (1996),0.6686899502228364)
(Celluloid Closet, The (1995),0.6678519651274419)
(Waiting for Guffman (1996),0.6624126366680484)
(Scream (1996),0.6613601339010805)