In [1]:
val movies = sc.textFile("ml-100k/u.item")
print(movies.first)

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

In [2]:
val genres = sc.textFile("ml-100k/u.genre")
genres.take(5).foreach(println)

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4


In [3]:
val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).map(array => (array(1), array(0))).collectAsMap
println(genreMap)

Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, 7 -> Documentary, 17 -> War, 1 -> Action, 4 -> Children's, 11 -> Horror, 14 -> Romance, 6 -> Crime, 0 -> unknown, 9 -> Fantasy, 16 -> Thriller, 3 -> Animation, 10 -> Film-Noir, 13 -> Mystery)


In [4]:
val titlesAndGenres = movies.map(_.split("\\|")).map{ array =>
    val genres = array.toSeq.slice(5, array.size)
    val genresAssigned = genres.zipWithIndex.filter{ case (g, idx) =>
        g == "1"
    }.map { case (g, idx) => 
        genreMap(idx.toString)
    }
    (array(0).toInt, (array(1), genresAssigned))
}
println(titlesAndGenres.first)

(1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))


In [5]:
//Training the recommendation model
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

val rawData = sc.textFile("ml-100k/u.data")
val rawRatings = rawData.map(_.split("\t").take(3))
val ratings = rawRatings.map{ case Array(user, movie, rating) =>
    Rating(user.toInt, movie.toInt, rating.toDouble)
}
ratings.cache
val alsModel = ALS.train(ratings, 50, 10, 0.1)

In [6]:
import org.apache.spark.mllib.linalg.Vectors
val movieFactors = alsModel.productFeatures.map{ case (id, factor) =>
    (id, Vectors.dense(factor))
}
val movieVectors = movieFactors.map(_._2)
val userFactors = alsModel.userFeatures.map { case (id, factor) =>
    (id, Vectors.dense(factor))
}
val userVectors = userFactors.map(_._2)

In [8]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val movieMatrix = new RowMatrix(movieVectors)
val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
val userMatrix = new RowMatrix(userVectors)
val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
println("Movie factors mean: " + movieMatrixSummary.mean)
println("Movie factors variance: " + movieMatrixSummary.variance)
println("User factors mean: " + userMatrixSummary.mean)
println("User factors variance: " + userMatrixSummary.variance)

Movie factors mean: [0.2023456278013444,-0.04963738562893034,0.08218398786665168,0.11263451117872496,-0.15559253068595333,-0.2447971873733965,-0.059198443220822884,-0.028449318549935192,-0.4471871985258771,0.059438231975972075,0.12556779141307087,0.005243797720868528,-0.1769953971413361,-0.401243887483181,0.35760016906530323,0.05452279459350239,-0.10510818999507471,0.24417377235132057,-0.13876186639707522,-0.031620801613490936,-0.14214234602334155,-0.021193827218173998,-0.31084781261321554,-0.08879711594943235,-0.06509388520308639,-0.06435760057926557,0.18619553213960352,-0.03663261717278913,-0.4118309044146445,0.11243916438295269,0.1645287189366628,0.16689908518896118,-0.032056633921664156,0.3814835926721606,-0.28709520656198073,0.3199182076736584,-0.02332507786305151,0.11282535358053003,0.07478791526876594,0.05052759392713639,-0.051556927091664796,0.1392942289706306,-0.1897084360341342,0.0744083964165318,0.36767396360210647,-0.04342123442971324,0.0815284416411412,0.3062136001256765,-

In [9]:
import org.apache.spark.mllib.clustering.KMeans
val numClusters = 5
val numIterations = 10
val numRuns = 3

val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)

In [10]:
val movieClusterModelConverged = KMeans.train(movieVectors, numClusters, 100)

In [11]:
val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

In [12]:
val movie1 = movieVectors.first
val movieCluster = movieClusterModel.predict(movie1)
println(movieCluster)

1


In [13]:
val predictions = movieClusterModel.predict(movieVectors)
println(predictions.take(10).mkString(","))

1,1,1,1,1,0,4,1,2,2


In [14]:
import breeze.linalg._
import breeze.numerics.pow

def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

In [15]:
val titlesWithFactors = titlesAndGenres.join(movieFactors)
val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
    val pred = movieClusterModel.predict(vector)
    val clusterCentre = movieClusterModel.clusterCenters(pred)
    val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray))
    (id, title, genres.mkString(" "), pred, dist)
}
val clusterAssignments = moviesAssigned.groupBy{ case (id, title, genres, cluster, dist) => cluster }.collectAsMap

In [17]:
for((k, v) <- clusterAssignments.toSeq.sortBy(_._1)){
    println(s"Cluster $k:")
    val m = v.toSeq.sortBy(_._5)
    println(m.take(20).map{ case (_, title, genres, _, d) =>
        (title, genres, d)}.mkString("\n"))
    println("======\n")
}

Cluster 0:
(All Over Me (1997),Drama,0.14804395282062405)
(Land and Freedom (Tierra y libertad) (1995),War,0.29204716029790245)
(Eighth Day, The (1996),Drama,0.29204716029790245)
(Dadetown (1995),Documentary,0.29204716029790245)
(Big One, The (1997),Comedy Documentary,0.29204716029790245)
(� k�ldum klaka (Cold Fever) (1994),Comedy Drama,0.29204716029790245)
(Girls Town (1996),Drama,0.29204716029790245)
(Silence of the Palace, The (Saimt el Qusur) (1994),Drama,0.29204716029790245)
(Normal Life (1996),Crime Drama,0.29204716029790245)
(Two Friends (1986) ,Drama,0.29204716029790245)
(Hana-bi (1997),Comedy Crime Drama,0.29204716029790245)
(Niagara, Niagara (1997),Drama,0.3046557942270118)
(Etz Hadomim Tafus (Under the Domin Tree) (1994),Drama,0.3046557942270118)
(All Things Fair (1996),Drama,0.3058774395384473)
(Foreign Student (1994),Drama,0.3148823855314407)
(Dream Man (1995),Thriller,0.3173035728772602)
(Angela (1995),Drama,0.3437517241732923)
(Collectionneuse, La (1967),Drama,0.37696975

In [19]:
//Evaluation
val movieCost = movieClusterModel.computeCost(movieVectors)
val userCost = userClusterModel.computeCost(userVectors)
println("WCSS for movies: " + movieCost)
println("WCSS for users: " + userCost)

WCSS for movies: 2274.276237774483
WCSS for users: 1482.1828109816408


In [21]:
//Finding K through Cross-Validation

val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
val trainMovies = trainTestSplitMovies(0)
val testMovies = trainTestSplitMovies(1)
val costMovies = Seq(2, 3, 4, 5, 10, 20).map{ k =>
    (k, KMeans.train(trainMovies, numIterations, k, numRuns).computeCost(testMovies))
}
println("Movie clustering cross-validation:")
costMovies.foreach{ case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f")}

Movie clustering cross-validation:
WCSS for K=2 id 897.28
WCSS for K=3 id 892.05
WCSS for K=4 id 880.34
WCSS for K=5 id 877.58
WCSS for K=10 id 878.58
WCSS for K=20 id 883.24


In [22]:
val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
val trainUsers = trainTestSplitUsers(0)
val testUsers = trainTestSplitUsers(1)

val costUsers = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.train(trainUsers, numIterations, k, numRuns).computeCost(testUsers))}
println("User clustering cross-validation:")
costUsers.foreach{ case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f")}

User clustering cross-validation:
WCSS for K=2 id 571.19
WCSS for K=3 id 569.72
WCSS for K=4 id 570.33
WCSS for K=5 id 564.48
WCSS for K=10 id 565.75
WCSS for K=20 id 560.95
