In [27]:
// Most watched movie

final case class Movie(userID: Int, movieID: Int, rating:Int,timestamp:Long )

defined class Movie


In [28]:
import org.apache.spark.sql.types.{StructType, IntegerType, LongType}
val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.as[Movie]

ds.show(5)

val topMovies = ds.groupBy("movieID").count().orderBy(desc("count"))

topMovies.show(10)

+------+-------+------+---------+
|userID|movieID|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
+-------+-----+
only showing top 10 rows



import org.apache.spark.sql.types.{StructType, IntegerType, LongType}
ds: org.apache.spark.sql.Dataset[Movie] = [userID: int, movieID: int ... 2 more fields]
topMovies: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [movieID: int, count: bigint]


In [32]:
//-----------------------------------------------------------------
// Join movie name: faslt lookup using in memory broadcasted map
import scala.io.{Codec, Source}
import org.apache.spark.sql.functions.{col, udf}

def loadMovieNames(): Map[Int, String] = {
    // Codec of the file
    implicit val codec = Codec("ISO-8859-1")
    
    var movieNames:Map[Int,String] = Map()
    
    // Load the Map with the content of the file
    val lines = Source.fromFile("data/ml-100k/u.item")
    for (line <- lines.getLines()){
        val fields = line.split("|")
        if(fields.length > 1){
            movieNames += (fields(0).toInt -> fields(1))
        }
    }
    lines.close()
    movieNames
}

val nameDict = sc.broadcast(loadMovieNames())

val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.as[Movie]

ds.show(5)

val movieCounts = ds.groupBy("movieID").count()

// User Defined Function
val lookupName : Int => String = (movieId:Int) => {
    nameDict.value(movieId)
}

// Wrap it with a UDF
val lookupNameUDF = udf(lookupName)

// Add movie title from the lookup table
val movieCountsWithName = movieCounts.withColumn("movieTitle",lookupNameUDF(col("movieId")) )

val topMovies = movieCountsWithName.sort(desc("count"))

//topMovies.show(5) // Exception =/
println(topMovies.count())

+------+-------+------+---------+
|userID|movieID|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

1682


import scala.io.{Codec, Source}
import org.apache.spark.sql.functions.{col, udf}
loadMovieNames: ()Map[Int,String]
nameDict: org.apache.spark.broadcast.Broadcast[Map[Int,String]] = Broadcast(117)
ds: org.apache.spark.sql.Dataset[Movie] = [userID: int, movieID: int ... 2 more fields]
movieCounts: org.apache.spark.sql.DataFrame = [movieID: int, count: bigint]
lookupName: Int => String = $Lambda$4633/0x00000008418ed040@2ad2766c
lookupNameUDF: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4633/0x00000008418ed040@2ad2766c,StringType,List(Some(class[value[0]: int])),Some(class[value[0]: string]),None,true,true)
movieCountsWithName: org.apache.spark.sql.DataFrame = [movieID: int, count: bigint ... 1 more field]
topMovies: org.apache.spark.sql.Dataset[o...


In [34]:
//-----------------------------------------------------------------
// Let's try the above with joins
case class MovieRating(movieID: Int)
case class MovieItem(movieID: Int, movieTitle:String)

defined class MovieRating
defined class MovieItem


In [67]:
import org.apache.spark.sql.types.{StructType, IntegerType, LongType, StringType}
val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.select("movieID")
.as[MovieRating]

ds.show(5)

val dsTitle = spark.read
.option("sep", "|")
.schema(
    new StructType()
    .add("movieID", IntegerType, nullable= true)
    .add("movieTitle", StringType, nullable= true)
    .add("releaseDate", StringType, nullable= true)
    .add("empty", StringType, nullable= true)
    .add("url", StringType, nullable= true)
)
.csv("data/ml-100k/u.item")
.select("movieID", "movieTitle")
.as[MovieItem]

dsTitle.show(5)

val topMovies = ds.groupBy("movieID").count()
.join(dsTitle, "movieId")
.sort(desc("count"))
.select(
    col("movieID").alias("ID"),
    col("movieTitle").alias("Movie Name"),
    col("count").alias("# of Ratings")
)

topMovies.show(10)

//val dsJoin = ds.join(dsTitle, "movieId")
//dsJoin.show(5)

+-------+
|movieID|
+-------+
|    242|
|    302|
|    377|
|     51|
|    346|
+-------+
only showing top 5 rows

+-------+-----------------+
|movieID|       movieTitle|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows

+---+--------------------+------------+
| ID|          Movie Name|# of Ratings|
+---+--------------------+------------+
| 50|    Star Wars (1977)|         583|
|258|      Contact (1997)|         509|
|100|        Fargo (1996)|         508|
|181|Return of the Jed...|         507|
|294|    Liar Liar (1997)|         485|
|286|English Patient, ...|         481|
|288|       Scream (1996)|         478|
|  1|    Toy Story (1995)|         452|
|300|Air Force One (1997)|         431|
|121|Independence Day ...|         429|
+---+--------------------+------------+
only showing top 10 rows



import org.apache.spark.sql.types.{StructType, IntegerType, LongType, StringType}
ds: org.apache.spark.sql.Dataset[MovieRating] = [movieID: int]
dsTitle: org.apache.spark.sql.Dataset[MovieItem] = [movieID: int, movieTitle: string]
topMovies: org.apache.spark.sql.DataFrame = [ID: int, Movie Name: string ... 1 more field]
