In [1]:
// Most watched movie

final case class Movie(userID: Int, movieID: Int, rating:Int,timestamp:Long )

Intitializing Scala interpreter ...

Spark Web UI available at http://78598ab614c0:4041
SparkContext available as 'sc' (version = 3.1.1, master = local[*], app id = local-1625707563495)
SparkSession available as 'spark'


defined class Movie


In [2]:
import org.apache.spark.sql.types.{StructType, IntegerType, LongType}
val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.as[Movie]

ds.show(5)

val topMovies = ds.groupBy("movieID").count().orderBy(desc("count"))

topMovies.show(10)

+------+-------+------+---------+
|userID|movieID|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
+-------+-----+
only showing top 10 rows



import org.apache.spark.sql.types.{StructType, IntegerType, LongType}
ds: org.apache.spark.sql.Dataset[Movie] = [userID: int, movieID: int ... 2 more fields]
topMovies: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [movieID: int, count: bigint]


In [3]:
//-----------------------------------------------------------------
// Join movie name: faslt lookup using in memory broadcasted map
import scala.io.{Codec, Source}
import org.apache.spark.sql.functions.{col, udf}

def loadMovieNames(): Map[Int, String] = {
    // Codec of the file
    implicit val codec = Codec("ISO-8859-1")
    
    var movieNames:Map[Int,String] = Map()
    
    // Load the Map with the content of the file
    val lines = Source.fromFile("data/ml-100k/u.item")
    for (line <- lines.getLines()){
        val fields = line.split("|")
        if(fields.length > 1){
            movieNames += (fields(0).toInt -> fields(1))
        }
    }
    lines.close()
    movieNames
}

val nameDict = sc.broadcast(loadMovieNames())

val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.as[Movie]

ds.show(5)

val movieCounts = ds.groupBy("movieID").count()

// User Defined Function
val lookupName : Int => String = (movieId:Int) => {
    nameDict.value(movieId)
}

// Wrap it with a UDF
val lookupNameUDF = udf(lookupName)

// Add movie title from the lookup table
val movieCountsWithName = movieCounts.withColumn("movieTitle",lookupNameUDF(col("movieId")) )

val topMovies = movieCountsWithName.sort(desc("count"))

//topMovies.show(5) // Exception =/
println(topMovies.count())

+------+-------+------+---------+
|userID|movieID|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

1682


import scala.io.{Codec, Source}
import org.apache.spark.sql.functions.{col, udf}
loadMovieNames: ()Map[Int,String]
nameDict: org.apache.spark.broadcast.Broadcast[Map[Int,String]] = Broadcast(5)
ds: org.apache.spark.sql.Dataset[Movie] = [userID: int, movieID: int ... 2 more fields]
movieCounts: org.apache.spark.sql.DataFrame = [movieID: int, count: bigint]
lookupName: Int => String = $Lambda$4162/0x0000000841684840@635d841
lookupNameUDF: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4162/0x0000000841684840@635d841,StringType,List(Some(class[value[0]: int])),Some(class[value[0]: string]),None,true,true)
movieCountsWithName: org.apache.spark.sql.DataFrame = [movieID: int, count: bigint ... 1 more field]
topMovies: org.apache.spark.sql.Dataset[org.a...


In [4]:
//-----------------------------------------------------------------
// Let's try the above with joins
case class MovieRating(movieID: Int)
case class MovieItem(movieID: Int, movieTitle:String)

defined class MovieRating
defined class MovieItem


In [5]:
import org.apache.spark.sql.types.{StructType, IntegerType, LongType, StringType}
val ds = spark.read
.option("sep", "\t")
.schema(
    new StructType()
    .add("userID", IntegerType, nullable= true)
    .add("movieID", IntegerType, nullable= true)
    .add("rating", IntegerType, nullable= true)
    .add("timestamp", LongType, nullable = true)
)
.csv("data/ml-100k/u.data")
.select("movieID")
.as[MovieRating]

ds.show(5)

val dsTitle = spark.read
.option("sep", "|")
.schema(
    new StructType()
    .add("movieID", IntegerType, nullable= true)
    .add("movieTitle", StringType, nullable= true)
    .add("releaseDate", StringType, nullable= true)
    .add("empty", StringType, nullable= true)
    .add("url", StringType, nullable= true)
)
.csv("data/ml-100k/u.item")
.select("movieID", "movieTitle")
.as[MovieItem]

dsTitle.show(5)

val topMovies = ds.groupBy("movieID").count()
.join(dsTitle, "movieId")
.sort(desc("count"))
.select(
    col("movieID").alias("ID"),
    col("movieTitle").alias("Movie Name"),
    col("count").alias("# of Ratings")
)

topMovies.show(10)

+-------+
|movieID|
+-------+
|    242|
|    302|
|    377|
|     51|
|    346|
+-------+
only showing top 5 rows

+-------+-----------------+
|movieID|       movieTitle|
+-------+-----------------+
|      1| Toy Story (1995)|
|      2| GoldenEye (1995)|
|      3|Four Rooms (1995)|
|      4|Get Shorty (1995)|
|      5|   Copycat (1995)|
+-------+-----------------+
only showing top 5 rows

+---+--------------------+------------+
| ID|          Movie Name|# of Ratings|
+---+--------------------+------------+
| 50|    Star Wars (1977)|         583|
|258|      Contact (1997)|         509|
|100|        Fargo (1996)|         508|
|181|Return of the Jed...|         507|
|294|    Liar Liar (1997)|         485|
|286|English Patient, ...|         481|
|288|       Scream (1996)|         478|
|  1|    Toy Story (1995)|         452|
|300|Air Force One (1997)|         431|
|121|Independence Day ...|         429|
+---+--------------------+------------+
only showing top 10 rows



import org.apache.spark.sql.types.{StructType, IntegerType, LongType, StringType}
ds: org.apache.spark.sql.Dataset[MovieRating] = [movieID: int]
dsTitle: org.apache.spark.sql.Dataset[MovieItem] = [movieID: int, movieTitle: string]
topMovies: org.apache.spark.sql.DataFrame = [ID: int, Movie Name: string ... 1 more field]


## Superhero Graph

In [6]:
case class GraphLine(line: String)
case class GraphName(id: String, name: String)

defined class GraphLine
defined class GraphName


In [7]:
// Find the hero with most connections

import org.apache.spark.sql.types.{StructType, StringType}
    
val raw = spark.read
.schema(new StructType().add("line", StringType, nullable = true))
.option("inferSchema", "true")
.text("data/Marvel-graph.txt")
.as[GraphLine]

val df = raw
.withColumn("id", split(raw("line"), " ")(0))
.withColumn("count", size(split(raw("line"), " ")) - 1)
.select("id", "count")

val dfAgg = df.groupBy("id").agg(sum("count").alias("totalConnections"))

val names = spark.read
.schema(new StructType()
        .add("id", StringType, nullable=true)
       .add("name", StringType, nullable=true)
)
.option("sep", " ")
.csv("data/Marvel-names.txt")
.as[GraphName]

val popularity = dfAgg.join(names, "id").sort(desc("totalConnections"))

popularity.show(5)

//println(names.filter($"id" === dfAgg.first()(0)).first().name)

val popularHero = popularity.first()

println(s"${popularHero(2)} is the most popular super hero with ${popularHero(1)} appearances.")

+----+----------------+--------------------+
|  id|totalConnections|                name|
+----+----------------+--------------------+
| 859|            1937|     CAPTAIN AMERICA|
|5306|            1745|SPIDER-MAN/PETER PAR|
|2664|            1532|IRON MAN/TONY STARK |
|5716|            1429|THING/BENJAMIN J. GR|
|6306|            1397|    WOLVERINE/LOGAN |
+----+----------------+--------------------+
only showing top 5 rows

CAPTAIN AMERICA is the most popular super hero with 1937 appearances.


import org.apache.spark.sql.types.{StructType, StringType}
raw: org.apache.spark.sql.Dataset[GraphLine] = [line: string]
df: org.apache.spark.sql.DataFrame = [id: string, count: int]
dfAgg: org.apache.spark.sql.DataFrame = [id: string, totalConnections: bigint]
names: org.apache.spark.sql.Dataset[GraphName] = [id: string, name: string]
popularity: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, totalConnections: bigint ... 1 more field]
popularHero: org.apache.spark.sql.Row = [859,1937,CAPTAIN AMERICA]


In [8]:
// Do the same with RDD

val rawRDD = sc.textFile("data/Marvel-graph.txt")

val processRDD = rawRDD.map( x => (x.split(" ")(0), x.split(" ").size))

val reduceRDD = processRDD.reduceByKey( (acc, el) => acc + el )

val popularHeroRDDID = reduceRDD.map( x => (x._2, x._1) ).max()

val popularHeroRDD = names.filter($"id" === popularHeroRDDID._2).first()

println(s"${popularHeroRDD.name} is the most popular super hero with ${popularHeroRDDID._1} appearances.")

CAPTAIN AMERICA is the most popular super hero with 1937 appearances.


rawRDD: org.apache.spark.rdd.RDD[String] = data/Marvel-graph.txt MapPartitionsRDD[71] at textFile at <console>:36
processRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[72] at map at <console>:38
reduceRDD: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[73] at reduceByKey at <console>:40
popularHeroRDDID: (Int, String) = (1937,859)
popularHeroRDD: GraphName = GraphName(859,CAPTAIN AMERICA)


In [9]:
// Most "obscure" (heroes with less connections) hero

val popularityReversed = popularity.sort("totalConnections")

val minConnections = popularityReversed.first()(1)

val obscureHeroes = popularityReversed.where($"totalConnections" === minConnections)

println(s"List of obscure heroes - containing only ${minConnections} connection(s): ")

obscureHeroes.foreach(x => println(x(2)))

List of obscure heroes - containing only 1 connection(s): 
BERSERKER II
BLARE/
MARVEL BOY II/MARTIN
MARVEL BOY/MARTIN BU
GIURESCU, RADU
CLUMSY FOULUP
FENRIS
RANDAK
SHARKSKIN
CALLAHAN, DANNY
DEATHCHARGE
RUNE
SEA LEOPARD
RED WOLF II
ZANTOR
JOHNSON, LYNDON BAIN
LUNATIK II
KULL
GERVASE, LADY ALYSSA


popularityReversed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, totalConnections: bigint ... 1 more field]
minConnections: Any = 1
obscureHeroes: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, totalConnections: bigint ... 1 more field]


### BREADTH FIRST SEARCH to find degrees of separations
![BFS](https://lh3.googleusercontent.com/proxy/hqz6ixlITI9tAQ11XQz71SFQG1DYAJruhALw0ORl2RhNSBQ1lW_JrfDJYzoPfdiEGty_IPGR8gNoarlmI-KHDCLYkDn0sd5PSvLDZJKQhiPsOTtKb-Bkod80EcUCDhU1Nn54gdTQ8AoF6-8k3nmvUpJNavRL4InJGLZn "BFS")

In [10]:
// Degrees of separation: BREADTH-FIRST SEARCH

import org.apache.spark.util.LongAccumulator
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer

// The characters that we want to find the separation between
val startCharacterID = 5306 // Spiderman
val targetCharacterID = 14 // ADAM 3,031

// Accumulator
var hitCounter:Option[LongAccumulator] = None

// Custom Data Types
type BFSData = (Array[Int], Int, String)
type BFSNode = (Int,BFSData)

def convertToBFS(line:String):BFSNode = {
    
    val fields = line.split("\\s+")
    
    val heroID = fields(0).toInt
    
    var connections: ArrayBuffer[Int] = ArrayBuffer()
    for(connection <- 1 until (fields.length - 1)){
        connections += fields(connection).toInt
    }
    
    var color:String = "WHITE"
    var distance:Int = 9999
    
    if(heroID == startCharacterID){
        color = "GRAY"
        distance = 0
    }
    
    (heroID, (connections.toArray ,distance,color))
}

def creatingStartingRdd(sc:SparkContext):RDD[BFSNode] = {
    val inputFile = sc.textFile("data/Marvel-graph.txt")
    inputFile.map(convertToBFS)
}
def bfsMap(node:BFSNode): Array[BFSNode] = {

    // Extract data from the BFSNode
    val characterID:Int = node._1
    val data:BFSData = node._2

    val connections:Array[Int] = data._1
    val distance:Int = data._2
    var color:String = data._3

    // This is called from flatMap, so we return an array
    // of potentially many BFSNodes to add to our new RDD
    var results:ArrayBuffer[BFSNode] = ArrayBuffer()

    // Gray nodes are flagged for expansion, and create new
    // gray nodes for each connection
    if (color == "GRAY") {
        for (connection <- connections) {
            val newCharacterID = connection
            val newDistance = distance + 1
            val newColor = "GRAY"

            // Have we stumbled across the character we're looking for?
            // If so increment our accumulator so the driver script knows.
            if (targetCharacterID == connection) {
                if (hitCounter.isDefined) {
                    hitCounter.get.add(1)
                }
            }

            // Create our new Gray node for this connection and add it to the results
            val newEntry:BFSNode = (newCharacterID, (Array(), newDistance, newColor))
            results += newEntry
        }

        // Color this node as black, indicating it has been processed already.
        color = "BLACK"
    }

    // Add the original node back in, so its connections can get merged with 
    // the gray nodes in the reducer.
    val thisEntry:BFSNode = (characterID, (connections, distance, color))
    results += thisEntry

    return results.toArray
}

/** Combine nodes for the same heroID, preserving the shortest length and darkest color. */
def bfsReduce(data1:BFSData, data2:BFSData): BFSData = {

    // Extract data that we are combining
    val edges1:Array[Int] = data1._1
    val edges2:Array[Int] = data2._1
    val distance1:Int = data1._2
    val distance2:Int = data2._2
    val color1:String = data1._3
    val color2:String = data2._3

    // Default node values
    var distance:Int = 9999
    var color:String = "WHITE"
    var edges:ArrayBuffer[Int] = ArrayBuffer()

    // See if one is the original node with its connections.
    // If so preserve them.
    if (edges1.length > 0) {
        edges ++= edges1
    }
    if (edges2.length > 0) {
        edges ++= edges2
    }

    // Preserve minimum distance
    if (distance1 < distance) {
        distance = distance1
    }
    if (distance2 < distance) {
        distance = distance2
    }

    // Preserve darkest color
    if (color1 == "WHITE" && (color2 == "GRAY" || color2 == "BLACK")) {
        color = color2
    }
    if (color1 == "GRAY" && color2 == "BLACK") {
        color = color2
    }
    if (color2 == "WHITE" && (color1 == "GRAY" || color1 == "BLACK")) {
        color = color1
    }
    if (color2 == "GRAY" && color1 == "BLACK") {
        color = color1
    }

    return (edges.toArray, distance, color)
}

import org.apache.spark.util.LongAccumulator
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer
startCharacterID: Int = 5306
targetCharacterID: Int = 14
hitCounter: Option[org.apache.spark.util.LongAccumulator] = None
defined type alias BFSData
defined type alias BFSNode
convertToBFS: (line: String)BFSNode
creatingStartingRdd: (sc: org.apache.spark.SparkContext)org.apache.spark.rdd.RDD[BFSNode]
bfsMap: (node: BFSNode)Array[BFSNode]
bfsReduce: (data1: BFSData, data2: BFSData)BFSData


In [11]:
def run(){
   hitCounter = Some(sc.longAccumulator("Hit Counter"))

    var iterationRdd = creatingStartingRdd(sc)

    var iteration:Int = 0
    for (iteration <- 1 to 10) {
      println("Running BFS Iteration# " + iteration)

      // Create new vertices as needed to darken or reduce distances in the
      // reduce stage. If we encounter the node we're looking for as a GRAY
      // node, increment our accumulator to signal that we're done.
      val mapped = iterationRdd.flatMap(bfsMap)

      // Note that mapped.count() action here forces the RDD to be evaluated, and
      // that's the only reason our accumulator is actually updated.  
      println("Processing " + mapped.count() + " values.")

      if (hitCounter.isDefined) {
        val hitCount = hitCounter.get.value
        if (hitCount > 0) {
          println("Hit the target character! From " + hitCount + 
              " different direction(s).")
          return
        }
      }

      // Reducer combines data for each character ID, preserving the darkest
      // color and shortest path.      
      iterationRdd = mapped.reduceByKey(bfsReduce)
    } 
}

run()

Running BFS Iteration# 1
Processing 8326 values.
Running BFS Iteration# 2
Processing 218067 values.
Hit the target character! From 1 different direction(s).


run: ()Unit


## Item-based collaborative filtering

Recommendation system.

In [15]:
// Movie similarity

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

case class Movies(userID: Int, movieID: Int, rating: Int, timestamp: Long)
case class MoviesNames(movieID: Int, movieTitle: String)
case class MoviePairs(movie1: Int, movie2: Int, rating1: Int, rating2: Int)
case class MoviePairsSimilarity(movie1: Int, movie2: Int, score: Double, numPairs: Long)

def computeCosineSimilarity(spark: SparkSession, data: Dataset[MoviePairs]): Dataset[MoviePairsSimilarity] = {
    // Compute xx, xy and yy columns
    val pairScores = data
    .withColumn("xx", col("rating1") * col("rating1"))
    .withColumn("yy", col("rating2") * col("rating2"))
    .withColumn("xy", col("rating1") * col("rating2"))

    // Compute numerator, denominator and numPairs columns
    val calculateSimilarity = pairScores
    .groupBy("movie1", "movie2")
    .agg(
    sum(col("xy")).alias("numerator"),
    (sqrt(sum(col("xx"))) * sqrt(sum(col("yy")))).alias("denominator"),
    count(col("xy")).alias("numPairs")
    )

    // Calculate score and select only needed columns (movie1, movie2, score, numPairs)
    import spark.implicits._
    val result = calculateSimilarity
    .withColumn("score",
    when(col("denominator") =!= 0, col("numerator") / col("denominator"))
    .otherwise(null)
    ).select("movie1", "movie2", "score", "numPairs").as[MoviePairsSimilarity]

    result
}

/** Get movie name by given movie id */
def getMovieName(movieNames: Dataset[MoviesNames], movieId: Int): String = {
    val result = movieNames.filter(col("movieID") === movieId)
    .select("movieTitle").collect()(0)

    result(0).toString
}


import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}
defined class Movies
defined class MoviesNames
defined class MoviePairs
defined class MoviePairsSimilarity
computeCosineSimilarity: (spark: org.apache.spark.sql.SparkSession, data: org.apache.spark.sql.Dataset[MoviePairs])org.apache.spark.sql.Dataset[MoviePairsSimilarity]
getMovieName: (movieNames: org.apache.spark.sql.Dataset[MoviesNames], movieId: Int)String


In [17]:
 def runMovieSimilarity(args:Array[String]){
    val moviesNamesSchema = new StructType()
      .add("movieID", IntegerType, nullable = true)
      .add("movieTitle", StringType, nullable = true)

    // Create schema when reading u.data
    val moviesSchema = new StructType()
      .add("userID", IntegerType, nullable = true)
      .add("movieID", IntegerType, nullable = true)
      .add("rating", IntegerType, nullable = true)
      .add("timestamp", LongType, nullable = true)

    println("\nLoading movie names...")
    import spark.implicits._
    // Create a broadcast dataset of movieID and movieTitle.
    // Apply ISO-885901 charset
    val movieNames = spark.read
      .option("sep", "|")
      .option("charset", "ISO-8859-1")
      .schema(moviesNamesSchema)
      .csv("data/ml-100k/u.item")
      .as[MoviesNames]

    // Load up movie data as dataset
    val movies = spark.read
      .option("sep", "\t")
      .schema(moviesSchema)
      .csv("data/ml-100k/u.data")
      .as[Movies]

    val ratings = movies.select("userId", "movieId", "rating")

    // Emit every movie rated together by the same user.
    // Self-join to find every combination.
    // Select movie pairs and rating pairs
    val moviePairs = ratings.as("ratings1")
      .join(ratings.as("ratings2"), $"ratings1.userId" === $"ratings2.userId" && $"ratings1.movieId" < $"ratings2.movieId")
      .select($"ratings1.movieId".alias("movie1"),
        $"ratings2.movieId".alias("movie2"),
        $"ratings1.rating".alias("rating1"),
        $"ratings2.rating".alias("rating2")
      ).as[MoviePairs]

    val moviePairSimilarities = computeCosineSimilarity(spark, moviePairs).cache()

    if (args.length > 0) {
      val scoreThreshold = 0.97
      val coOccurrenceThreshold = 50.0

      val movieID: Int = args(0).toInt

      // Filter for movies with this sim that are "good" as defined by
      // our quality thresholds above
      val filteredResults = moviePairSimilarities.filter(
        (col("movie1") === movieID || col("movie2") === movieID) &&
          col("score") > scoreThreshold && col("numPairs") > coOccurrenceThreshold)

      // Sort by quality score.
      val results = filteredResults.sort(col("score").desc).take(10)

      println("\nTop 10 similar movies for " + getMovieName(movieNames, movieID))
      for (result <- results) {
        // Display the similarity result that isn't the movie we're looking at
        var similarMovieID = result.movie1
        if (similarMovieID == movieID) {
          similarMovieID = result.movie2
        }
        println(getMovieName(movieNames, similarMovieID) + "\tscore: " + result.score + "\tstrength: " + result.numPairs)
      }
    }
 }

runMovieSimilarity(Array("56"))


Loading movie names...

Top 10 similar movies for Pulp Fiction (1994)
Smoke (1995)	score: 0.9743848338030823	strength: 68
Reservoir Dogs (1992)	score: 0.9740674165782123	strength: 134
Donnie Brasco (1997)	score: 0.9738247291149608	strength: 75
Sling Blade (1996)	score: 0.9713796344244161	strength: 111
True Romance (1993)	score: 0.9707295689679896	strength: 99
Jackie Brown (1997)	score: 0.9706179145690377	strength: 55
Carlito's Way (1993)	score: 0.9706021261759088	strength: 52


runMovieSimilarity: (args: Array[String])Unit
