# WordCount Example

In this example we'll be using the Back to the Future transcript which is formatted as `Character: Line`. For example:

`Doc: Marty, is that you?`

In the first part we'll count the number of words in the transcript (we'll filter out the character names) and sort them by most frequently used to least frequently used.

In the second part we'll filter out common words, known as stop words, by importing a Python package using pip. 

Finally, we'll find the most common words used by each character.

## Part 1: Simple Word Count

In [1]:
import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

In [2]:
// Load the transcript using SparkContext.textFile
// This will return an RDD of strings - one for each line in the transcript 
val lines = sc.textFile("file:///usr/data/backtothefuture_transcript.txt")

In [3]:
// This function will be called for each line in the transcript
// We will strip out the character names (i.e. Marty:)
// We'll also strip out special characters in each string
// Finally, we'll return an array of words
def parseLine(line:String) : Array[String] = {
    var l = line.replaceAll("^[^:]+:", "");
    l = l.replaceAll("[^a-zA-Z ']", "");
    return l.toLowerCase.split("\\s+").filterNot(word => word == "")
}

In [4]:
// flatMap can map each input to 0 or more outputs
// In this case each line of text will be mapped to 0 or more words
val words = lines.flatMap(line => parseLine(line))

In [5]:
// Map each RDD to (key, 1) where key is the word
var wordCounts = words.map(x => (x, 1))

In [6]:
// reduceByKey takes 2 RDDs with the same key, combines them into a single RDD,
// and sets the value to the output of the lambda function
// In this case that value is x + y giving us the total count for each word (the key)
wordCounts = wordCounts.reduceByKey((x, y) => x + y)

In [7]:
// Here we reverse the RDDs, so instead of (word, count)
// They will be stored as (count, word)
// This will allow us to sort by the key (count)
val wordCountsReversed = wordCounts.map(x => (x._2, x._1))

In [8]:
// Sort by key (which is now count) descending
val wordCountsSorted = wordCountsReversed.sortByKey(false)

In [9]:
// Find the top 10 words
wordCountsSorted.take(10)

Array((382,you), (278,the), (242,i), (179,a), (166,to), (120,it), (116,that), (107,of), (100,this), (100,in))

## Part 2: Filter Out Stop Words

In [18]:
import scala.io.Source
val stopWordsRdd = sc.textFile("file:///usr/data/stopwords_en.txt")
val stopWordsArray = stopWordsRdd.toLocalIterator.toArray

In [19]:
def parseLine2(line:String) : Array[String] = {
    var l = line.replaceAll("^[^:]+:", "");
    l = l.replaceAll("[^a-zA-Z ']", "");
    val lineWords = l.toLowerCase.split("\\s+").filterNot(word => word == "")
    return lineWords.filterNot(word => stopWordsArray.contains(word))
}

In [20]:
val words = lines.flatMap(line => parseLine2(line))

In [21]:
val wordCounts = words.map(x => (x, 1)).reduceByKey((x, y) => x + y)

In [22]:
val wordCountsSorted = wordCounts.map(x => (x._2, x._1)).sortByKey(false)

In [23]:
wordCountsSorted.take(10)

Array((60,marty), (58,doc), (55,gonna), (46,hey), (46,yeah), (42,time), (42,uh), (41,george), (37,good), (35,mcfly))

## Part 3: Word Counts by Character

In [24]:
val contents = sc.wholeTextFiles("file:///usr/data/backtothefuture_transcript.txt")

In [118]:
def parseContents(contents:String): Array[(String, String)] = {
    val strs = contents.toLowerCase.split("(?m)(?=^[a-z^:]+:)")
    var key = ""
    var tuples = ArrayBuffer.empty[(String, String)]
    for (str <- strs) {
        var line = str
        val pattern = "(^[a-z^:]+:)(.+)".r
        val matched = pattern.findFirstMatchIn(str)
        matched match {
          case Some(m) =>
            key = m.group(1)
            key = key.substring(0,key.length-1)
            line = m.group(2)
          case _ =>
            // no match
        }
        if (key != "") {
            var words = parseLine2(str)
            for (word <- words) {
                tuples.append((key, word))
            }
        }
    }
    return tuples.toArray
}

In [119]:
val characterWordTuples = parseContents(contents.values.take(1)(0))
characterWordTuples.length

3371

In [120]:
val characterWords = sc.parallelize(characterWordTuples)

In [121]:
// Let's take a look at a few of them
characterWords.take(10)

Array((radio,october), (radio,inventory), (radio,time), (radio,statler), (radio,toyota), (radio,makingthe), (radio,deals), (radio,year), (radio,model), (radio,toyotas))

In [124]:
var characterWordCounts = characterWords.map(x => (x, 1))

In [126]:
characterWordCounts = characterWordCounts.reduceByKey((x, y) => x + y)

In [127]:
characterWordCounts.take(10)

Array(((marty,job),1), ((jennifer,world),1), ((marty,lovers),1), ((marty,hell),3), ((doc,kind),1), ((skinhead,gonna),1), ((lou,pall),1), ((marty,night),4), ((doc,sucker's),1), ((goldie,idea),1))

In [128]:
val characterWordCountsReversed = characterWordCounts.map(x => (x._2, x._1))

In [129]:
val characterWordCountsSorted = characterWordCountsReversed.sortByKey(false)

In [130]:
characterWordCountsSorted.take(10)

Array((57,(marty,doc)), (29,(marty,uh)), (27,(marty,yeah)), (25,(marty,george)), (20,(doc,marty)), (19,(biff,mcfly)), (17,(marty,alright)), (17,(marty,hey)), (15,(george,biff)), (15,(marty,wait)))

In [135]:
var characterWordCounts2 = characterWordCountsSorted.map(x => (x._2._1, (x._2._2,x._1)))

In [136]:
// Let's see what it looks like
characterWordCounts2.take(10)

Array((marty,(doc,57)), (marty,(uh,29)), (marty,(yeah,27)), (marty,(george,25)), (doc,(marty,20)), (biff,(mcfly,19)), (marty,(alright,17)), (marty,(hey,17)), (george,(biff,15)), (marty,(wait,15)))

In [138]:
characterWordCounts2 = characterWordCounts2.reduceByKey((x, y) => if (x._2 > y._2) x else y)

In [139]:
val characterWordCounts2Reversed = characterWordCounts2.map(x => (x._2._2, (x._1,x._2._1)))

In [140]:
val characterWordCounts2Sorted = characterWordCounts2Reversed.sortByKey(false)

In [141]:
// Finally! Here are the top words for the 10 characters with the most lines
characterWordCounts2Sorted.take(10)

Array((57,(marty,doc)), (20,(doc,marty)), (19,(biff,mcfly)), (15,(george,biff)), (13,(loraine,marty)), (6,(woman,clock)), (6,(strickland,mcfly)), (5,(goldie,mayor)), (5,(jennifer,marty)), (4,(stella,marty)))