# Introduction to Spark

The following code sets up the "Spark Context", which is how we interact with Spark from Python.

In [1]:
import pyspark

# Reuse the already-running context when this cell is executed again; calling
# SparkContext() a second time in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()

## Word Count

In [2]:
def countWords (fileName, topN=200):
    """Return the most frequent space-separated tokens in a text file.

    Parameters
    ----------
    fileName : str
        Path of the text file, loaded with ``sc.textFile``.
    topN : int, optional
        How many (token, count) pairs to return. Defaults to 200.

    Returns
    -------
    list of (str, int)
        Up to ``topN`` pairs, ordered by descending count.

    Notes
    -----
    Lines are split on a single space character, so runs of spaces produce
    empty-string tokens, which are counted like any other token.
    """
    textfile = sc.textFile(fileName)
    # one RDD element per token rather than per line
    words = textfile.flatMap(lambda line: line.split(" "))
    counts = words.map(lambda word: (word, 1))
    aggregatedCounts = counts.reduceByKey(lambda a, b: a + b)
    # rank by the count (second tuple element), not by the token itself
    return aggregatedCounts.top(topN, key=lambda p: p[1])

In [3]:
# Display the most frequent tokens found in the sample corpus file.
countWords("../data/20-news-same-line-small.txt")

[('', 38650),
 ('the', 7562),
 ('to', 5275),
 ('a', 4911),
 ('and', 4646),
 ('of', 4458),
 ('is', 3257),
 ('I', 2900),
 ('for', 2883),
 ('in', 2648),
 ('that', 1781),
 ('you', 1746),
 ('on', 1631),
 ('it', 1523),
 ('be', 1461),
 ('with', 1419),
 ('or', 1352),
 ('have', 1231),
 ('are', 1212),
 ('The', 1169),
 ('can', 1093),
 ('From:', 1065),
 ('this', 1060),
 ('from', 1035),
 ('as', 1029),
 ('Subject:', 1020),
 ('Date:', 1012),
 ('Lines:', 1002),
 ('<doc', 1000),
 ('url=""', 1000),
 ('</doc>', 1000),
 ('Apr', 902),
 ('GMT', 868),
 ('-', 860),
 ('image', 844),
 ('an', 818),
 ('not', 814),
 ('but', 800),
 ('will', 772),
 ('by', 758),
 ('at', 750),
 ('1993', 732),
 ('if', 637),
 ('about', 583),
 ('would', 581),
 ('any', 574),
 ('|', 566),
 ('some', 540),
 ('JPEG', 530),
 ('file', 517),
 ('If', 515),
 ('has', 512),
 ('do', 509),
 ('which', 493),
 ('In', 476),
 ('It', 475),
 ('Re:', 473),
 ('This', 461),
 ('other', 445),
 ('was', 443),
 ('there', 427),
 ('use', 427),
 ('all', 427),
 ('get', 

## Completing a Spark Program

In [47]:
import re
import numpy as np

# load the documents from the corpus file
corpus = sc.textFile ("../data/20-news-same-line-small.txt")

# keep only the lines that carry a complete document header; every marker that
# the parsing step below looks up with str.index must be present, otherwise
# index would raise ValueError on a stray line that merely contains "id"
validLines = corpus.filter(lambda x : 'id="' in x and '" url=' in x and '">' in x)

# now we transform it into a bunch of (docID, text) pairs: docID is whatever sits
# between id=" and " url=, and text is everything after the first ">
keyAndText = validLines.map(lambda x : (x[x.index('id="') + 4 : x.index('" url=')], x[x.index('">') + 2:]))

# Tokenize every document: anything that is not an ASCII letter becomes a space,
# the text is lower-cased, and the result is split on whitespace.
# This turns (docID, text) into (docID, ["word1", "word2", "word3", ...]).
regex = re.compile('[^a-zA-Z]')
keyAndListOfWords = keyAndText.map(
    lambda docAndText : (str(docAndText[0]), regex.sub(' ', docAndText[1]).lower().split())
)

# Flatten (docID, ["word1", "word2", ...]) into one ("word", 1) pair per word occurrence
allWords = keyAndListOfWords.flatMap(lambda docAndWords : [(word, 1) for word in docAndWords[1]])

# Sum the ones per word, giving ("word1", 1433), ("word2", 3423423), etc.
allCounts = allWords.reduceByKey (lambda left, right : left + right)

# Pull the 20,000 most frequent ("word", count) pairs back as a local list,
# ranked by the count rather than by the word
topWords = allCounts.top (20000, key=lambda wordAndCount : wordAndCount[1])

# and we'll create a RDD that has a bunch of (word, dictNum) pairs
# start by creating an RDD that holds one index per dictionary entry: 0, 1, 2, ...
# the range is sized from topWords (at most 20,000 entries) so that every index is
# a valid position in topWords and no top word is left out of the dictionary
twentyK = sc.parallelize(range(len(topWords)))

### Your Code Here

Your task is to map the parallelized range $(0, 1, 2, \ldots, N-1)$, where $N$ is the number of entries in the range,
into a set of tuples `("mostcommonword", 0)`, `("nextmostcommon", 1)`, ....

In [48]:
# Turn each index 0, 1, 2, ... into ("mostcommonword", 0), ("nextmostcommon", 1), ...
# The index is the word's slot in the dictionary; the word itself is looked up
# in the local topWords list, which is ordered from most to least frequent.
dictionary = twentyK.map(lambda slot : (topWords[slot][0], slot))

# finally, show a few dictionary entries, just for debugging
dictionary.top (10)

[('zyxel', 4580),
 ('zyeh', 2058),
 ('zyda', 2698),
 ('zvi', 9584),
 ('zurich', 8040),
 ('zug', 8075),
 ('zorg', 4647),
 ('zopfi', 8548),
 ('zooming', 2994),
 ('zoom', 2193)]