In [None]:
pyspark --master yarn # shell command (not Python): starts the pyspark shell on the whole yarn cluster
lines = sc.textFile("/user/pascepet/data/bible.txt") # loads the file from hdfs into an RDD of text lines

# take(n) is an action: it returns n elements to the driver
# (since the data is distributed, these are the first n lines Spark gets to)
lines.take(5) 

# Word count expressed as a chain of RDD transformations.

# flatMap: the lambda yields a list of words per line and the lists are
# concatenated into a single RDD of words.
words = lines.flatMap(lambda text: text.split(" "))

# A 2-element tuple is interpreted as (key, value); there is no implicit key
# before we create one here.
pairs = words.map(lambda token: (token, 1))

# Add up the values that share the same key.
counts = pairs.reduceByKey(lambda left, right: left + right)

# Up to this point `counts` has not been evaluated (transformations are lazy);
# the actions below force the evaluation.
counts.take(10)
counts.count()

# By default, as soon as Spark computes and returns an answer, it forgets all intermediate calculations.
# Multiple calls to `counts.take(10)` therefore always trigger re-computation of the whole chain.
# It's possible to cache data -> metacentrum has some caching policy enabled
# (see presentations for more info).

# Spark works on a graph of transformations; once it reaches a cached node
# it does not evaluate anything deeper in the graph.
# Order the (word, count) pairs by their count, largest first.
countsSorted = counts.sortBy(lambda kv: kv[1], ascending=False)
countsSorted.cache()  #1.1

# Transform all words to lowercase.
wordsSmall = words.map(lambda w: w.lower()) #1.3

# Keep only the first tab-separated field of every line.
# The lambda body has to use its own parameter: referring to any other name
# (such as `w`, which is not defined in this script) fails with NameError
# once an action evaluates the RDD.
lines = lines.map(lambda line: line.split("\t")[0]) #1.4
lines = lines.filter(lambda line: line != "") # Filters out empty lines


