In [0]:
# Grab the Spark context from `spark` so we can work with RDDs.
# NOTE(review): `spark` is not defined anywhere in this notebook — presumably
# it is injected by the notebook runtime; confirm before running elsewhere.
sc = spark.sparkContext

In [0]:
# S3 path of the text file that every cell below reads from

FILENAME = 's3://full-stack-bigdata-datasets/Big_Data/purple_rain.txt'

In [0]:
# Build an RDD from the file: each element is one line of text
text_file = sc.textFile(FILENAME)

In [0]:
# Evaluating the RDD itself only shows its description, not the file contents
text_file

Out[4]: s3://full-stack-bigdata-datasets/Big_Data/purple_rain.txt MapPartitionsRDD[18] at textFile at NativeMethodAccessorImpl.java:0

In [0]:
# `take(3)` returns the first 3 elements (here: lines) as a Python list
text_file.take(3)

Out[5]: ['I never meant to cause you any sorrow',
 'I never meant to cause you any pain',
 'I only wanted one time to see you laughing']

In [0]:
# `map` produces exactly one output element per line, so splitting each line
# yields an RDD of token lists (one list per line)
tokenized_text = text_file.map(lambda line: line.split(' '))
tokenized_text.take(3)

Out[6]: [['I', 'never', 'meant', 'to', 'cause', 'you', 'any', 'sorrow'],
 ['I', 'never', 'meant', 'to', 'cause', 'you', 'any', 'pain'],
 ['I', 'only', 'wanted', 'one', 'time', 'to', 'see', 'you', 'laughing']]

In [0]:
# `flatMap` applies the same split but flattens the per-line lists, giving a
# single RDD whose elements are the individual tokens
tokens = text_file.flatMap(lambda line: line.split(' '))
tokens.take(10)

Out[7]: ['I', 'never', 'meant', 'to', 'cause', 'you', 'any', 'sorrow', 'I', 'never']

In [0]:
def token_to_tuple(token):
    """Pair a token with an initial count of 1.

    Args:
        token: The token (word) to wrap.

    Returns:
        The 2-tuple ``(token, 1)``.
    """
    initial_count = 1
    return token, initial_count

In [0]:
# Turn every token into a (token, 1) pair; displaying the RDD only shows its
# description, the pairs themselves are inspected with `take` in the next cell
partial_count = tokens.map(token_to_tuple)
partial_count

Out[9]: PythonRDD[22] at RDD at PythonRDD.scala:58

In [0]:
# Peek at the first 10 (token, 1) pairs
partial_count.take(10)

Out[10]: [('I', 1),
 ('never', 1),
 ('meant', 1),
 ('to', 1),
 ('cause', 1),
 ('you', 1),
 ('any', 1),
 ('sorrow', 1),
 ('I', 1),
 ('never', 1)]

In [0]:
# Group the (token, 1) pairs by token: each distinct token becomes a key
# paired with an iterable of all the 1s emitted for it
grouped_by_key = partial_count.groupByKey()

In [0]:
# The grouped values show up as opaque ResultIterable objects, not lists
grouped_by_key.take(3)

Out[12]: [('never', <pyspark.resultiterable.ResultIterable at 0x7f6575c46670>),
 ('cause', <pyspark.resultiterable.ResultIterable at 0x7f6575c46880>),
 ('pain', <pyspark.resultiterable.ResultIterable at 0x7f6575c46910>)]

In [0]:
# `take(1)` returns a one-element list; `[0]` extracts the (token, values) pair
first_item = grouped_by_key.take(1)[0]
first_item

Out[13]: ('never', <pyspark.resultiterable.ResultIterable at 0x7f6575a85e80>)

In [0]:
def print_item(item_as_tuple):
    """Print one grouped item as ``<token>: <list of values>``.

    Args:
        item_as_tuple: A ``(token, values)`` pair where ``values`` is any
            iterable (e.g. the iterable produced by ``groupByKey``).

    Returns:
        None. The formatted line is written to standard output.
    """
    token_name, grouped_values = item_as_tuple
    # Materialise the iterable so its content is shown instead of its repr
    occurences_as_list = [*grouped_values]
    print(f"{token_name}: {occurences_as_list}")

In [0]:
# Print the first 10 groups in readable form (token followed by its list of 1s)
for item in grouped_by_key.take(10):
    print_item(item)

never: [1, 1, 1, 1]
cause: [1, 1]
pain: [1]
only: [1, 1, 1, 1, 1, 1, 1]
in: [1, 1]
rain: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Purple: [1, 1, 1, 1, 1, 1, 1, 1, 1]
rain,: [1, 1, 1, 1, 1, 1, 1, 1, 1]
bathing: [1]


In [0]:
# Sum each token's 1s to get its count (first 10 groups only).
# `sum` consumes any iterable directly, so no intermediate `list(...)` copy is needed.
[(token, sum(occurences)) for token, occurences in grouped_by_key.take(10)]

Out[16]: [('never', 4),
 ('cause', 2),
 ('pain', 1),
 ('only', 7),
 ('in', 2),
 ('rain', 14),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('bathing', 1)]

In [0]:
def reduce_function(a, b):
    """Combine two values by adding them.

    Args:
        a: First value.
        b: Second value.

    Returns:
        The result of ``a + b``.
    """
    combined = a + b
    return combined

In [0]:
# Merge the values of each key pairwise with `reduce_function` (addition),
# which yields one (token, count) pair per distinct token
reduced = partial_count.reduceByKey(reduce_function)

In [0]:
# Same counts as the manual groupByKey + sum approach above
reduced.take(10)

Out[19]: [('never', 4),
 ('cause', 2),
 ('pain', 1),
 ('only', 7),
 ('in', 2),
 ('rain', 14),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('bathing', 1)]

In [0]:
# Small plain-Python list of (name, quantity) tuples used to illustrate sorting
fruits = [('banana', 3), ('orange', 5), ('pineapple', 2)]
fruits

Out[20]: [('banana', 3), ('orange', 5), ('pineapple', 2)]

In [0]:
# Without a key, tuples are compared element by element, i.e. by name first
sorted(fruits)

Out[21]: [('banana', 3), ('orange', 5), ('pineapple', 2)]

In [0]:
# Sort on the second tuple element (the quantity), smallest first
sorted(fruits, key=lambda x: x[1])

Out[22]: [('pineapple', 2), ('banana', 3), ('orange', 5)]

In [0]:
# RDD counterpart of `sorted(..., key=...)`: order the (token, count) pairs
# by their count, smallest first
sorted_counts = reduced.sortBy(lambda t: t[1])

In [0]:
# The 10 least frequent tokens
sorted_counts.take(10)

Out[24]: [('pain', 1),
 ('bathing', 1),
 ('kind', 1),
 ('of', 1),
 ('steal', 1),
 ('end', 1),
 ('underneath', 1),
 ('are', 1),
 ('we', 1),
 ('out', 1)]

In [0]:
# Same key, but `reverse=True` puts the largest quantity first
sorted(fruits, key=lambda x: x[1], reverse=True)

Out[25]: [('orange', 5), ('banana', 3), ('pineapple', 2)]

In [0]:
# Sorting on the negated count puts the most frequent tokens first
desc_sorted_counts = reduced.sortBy(lambda t: -t[1])

In [0]:
# The 10 most frequent tokens
desc_sorted_counts.take(10)

Out[28]: [('rain', 14),
 ('I', 14),
 ('you', 14),
 ('purple', 14),
 ('to', 13),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('only', 7),
 ('see', 6)]

In [0]:
def count_words(filepath):
    """Count how many times each token appears in a text file.

    Lines are split on single spaces only, so empty strings, punctuation
    attached to words and letter case are all kept as-is in the tokens.

    Args:
        filepath: Path of the text file, passed to ``sc.textFile``.

    Returns:
        An RDD of ``(token, count)`` pairs ordered by decreasing count.
    """
    # TODO: implement the content of the function
    # 
    # NOTE: you can remove `pass`
    # it's just here to avoid the cell crashing while the
    # content of the function is empty
    pass
    ### BEGIN STRIP ###
    lines = sc.textFile(filepath)
    words = lines.flatMap(lambda line: line.split(' '))
    ones = words.map(lambda word: (word, 1))
    totals = ones.reduceByKey(lambda a, b: a + b)
    # Negating the count makes the sort descending
    return totals.sortBy(lambda pair: -pair[1])
    ### END STRIP ###

In [0]:
# Run the whole pipeline on the file and check what kind of object comes back
rdd = count_words(FILENAME)
type(rdd)

Out[30]: pyspark.rdd.PipelinedRDD

In [0]:
# Should match the step-by-step descending result obtained earlier
rdd.take(10)

Out[31]: [('rain', 14),
 ('I', 14),
 ('you', 14),
 ('purple', 14),
 ('to', 13),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('only', 7),
 ('see', 6)]