# Create SparkContext

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Create pair RDD from RDD of loading text file

In [10]:
import random

input_data_path = '../week1/big_data_intro.txt'
text_file = sc.textFile(input_data_path)

word_to_length_pair_rdd = text_file.flatMap(lambda line: line.split(" ")) \
.map(lambda word: (word, random.random()))

print(word_to_length_pair_rdd.take(10))

[('Introduction', 0.06659323731178812), ('Big', 0.6019054800165461), ('data', 0.6316725022310598), ('is', 0.6048955313938121), ('a', 0.19320798564262665), ('blanket', 0.4760673338516238), ('term', 0.4073858823715758), ('for', 0.09332026308071528), ('the', 0.38652625320067413), ('non-traditional', 0.23451162109608492)]


# Demo tranformation - keys()

In [11]:
word_to_length_pair_keys_rdd = word_to_length_pair_rdd.keys()

print(word_to_length_pair_keys_rdd.take(10))

['Introduction', 'Big', 'data', 'is', 'a', 'blanket', 'term', 'for', 'the', 'non-traditional']


# Demo transformation - values()

In [12]:
word_to_length_pair_values_rdd = word_to_length_pair_rdd.values()
print(word_to_length_pair_values_rdd.take(10))

[0.5262972192425175, 0.275105530725299, 0.014762469849213034, 0.16842970170760163, 0.580890472398784, 0.5977490094390006, 0.3918114982918476, 0.3702962622707826, 0.6666599819830247, 0.42069137597758766]


In [13]:
word_to_length_pair_group_rdd = word_to_length_pair_rdd.groupByKey()

group_result = word_to_length_pair_group_rdd.take(10)
for key, values in group_result:
    print('key = {}'.format(key))
    for value in values:
        print(value)

key = is
0.3223510764478078
0.5717124043578006
0.837033195438256
0.5103507376523803
0.7403284265237835
0.17971275848127566
0.4255501790830092
0.4880873618646695
0.9225068862863768
0.9418359557605199
0.9857264232616598
0.09882430563342881
0.6570156796419482
0.9754607769813225
0.48813534873125997
0.6977051974090042
0.44089059806923525
0.2869723935063746
0.15237839086027527
0.7986694972768231
0.09133793665709189
0.4237933360811421
0.720158640563486
0.9039725282102399
0.4534281566085403
0.7241224525286324
0.5680177913782257
0.5828107610420503
0.9119873796002588
0.05288298412099712
0.29741046010966576
0.8220375998403998
0.5145568835493479
0.9902379472467424
0.9130307309453848
0.09457951000786324
0.8942610898566419
0.8983915422823726
0.2309145870646374
0.9145325347519898
0.2358118805095799
0.6364613175100698
0.6550697272269663
0.8233352116211494
0.8478583545206029
0.2549151448575263
0.07302734880220796
0.5179177393382548
0.4546248891657859
0.26251809650363545
0.055487403482789355
0.292539134

In [14]:
word_to_length_pair_reduce_rdd = word_to_length_pair_rdd.reduceByKey(lambda a, b: a + b)
print(word_to_length_pair_reduce_rdd.take(10))

[('is', 31.312902250069808), ('term', 2.940584201660485), ('non-traditional', 0.0302325736417971), ('needed', 1.0367739191795897), ('gather', 1.383135494575228), ('organize', 1.284718689328098), ('process', 8.844317496688344), ('large', 9.955189682610467), ('datasets', 4.575313446701122), ('of', 55.1374901600131)]


In [15]:
bigdata_word_to_count_pair_rdd = sc.textFile('../week1/big_data_intro.txt') \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
#bigdata_word_to_count_pair_rdd = bigdata_word_to_count_pair_rdd.repartition(len(bigdata_word_to_count_pair_rdd.collect()))

print(bigdata_word_to_count_pair_rdd.take(10))

hamlet_word_to_count_pair_rdd = sc.textFile('./hamlet.txt') \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
#hamlet_word_to_count_pair_rdd = hamlet_word_to_count_pair_rdd.repartition(len(hamlet_word_to_count_pair_rdd.collect()))

print(hamlet_word_to_count_pair_rdd.take(10))

[('is', 59), ('term', 7), ('non-traditional', 1), ('needed', 2), ('gather', 2), ('organize', 2), ('process', 15), ('large', 18), ('datasets', 11), ('of', 111)]
[('', 186), ('The', 149), ('Tragedy', 1), ('of', 623), ('Prince', 1), ('Denmark', 9), ('Shakespeare', 1), ('|', 2), ('Entire', 1), ('ACT', 5)]


In [16]:
bigdata_join_hamlet_rdd = bigdata_word_to_count_pair_rdd.join(hamlet_word_to_count_pair_rdd)
print(bigdata_join_hamlet_rdd.collect())

[('is', (59, 291)), ('term', (7, 2)), ('gather', (2, 1)), ('process', (15, 1)), ('large', (18, 1)), ('of', (111, 623)), ('working', (4, 1)), ('power', (1, 7)), ('single', (7, 2)), ('new', (4, 2)), ('this', (13, 204)), ('in', (41, 387)), ('years', (1, 1)), ('', (38, 186)), ('we', (7, 107)), ('common', (4, 5)), ('subject', (1, 2)), ('take', (3, 26)), ('look', (2, 16)), ('at', (8, 75)), ('used', (13, 1)), ('What', (2, 77)), ('down', (2, 11)), ('business', (2, 8)), ('use', (2, 14)), ('quite', (2, 3)), ('mind', (2, 7)), ('are', (35, 119)), ('means', (5, 14)), ('may', (2, 58)), ('The', (18, 149)), ('as', (17, 146)), ('speed', (3, 3)), ('must', (1, 53)), ('stage', (2, 2)), ('when', (5, 28)), ('would', (2, 63)), ('possible', (2, 1)), ('known', (2, 5)), ('three', (3, 9)), ('make', (5, 45)), ('different', (6, 1)), ('other', (9, 11)), ('These', (5, 7)), ('larger', (1, 1)), ('than', (3, 38)), ('demands', (2, 1)), ('more', (11, 69)), ('thought', (1, 6)), ('work', (9, 5)), ('exceed', (1, 1)), ('capa