# Spark RDD programming 

## Spark Context

In [31]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# RDD

## Create RDD

### Create RDD from Parallelized Collections

In [19]:
data = [1, 2, 3, 4, 5]
data_rdd = sc.parallelize(data)

### Create RDD from External Datasets - such as local file, HDFS, 

In [32]:
from_file_rdd = sc.textFile('../week1/big_data_intro.txt')

In [33]:
from_file_rdd

../week1/big_data_intro.txt MapPartitionsRDD[63] at textFile at NativeMethodAccessorImpl.java:0

## RDD Operations

In [28]:
def demo_func(v):
    return v*v*v
    
result = data_rdd.map(demo_func)
print(result.collect())

[1, 8, 27, 64, 125]


## Word Count Example

In [39]:
input_data_path = '../week1/big_data_intro.txt'
input_data_rdd = sc.textFile(input_data_path)
print(input_data_rdd.getNumPartitions())

2


In [40]:
print(input_data_rdd.count())

117


In [43]:
#word_count_rdd = input_data_rdd.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

splitted_word_rdd = input_data_rdd.flatMap(lambda line: line.split())
print(splitted_word_rdd.count())
debug_result1 = splitted_word_rdd.collect()
print(debug_result1)

3069
['Introduction', 'Big', 'data', 'is', 'a', 'blanket', 'term', 'for', 'the', 'non-traditional', 'strategies', 'and', 'technologies', 'needed', 'to', 'gather', 'organize', 'process', 'and', 'gather', 'insights', 'from', 'large', 'datasets', 'While', 'the', 'problem', 'of', 'working', 'with', 'data', 'that', 'exceeds', 'the', 'computing', 'power', 'or', 'storage', 'of', 'a', 'single', 'computer', 'is', 'not', 'new', 'the', 'pervasiveness', 'scale', 'and', 'value', 'of', 'this', 'type', 'of', 'computing', 'has', 'greatly', 'expanded', 'in', 'recent', 'years', 'In', 'this', 'article', 'we', 'will', 'talk', 'about', 'big', 'data', 'on', 'a', 'fundamental', 'level', 'and', 'define', 'common', 'concepts', 'you', 'might', 'come', 'across', 'while', 'researching', 'the', 'subject', 'We', 'will', 'also', 'take', 'a', 'high-level', 'look', 'at', 'some', 'of', 'the', 'processes', 'and', 'technologies', 'currently', 'being', 'used', 'in', 'this', 'space', 'What', 'Is', 'Big', 'Data?', 'An', 'ex

In [44]:
word_map_rdd = splitted_word_rdd.map(lambda word: (word, 1))
print(word_map_rdd.count())
print(word_map_rdd.collect())

3069
[('Introduction', 1), ('Big', 1), ('data', 1), ('is', 1), ('a', 1), ('blanket', 1), ('term', 1), ('for', 1), ('the', 1), ('non-traditional', 1), ('strategies', 1), ('and', 1), ('technologies', 1), ('needed', 1), ('to', 1), ('gather', 1), ('organize', 1), ('process', 1), ('and', 1), ('gather', 1), ('insights', 1), ('from', 1), ('large', 1), ('datasets', 1), ('While', 1), ('the', 1), ('problem', 1), ('of', 1), ('working', 1), ('with', 1), ('data', 1), ('that', 1), ('exceeds', 1), ('the', 1), ('computing', 1), ('power', 1), ('or', 1), ('storage', 1), ('of', 1), ('a', 1), ('single', 1), ('computer', 1), ('is', 1), ('not', 1), ('new', 1), ('the', 1), ('pervasiveness', 1), ('scale', 1), ('and', 1), ('value', 1), ('of', 1), ('this', 1), ('type', 1), ('of', 1), ('computing', 1), ('has', 1), ('greatly', 1), ('expanded', 1), ('in', 1), ('recent', 1), ('years', 1), ('In', 1), ('this', 1), ('article', 1), ('we', 1), ('will', 1), ('talk', 1), ('about', 1), ('big', 1), ('data', 1), ('on', 1), (

In [45]:
word_count_rdd = word_map_rdd.reduceByKey(lambda a, b: a + b)

print(word_count_rdd.count())
print(word_count_rdd.collect())

931
[('is', 59), ('term', 7), ('non-traditional', 1), ('needed', 2), ('gather', 2), ('organize', 2), ('process', 15), ('large', 18), ('datasets', 11), ('of', 111), ('working', 4), ('power', 1), ('storage', 9), ('single', 7), ('new', 4), ('value', 5), ('this', 13), ('type', 5), ('in', 41), ('years', 1), ('we', 7), ('fundamental', 1), ('common', 4), ('concepts', 3), ('researching', 1), ('subject', 1), ('take', 3), ('high-level', 1), ('look', 2), ('at', 8), ('processes', 4), ('used', 13), ('space', 2), ('What', 2), ('Data?', 1), ('down', 2), ('projects', 6), ('vendors', 1), ('business', 2), ('professionals', 1), ('use', 2), ('quite', 2), ('mind', 2), ('category', 1), ('are', 35), ('handle', 4), ('means', 5), ('store', 2), ('traditional', 5), ('tooling', 1), ('constantly', 2), ('shifting', 1), ('may', 2), ('significantly', 4), ('The', 18), ('basic', 1), ('requirements', 7), ('as', 17), ('size', 1), ('However', 2), ('massive', 1), ('speed', 3), ('characteristics', 3), ('must', 1), ('stage',

In [35]:
word_count_list = word_map_rdd.collect()
print(word_count_list)

[('Introduction', 1), ('Big', 1), ('data', 1), ('is', 1), ('a', 1), ('blanket', 1), ('term', 1), ('for', 1), ('the', 1), ('non-traditional', 1), ('strategies', 1), ('and', 1), ('technologies', 1), ('needed', 1), ('to', 1), ('gather', 1), ('organize', 1), ('process', 1), ('and', 1), ('gather', 1), ('insights', 1), ('from', 1), ('large', 1), ('datasets', 1), ('While', 1), ('the', 1), ('problem', 1), ('of', 1), ('working', 1), ('with', 1), ('data', 1), ('that', 1), ('exceeds', 1), ('the', 1), ('computing', 1), ('power', 1), ('or', 1), ('storage', 1), ('of', 1), ('a', 1), ('single', 1), ('computer', 1), ('is', 1), ('not', 1), ('new', 1), ('the', 1), ('pervasiveness', 1), ('scale', 1), ('and', 1), ('value', 1), ('of', 1), ('this', 1), ('type', 1), ('of', 1), ('computing', 1), ('has', 1), ('greatly', 1), ('expanded', 1), ('in', 1), ('recent', 1), ('years', 1), ('In', 1), ('this', 1), ('article', 1), ('we', 1), ('will', 1), ('talk', 1), ('about', 1), ('big', 1), ('data', 1), ('on', 1), ('a', 