<a href="https://colab.research.google.com/github/manoj7pal/Google-Colab-Notebooks/blob/master/2_RDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RDD has 2 operations:

1. Transformation : 
    - new calculations over the dataset, 
    - create new paritions with transformed data in the form of RDD, 
    - It has mapping os the raw dataset and the transformed dataset.
    - Executes on Spark Cluster or Worker Nodes

2. Action
    - it converts RDD into human readable format 
    - Functions like print, display, count(), take()
    - Executes on the Driver or the Master nodes.


In [1]:
# !pip3 install pyspark

In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
conf = SparkConf().setAppName('RDD_practice').setMaster('local[*]')
sc = SparkContext(conf = conf)
print(sc.defaultParallelism)
print(sc)

2
<SparkContext master=local[*] appName=RDD_practice>


Section 1 - Create RDD and Basic Operations

In [54]:
# Generate Random Data

import random

randomList = random.sample(range(0,20), 10)
print(randomList)


[0, 2, 3, 4, 19, 15, 16, 9, 6, 8]


In [55]:
# Create RDD

rdd1 = sc.parallelize(randomList,4 ) # 4: no of partitions

print(type(rdd1))
rdd1.collect()

<class 'pyspark.rdd.RDD'>


[0, 2, 3, 4, 19, 15, 16, 9, 6, 8]

In [56]:
# Data Distribution in partition: tries to max numbers of equal partitions

print(rdd1.getNumPartitions())
print(rdd1.glom().collect())

print(rdd1.glom().take(2) )

4
[[0, 2], [3, 4], [19, 15], [16, 9, 6, 8]]
[[0, 2], [3, 4]]


In [57]:
# count()

rdd1.count()

10

In [58]:
# first()

rdd1.first()

0

In [59]:
# top()

rdd1.top(2)

[19, 16]

# Transformation Functions

In [60]:
# distinct() 

rdd1.distinct().collect()

[0, 4, 16, 8, 9, 2, 6, 3, 19, 15]

In [61]:
# map() - can map a function to a list of values

def fun1(num):
  return num**2

rdd_map = rdd1.map(fun1)

print(randomList)
print(rdd_map.collect())

[0, 2, 3, 4, 19, 15, 16, 9, 6, 8]
[0, 4, 9, 16, 361, 225, 256, 81, 36, 64]


In [78]:
# simple map - using multiple o/p expressions

rdd_fm = rdd1.map(lambda x: [x**2, x**3])

print(rdd1.collect())
print(rdd_fm.collect())

print("--"*60)

print(rdd1.glom().collect())
print(rdd_fm.glom().collect())

[0, 2, 3, 4, 19, 15, 16, 9, 6, 8]
[[0, 0], [4, 8], [9, 27], [16, 64], [361, 6859], [225, 3375], [256, 4096], [81, 729], [36, 216], [64, 512]]
------------------------------------------------------------------------------------------------------------------------
[[0, 2], [3, 4], [19, 15], [16, 9, 6, 8]]
[[[0, 0], [4, 8]], [[9, 27], [16, 64]], [[361, 6859], [225, 3375]], [[256, 4096], [81, 729], [36, 216], [64, 512]]]


In [62]:
print(rdd_map.glom().collect())

[[0, 4], [9, 16], [361, 225], [256, 81, 36, 64]]


In [63]:
rdd_map = rdd1.map( lambda x: x**2 )
print(rdd_map.collect())

[0, 4, 9, 16, 361, 225, 256, 81, 36, 64]


In [66]:
# filter

rdd_filter = rdd1.filter(lambda x: x>5)
print(randomList)
print(rdd_filter.collect())

[0, 2, 3, 4, 19, 15, 16, 9, 6, 8]
[19, 15, 16, 9, 6, 8]


In [67]:
print(rdd1.glom().collect())
print(rdd_filter.glom().collect())

[[0, 2], [3, 4], [19, 15], [16, 9, 6, 8]]
[[], [], [19, 15], [16, 9, 6, 8]]


In [68]:
print(rdd1.count())
print(rdd_filter.count())

10
6


In [69]:
#Repartitioning the RDD - when the items are filtered/removed out

if rdd_filter.count() >=4:
  new_rdd_filter = rdd_filter.repartition(2)

print(rdd_filter.glom().collect())  
print(new_rdd_filter.glom().collect())  

[[], [], [19, 15], [16, 9, 6, 8]]
[[19, 15, 16, 9, 6, 8], []]


In [80]:
# flatMap() and reduce() - map to a collection of o/p expressions, 
  # and performs aggregations using reduce()

rdd_fm = rdd1.flatMap(lambda x: [x**2, x**3])

print(rdd1.collect)
print(rdd_fm.collect())


<bound method RDD.collect of ParallelCollectionRDD[131] at readRDDFromFile at PythonRDD.scala:274>
[0, 0, 4, 8, 9, 27, 16, 64, 361, 6859, 225, 3375, 256, 4096, 81, 729, 36, 216, 64, 512]


In [81]:
print(rdd1.glom().collect())
print(rdd_fm.glom().collect())

[[0, 2], [3, 4], [19, 15], [16, 9, 6, 8]]
[[0, 0, 4, 8], [9, 27, 16, 64], [361, 6859, 225, 3375], [256, 4096, 81, 729, 36, 216, 64, 512]]


In [82]:
rdd_fm.reduce(lambda x,y : x+y)

16938

In [84]:
# Descriptive Statistics:

print(rdd1.max(), rdd1.min(), rdd1.count(), rdd1.mean(), round(rdd1.stdev(),2), rdd1.sum())

19 0 10 8.2 6.16 82


In [90]:
# mapPartitions() : Map a function to each parition

def fun1(partition):
  sum=0

  for item in partition:
    sum += item
  
  yield sum

rdd_map_part = rdd1.mapPartitions( fun1 ).collect()

print(rdd1.glom().collect())
print(rdd_map_part)

[[0, 2], [3, 4], [19, 15], [16, 9, 6, 8]]
[2, 7, 34, 39]
