# How to initialize Spark?

In [79]:
from pyspark import SparkContext, SparkConf
# Use the SparkConf class imported above: `from pyspark import ...` does not bind
# the module name `pyspark`, so `pyspark.SparkConf()` raises NameError on a fresh kernel.
conf = SparkConf().setAppName("Spark Tutorials")
# getOrCreate returns the already-running SparkContext if there is one,
# which keeps this cell safe to re-run.
sc = SparkContext.getOrCreate(conf=conf)

Creating a SparkContext object tells Spark how to access the cluster.


# RDDs

Spark works on resilient distributed datasets (RDDs).
RDDs can be partitioned across multiple nodes, and operations on them can be done in parallel.
RDDs are immutable.

In [81]:
# Read inputs.txt into an RDD; the preview below shows one string per element.
dataset = sc.textFile("inputs.txt")
# Preview up to the first 10 elements.
dataset.take(10)

['Spark',
 'Tutorials ',
 'By ',
 'Python',
 '-',
 'Machine Learning',
 '.com',
 'Spark',
 'Is',
 'Used']

map(operation) : Returns a new RDD after applying operation to each element.

In [82]:
# Pair every element with the count 1, producing (element, 1) tuples.
map_example = dataset.map(lambda line: (line, 1))
map_example.take(20)

[('Spark', 1),
 ('Tutorials ', 1),
 ('By ', 1),
 ('Python', 1),
 ('-', 1),
 ('Machine Learning', 1),
 ('.com', 1),
 ('Spark', 1),
 ('Is', 1),
 ('Used', 1),
 ('For ', 1),
 ('Processing', 1),
 ('Big', 1),
 ('Data', 1),
 ('And', 1),
 ('Machine Learning', 1),
 ('In', 1),
 ('Various', 1),
 ('Industries.', 1)]

filter(operation) : Returns a new RDD after selecting the elements on which operation returns True.

In [84]:
# Keep only the elements that are longer than five characters.
filter_example = dataset.filter(lambda line: len(line) > 5)
filter_example.take(20)

['Tutorials ',
 'Python',
 'Machine Learning',
 'Processing',
 'Machine Learning',
 'Various',
 'Industries.']

groupByKey() : Returns a new RDD of (key, values) pairs in which all the values for each key are grouped together into a single iterable.

In [89]:
# Group the (element, 1) pairs by key. Each key's values come back wrapped in a
# ResultIterable, which is why the preview shows object reprs rather than lists.
groupByKey_example = map_example.groupByKey()
groupByKey_example.take(10)

[('Spark', <pyspark.resultiterable.ResultIterable at 0x10e936350>),
 ('By ', <pyspark.resultiterable.ResultIterable at 0x10e936310>),
 ('Python', <pyspark.resultiterable.ResultIterable at 0x10e9364d0>),
 ('Machine Learning', <pyspark.resultiterable.ResultIterable at 0x10e936390>),
 ('.com', <pyspark.resultiterable.ResultIterable at 0x10e936590>),
 ('And', <pyspark.resultiterable.ResultIterable at 0x10e936610>),
 ('Various', <pyspark.resultiterable.ResultIterable at 0x10e936690>),
 ('Tutorials ', <pyspark.resultiterable.ResultIterable at 0x10e92c910>),
 ('-', <pyspark.resultiterable.ResultIterable at 0x10e92cb50>),
 ('Is', <pyspark.resultiterable.ResultIterable at 0x10e92c790>)]

reduceByKey(operation) : Returns a new RDD where the values for each key are reduced using the given operation function.

In [90]:
# Add up the 1s attached to each key, giving the number of occurrences per element.
reduceByKey_example = map_example.reduceByKey(lambda left, right: left + right)
reduceByKey_example.take(20)

[('Spark', 2),
 ('By ', 1),
 ('Python', 1),
 ('Machine Learning', 2),
 ('.com', 1),
 ('And', 1),
 ('Various', 1),
 ('Tutorials ', 1),
 ('-', 1),
 ('Is', 1),
 ('Used', 1),
 ('For ', 1),
 ('Processing', 1),
 ('Big', 1),
 ('Data', 1),
 ('In', 1),
 ('Industries.', 1)]

sortByKey() : Returns a new RDD of (key, value) pairs sorted by key (in ascending order by default).

In [92]:
# Order the (element, 1) pairs by their key; ascending order is spelled out
# explicitly here (it is also the default).
sortByKey_example = map_example.sortByKey(ascending=True)
sortByKey_example.take(20)

[('-', 1),
 ('.com', 1),
 ('And', 1),
 ('Big', 1),
 ('By ', 1),
 ('Data', 1),
 ('For ', 1),
 ('In', 1),
 ('Industries.', 1),
 ('Is', 1),
 ('Machine Learning', 1),
 ('Machine Learning', 1),
 ('Processing', 1),
 ('Python', 1),
 ('Spark', 1),
 ('Spark', 1),
 ('Tutorials ', 1),
 ('Used', 1),
 ('Various', 1)]

coalesce(number) : Returns a new RDD with the number of partitions reduced to the given number.

In [93]:
# Request the pairs RDD in 5 partitions. Evaluating the bare RDD shows its
# repr (lineage description), not its contents.
coalesce_example = map_example.coalesce(5)
coalesce_example

CoalescedRDD[203] at coalesce at NativeMethodAccessorImpl.java:0

repartition(number) : Returns a new RDD with exactly the given number of partitions (more or fewer than before); the data is reshuffled randomly across them.

In [94]:
# Redistribute the pairs RDD into a single partition. Evaluating the bare RDD
# shows its repr, not its contents.
repartition_example = map_example.repartition(1)
repartition_example

MapPartitionsRDD[208] at coalesce at NativeMethodAccessorImpl.java:0

# Common Functions in Spark

saveAsTextFile(path) : Saves an RDD as a text file at the given path.

In [97]:
# Write the filtered elements out as text under the path "savefile.csv".
# NOTE(review): Spark presumably creates a directory of part files at this path and
# refuses to overwrite an existing one, so re-running this cell may fail — confirm,
# and remove the output between runs if so.
filter_example.saveAsTextFile("savefile.csv")

count : Returns the number of elements in an RDD.

In [98]:
# Count every (element, 1) pair in the mapped RDD.
count_example = map_example.count()
count_example

19

first : Returns the first element in the RDD.

In [100]:
# Fetch only the first element of the filtered RDD.
filter_example.first()

'Tutorials '

take(n) : Returns a list of the first n elements.

In [102]:
# Fetch the first five elements of the filtered RDD as a Python list.
filter_example.take(5)

['Tutorials ', 'Python', 'Machine Learning', 'Processing', 'Machine Learning']