In [1]:
import findspark
findspark.init()

In [2]:
import pandas as pd
from pyspark import SparkContext, SparkConf, SQLContext
# Reuse the running context when this cell is executed again; constructing a
# second SparkContext in the same session raises a ValueError.
sc = SparkContext.getOrCreate()

In [3]:
# Load the 'blogtexts' path as an RDD of text lines.
rdd = sc.textFile('blogtexts')
# Peek at the first five lines.
rdd.take(5)

['Think of it for a moment – 1 Qunitillion = 1 Million Billion! Can you imagine how many drives / CDs / Blue-ray DVDs would be required to store them? It is difficult to imagine this scale of data generation even as a data science professional. While this pace of data generation is very exciting,  it has created entirely new set of challenges and has forced us to find new ways to handle Big Huge data effectively.',
 '',
 'Big Data is not a new phenomena. It has been around for a while now. However, it has become really important with this pace of data generation. In past, several systems were developed for processing big data. Most of them were based on MapReduce framework. These frameworks typically rely on use of hard disk for saving and retrieving the results. However, this turns out to be very costly in terms of time and speed.',
 '',
 'On the other hand, Organizations have never been more hungrier to add a competitive differentiation through understanding this data and offering it

## General transformations

### Transformation: map and flatMap

In [4]:
def Func(lines):
    """Lower-case a string and split it into whitespace-separated words."""
    return lines.lower().split()
# map produces one output element per input line, so every line becomes its
# own list of words (an empty line becomes an empty list).
rdd1 = rdd.map(Func)
rdd1.take(5)

[['think',
  'of',
  'it',
  'for',
  'a',
  'moment',
  '–',
  '1',
  'qunitillion',
  '=',
  '1',
  'million',
  'billion!',
  'can',
  'you',
  'imagine',
  'how',
  'many',
  'drives',
  '/',
  'cds',
  '/',
  'blue-ray',
  'dvds',
  'would',
  'be',
  'required',
  'to',
  'store',
  'them?',
  'it',
  'is',
  'difficult',
  'to',
  'imagine',
  'this',
  'scale',
  'of',
  'data',
  'generation',
  'even',
  'as',
  'a',
  'data',
  'science',
  'professional.',
  'while',
  'this',
  'pace',
  'of',
  'data',
  'generation',
  'is',
  'very',
  'exciting,',
  'it',
  'has',
  'created',
  'entirely',
  'new',
  'set',
  'of',
  'challenges',
  'and',
  'has',
  'forced',
  'us',
  'to',
  'find',
  'new',
  'ways',
  'to',
  'handle',
  'big',
  'huge',
  'data',
  'effectively.'],
 [],
 ['big',
  'data',
  'is',
  'not',
  'a',
  'new',
  'phenomena.',
  'it',
  'has',
  'been',
  'around',
  'for',
  'a',
  'while',
  'now.',
  'however,',
  'it',
  'has',
  'become',
  'reall

In [5]:
# flatMap flattens the per-line word lists into a single RDD of words.
rdd2 = rdd.flatMap(Func)
rdd2.take(5)

['think', 'of', 'it', 'for', 'a']

### Transformation: filter

In [6]:
stopwords = ['is','am','are','the','for','a']
# The filter function runs once per word, so test membership against a set
# rather than scanning the list on every call.
stopword_set = set(stopwords)
rdd3 = rdd2.filter(lambda x: x not in stopword_set)
rdd3.take(10)

['think', 'of', 'it', 'moment', '–', '1', 'qunitillion', '=', '1', 'million']

### Transformation: groupBy

In [7]:
# Group the words by their first three characters.
rdd4 = rdd3.groupBy(lambda word: word[:3])
print([(prefix, list(words)) for prefix, words in rdd4.take(1)])

[('thi', ['think', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'think', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'things', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this', 'this.', 'this', 'this', 'things', 'this', 'this', 'this'])]


### Transformation: groupByKey/reduceByKey

In [9]:
# Turn every word into a (word, 1) pair, then gather the 1s of each word.
rdd3_mapped = rdd3.map(lambda word: (word, 1))
rdd3_grouped = rdd3_mapped.groupByKey()
print([(word, list(ones)) for word, ones in rdd3_grouped.take(5)])

[('think', [1, 1]), ('of', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), ('1', [1, 1]), ('qunitillion', [1]), ('=', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]


In [15]:
# Sum the 1s of each word, flip the pairs to (count, word), and sort by
# count with the largest first.
rdd3_freq_of_words = (
    rdd3_grouped
    .mapValues(sum)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
)

In the above code, I first applied the “**mapValues**” transformation to “rdd3_grouped”. “mapValues” (only applicable to pair RDDs) is similar to “map” (applicable to any RDD), with one difference: the function passed to “map” on a pair RDD receives both the key and the value of each element, whereas the function passed to “mapValues” receives only the value, and the key is left unchanged. Here it sums the list of 1s for each word. The result is then mapped to (count, word) pairs and sorted by count in descending order with “sortByKey”.

In [16]:
# The ten most frequent words as (count, word) pairs.
rdd3_freq_of_words.take(10)

[(164, 'to'),
 (143, 'in'),
 (122, 'of'),
 (106, 'and'),
 (103, 'we'),
 (69, 'spark'),
 (64, 'this'),
 (63, 'data'),
 (55, 'can'),
 (52, 'apache')]

In [17]:
# Word frequencies again, this time adding up the 1s with reduceByKey.
(
    rdd3_mapped
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .take(10)
)

[(164, 'to'),
 (143, 'in'),
 (122, 'of'),
 (106, 'and'),
 (103, 'we'),
 (69, 'spark'),
 (64, 'this'),
 (63, 'data'),
 (55, 'can'),
 (52, 'apache')]

![reducebykey](https://www.analyticsvidhya.com/wp-content/uploads/2016/10/reduceByKey-3.png)
![groupbykey](https://www.analyticsvidhya.com/wp-content/uploads/2016/10/groupbykey.png)

### Transformation: mapPartitions

In [20]:
def func(iterator):
    """Count how often 'spark' and 'apache' occur among the items of *iterator*.

    Returns a ``(spark_count, apache_count)`` tuple.
    """
    spark_total = apache_total = 0
    for word in iterator:
        # A comparison result is a bool, which adds as 0 or 1.
        spark_total += word == 'spark'
        apache_total += word == 'apache'
    return spark_total, apache_total

# glom() wraps the elements of each partition in a list, so the two counts
# returned by func stay grouped per partition.
rdd3.mapPartitions(func).glom().collect()

[[49, 39], [20, 13]]

I have used the “**glom**” function, which is very useful when we want to inspect the data in each partition of an RDD: it groups the elements of every partition into a list. The above result shows that 49 and 39 are the counts of ‘spark’ and ‘apache’ in partition 1, and 20 and 13 are the counts of ‘spark’ and ‘apache’ in partition 2. If we don’t use the “glom” function, we won’t be able to tell which results came from which partition.

In [27]:
# Without glom(), collect() returns the per-partition counts as one flat list.
rdd3.mapPartitions(func).collect()

[49, 39, 20, 13]

## Math/Statistical Transformation

### Transformation: sample
We can pass the following arguments to the sample operation:

“**withReplacement** = True” or False (to choose the sample with or without replacement)
“**fraction** = x” (x = 0.4 means we want to choose roughly 40% of the data in the RDD) and “**seed**” to reproduce the results.

In [28]:
# Sample without replacement, with fraction 0.4 and a fixed seed of 42.
rdd3_sampled = rdd3.sample(withReplacement=False, fraction=0.4, seed=42)
# count() gives the sizes without collecting every element to the driver.
rdd3.count(), rdd3_sampled.count()

(4768, 1872)

## Set Theory/Relational Transformation

### Transformation: union

In [31]:
# Both samples use the same fraction and seed.
sample1 = rdd3.sample(withReplacement=False, fraction=0.2, seed=42)
sample2 = rdd3.sample(withReplacement=False, fraction=0.2, seed=42)
union_of_sample1_sample2 = sample1.union(sample2)
# count() gives the sizes without collecting every element to the driver.
sample1.count(), sample2.count(), union_of_sample1_sample2.count()

(931, 931, 1862)

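The union operation did not remove the duplicate elements: although both samples were drawn with the same seed, the union contains 931 + 931 = 1862 elements.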

### Transformation: join

In [34]:
# Join two samples of the (word, 1) pairs on their key, the word.
sample1 = rdd3_mapped.sample(withReplacement=False, fraction=0.2, seed=42)
sample2 = rdd3_mapped.sample(withReplacement=False, fraction=0.2, seed=42)
join_on_sample1_sample2 = sample1.join(sample2)
join_on_sample1_sample2.take(2)

[('think', (1, 1)), ('even', (1, 1))]

### Transformation: distinct

In [35]:
# Keep one copy of every word.
rdd3_distinct = rdd3.distinct()
# count() gives the number of distinct words without collecting them to the driver.
rdd3_distinct.count()

1485

## Data Structure/I/O Transformation

### Transformation: coalesce

In [36]:
# Number of partitions rdd3 is currently split into.
rdd3.getNumPartitions()

2

In [37]:
# coalesce(1) reduces the RDD to a single partition.
rdd3_coalesce = rdd3.coalesce(1)
rdd3_coalesce.getNumPartitions()

1

## General Actions

### Action: getNumPartitions
With “getNumPartitions”, we can find out how many partitions exist in our RDD.

### Action: Reduce

In [38]:
# Distribute the integers 1..999 and add them all up pairwise.
num_rdd = sc.parallelize(range(1, 1000))
num_rdd.reduce(lambda total, value: total + value)

499500

## Mathematical/Statistical Actions

### Action: count

In [39]:
# Number of elements in rdd3.
rdd3.count()

4768

### Action: max,min,sum,variance and stdev

In [41]:
# Summary statistics of num_rdd, one action per value.
(
    num_rdd.max(),
    num_rdd.min(),
    num_rdd.sum(),
    num_rdd.variance(),
    num_rdd.stdev(),
)

(999, 1, 499500, 83166.66666666667, 288.38631497813253)