In [14]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('practise').getOrCreate()

- Lazy Evaluation

In [2]:
rdd = spark.sparkContext.parallelize([1,2])

new_rdd = rdd.map(lambda x: x+1)

In [3]:
print(new_rdd)
print('waiting....')
print(new_rdd.collect())

PythonRDD[1] at RDD at PythonRDD.scala:53
waiting....
[2, 3]


- Partitions
- repartition can increase or decrease and shuffle data

In [4]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6])
print(rdd.getNumPartitions())

16


In [5]:
new_rdd = rdd.repartition(10)
print(new_rdd.getNumPartitions())

10


- coalesce()
can only decrease partitions,
can increase performance

* Spark has 3 data types :
- Rdd
- Data Frames
- Data sets - python doesnt support this

### RDD - Resilient Distributed Dataset

- Immutable.
- Low level API.
- Doesnt allow SQL support.

- RDD from dataframe

In [6]:
df = spark.read.csv('./data/SalesAnalysis.csv',header=True)
rdd = df.rdd
print(type(df),type(rdd))

<class 'pyspark.sql.dataframe.DataFrame'> <class 'pyspark.rdd.RDD'>


- Transformations

In [7]:
sc = spark.sparkContext

In [8]:
my_list = ['ksksk','sjsjhjs','shhhjs','sjjsjs']

rdd = sc.parallelize(my_list)

print(rdd.collect())

['ksksk', 'sjsjhjs', 'shhhjs', 'sjjsjs']


In [9]:
new_rdd = rdd.map(lambda x: x.upper())
print(new_rdd.collect())

['KSKSK', 'SJSJHJS', 'SHHHJS', 'SJJSJS']


In [10]:
rdd = sc.parallelize([1,2,3,4,56,999])
new_rdd = rdd.filter(lambda x : x %2 ==0)
print(new_rdd.collect())

[2, 4, 56]


In [11]:
rdd = sc.parallelize(['hello muki','hello sreya','hello jagan','hello gigi'])
new_rdd = rdd.flatMap(lambda x : x.split(" "))
print(new_rdd.collect())

['hello', 'muki', 'hello', 'sreya', 'hello', 'jagan', 'hello', 'gigi']


In [12]:
list = [(25,1000),(30,1000),(25,6000),(45,7000)]
rdd = sc.parallelize(list)
gr_rdd = rdd.groupByKey()
print(gr_rdd.collect())

[(25, <pyspark.resultiterable.ResultIterable object at 0x000001C7D8655690>), (45, <pyspark.resultiterable.ResultIterable object at 0x000001C7D8657B90>), (30, <pyspark.resultiterable.ResultIterable object at 0x000001C7D8656D50>)]


In [13]:
g_rdd = gr_rdd.mapValues(lambda x : sum(x))
print(g_rdd.collect())

[(25, 7000), (45, 7000), (30, 1000)]


In [15]:
list = [(25,1000),(30,1000),(25,6000),(45,7000),(25,4000)]
rdd = sc.parallelize(list)
gr_rdd = rdd.reduceByKey(lambda x,y: x +y)
print(gr_rdd.collect())

[(25, 11000), (45, 7000), (30, 1000)]


In [16]:
sor_rdd = rdd.sortBy(lambda x : x,ascending= False)
print(sor_rdd.collect())

[(45, 7000), (30, 1000), (25, 6000), (25, 4000), (25, 1000)]


- sort based on values

In [18]:
sor_rdd = rdd.sortBy(lambda x : x[1],ascending= False)
print(sor_rdd.collect())

[(45, 7000), (25, 6000), (25, 4000), (25, 1000), (30, 1000)]


- difference map and flatmap
- Map : takes one element and produce one element
- flatMap : takes one element and produce one or more elements
- map preserves the structure of the stream, while flatMap flattens it.

In [19]:
rdd = sc.parallelize(['hello muki','hello sreya','hello jagan','hello gigi'])
new_rdd = rdd.map(lambda x : x.split(" "))
print(new_rdd.collect())

[['hello', 'muki'], ['hello', 'sreya'], ['hello', 'jagan'], ['hello', 'gigi']]


In [20]:
rdd = sc.parallelize(['hello muki','hello sreya','hello jagan','hello gigi'])
new_rdd = rdd.flatMap(lambda x : x.split(" "))
print(new_rdd.collect())

['hello', 'muki', 'hello', 'sreya', 'hello', 'jagan', 'hello', 'gigi']


- The difference between groupByKey and reduceByKey in PySpark is that groupByKey groups all the values associated with the same key into an iterable collection, while reduceByKey combines the values of each key using a function. Both operations return a new RDD with unique keys and the corresponding values.
- groupbykey first shuffles all data so low performance.
- reducebykey first shuffles data in a node and then combines so high performance.

### Word Count

In [28]:
rdd = sc.textFile('./data/text.txt')
rdd.collect()

['Quod equidem non reprehendo;',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quibus natura iure responderit non esse verum aliunde finem beate vivendi, a se principia rei gerendae peti; Quae enim adhuc protulisti, popularia sunt, ego autem a te elegantiora desidero. Duo Reges: constructio interrete. Tum Lucius: Mihi vero ista valde probata sunt, quod item fratri puto. Bestiarum vero nullum iudicium puto. Nihil enim iam habes, quod ad corpus referas; Deinde prima illa, quae in congressu solemus: Quid tu, inquit, huc? Et homini, qui ceteris animantibus plurimum praestat, praecipue a natura nihil datum esse dicemus?',
 '',
 'Iam id ipsum absurdum, maximum malum neglegi. Quod ea non occurrentia fingunt, vincunt Aristonem; Atqui perspicuum est hominem e corpore animoque constare, cum primae sint animi partes, secundae corporis. Fieri, inquam, Triari, nullo pacto potest, ut non dicas, quid non probes eius, a quo dissentias. Equidem e Cn. An dubium est, quin virtus ita maximam 

In [22]:
new_rdd = rdd.flatMap(lambda x : x.split(" "))
new_rdd.collect()

['Quod',
 'equidem',
 'non',
 'reprehendo;',
 'Lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'Quibus',
 'natura',
 'iure',
 'responderit',
 'non',
 'esse',
 'verum',
 'aliunde',
 'finem',
 'beate',
 'vivendi,',
 'a',
 'se',
 'principia',
 'rei',
 'gerendae',
 'peti;',
 'Quae',
 'enim',
 'adhuc',
 'protulisti,',
 'popularia',
 'sunt,',
 'ego',
 'autem',
 'a',
 'te',
 'elegantiora',
 'desidero.',
 'Duo',
 'Reges:',
 'constructio',
 'interrete.',
 'Tum',
 'Lucius:',
 'Mihi',
 'vero',
 'ista',
 'valde',
 'probata',
 'sunt,',
 'quod',
 'item',
 'fratri',
 'puto.',
 'Bestiarum',
 'vero',
 'nullum',
 'iudicium',
 'puto.',
 'Nihil',
 'enim',
 'iam',
 'habes,',
 'quod',
 'ad',
 'corpus',
 'referas;',
 'Deinde',
 'prima',
 'illa,',
 'quae',
 'in',
 'congressu',
 'solemus:',
 'Quid',
 'tu,',
 'inquit,',
 'huc?',
 'Et',
 'homini,',
 'qui',
 'ceteris',
 'animantibus',
 'plurimum',
 'praestat,',
 'praecipue',
 'a',
 'natura',
 'nihil',
 'datum',
 'esse',
 'd

In [24]:
rddm = new_rdd.map(lambda x : (x,1))
rddm.collect()

[('Quod', 1),
 ('equidem', 1),
 ('non', 1),
 ('reprehendo;', 1),
 ('Lorem', 1),
 ('ipsum', 1),
 ('dolor', 1),
 ('sit', 1),
 ('amet,', 1),
 ('consectetur', 1),
 ('adipiscing', 1),
 ('elit.', 1),
 ('Quibus', 1),
 ('natura', 1),
 ('iure', 1),
 ('responderit', 1),
 ('non', 1),
 ('esse', 1),
 ('verum', 1),
 ('aliunde', 1),
 ('finem', 1),
 ('beate', 1),
 ('vivendi,', 1),
 ('a', 1),
 ('se', 1),
 ('principia', 1),
 ('rei', 1),
 ('gerendae', 1),
 ('peti;', 1),
 ('Quae', 1),
 ('enim', 1),
 ('adhuc', 1),
 ('protulisti,', 1),
 ('popularia', 1),
 ('sunt,', 1),
 ('ego', 1),
 ('autem', 1),
 ('a', 1),
 ('te', 1),
 ('elegantiora', 1),
 ('desidero.', 1),
 ('Duo', 1),
 ('Reges:', 1),
 ('constructio', 1),
 ('interrete.', 1),
 ('Tum', 1),
 ('Lucius:', 1),
 ('Mihi', 1),
 ('vero', 1),
 ('ista', 1),
 ('valde', 1),
 ('probata', 1),
 ('sunt,', 1),
 ('quod', 1),
 ('item', 1),
 ('fratri', 1),
 ('puto.', 1),
 ('Bestiarum', 1),
 ('vero', 1),
 ('nullum', 1),
 ('iudicium', 1),
 ('puto.', 1),
 ('Nihil', 1),
 ('enim', 

In [25]:
rddr = rddm.reduceByKey(lambda x,y : x+y)
rddr.collect()

[('Quod', 4),
 ('non', 11),
 ('reprehendo;', 1),
 ('Lorem', 1),
 ('dolor', 2),
 ('adipiscing', 1),
 ('Quibus', 1),
 ('esse', 8),
 ('aliunde', 1),
 ('enim', 5),
 ('adhuc', 1),
 ('protulisti,', 1),
 ('popularia', 1),
 ('autem', 4),
 ('desidero.', 1),
 ('Reges:', 1),
 ('Tum', 1),
 ('Lucius:', 1),
 ('valde', 1),
 ('quod', 7),
 ('item', 1),
 ('puto.', 2),
 ('Bestiarum', 1),
 ('nullum', 1),
 ('iam', 1),
 ('corpus', 1),
 ('Deinde', 1),
 ('prima', 1),
 ('illa,', 1),
 ('quae', 3),
 ('congressu', 1),
 ('tu,', 1),
 ('inquit,', 1),
 ('Et', 1),
 ('homini,', 1),
 ('qui', 2),
 ('ceteris', 1),
 ('nihil', 3),
 ('datum', 1),
 ('', 5),
 ('Iam', 1),
 ('id', 2),
 ('absurdum,', 1),
 ('maximum', 1),
 ('malum', 1),
 ('neglegi.', 1),
 ('ea', 2),
 ('occurrentia', 1),
 ('fingunt,', 1),
 ('vincunt', 1),
 ('Atqui', 1),
 ('est', 3),
 ('cum', 6),
 ('partes,', 1),
 ('inquam,', 2),
 ('nullo', 1),
 ('potest,', 1),
 ('ut', 6),
 ('quid', 2),
 ('probes', 1),
 ('quo', 1),
 ('Cn.', 1),
 ('dubium', 1),
 ('quin', 1),
 ('ita',

In [26]:
rdds = rddr.sortBy(lambda x : x[1],ascending=False)
rdds.collect()

[('non', 11),
 ('esse', 8),
 ('quod', 7),
 ('cum', 6),
 ('ut', 6),
 ('a', 6),
 ('enim', 5),
 ('', 5),
 ('ita', 5),
 ('et', 5),
 ('ad', 5),
 ('in', 5),
 ('Quod', 4),
 ('autem', 4),
 ('te', 4),
 ('est,', 4),
 ('si', 4),
 ('quae', 3),
 ('nihil', 3),
 ('est', 3),
 ('ipsum', 3),
 ('natura', 3),
 ('se', 3),
 ('sunt,', 3),
 ('vero', 3),
 ('rebus', 3),
 ('ipsa', 3),
 ('ne', 3),
 ('modo', 3),
 ('quam', 3),
 ('Sed', 3),
 ('mihi', 3),
 ('dolor', 2),
 ('puto.', 2),
 ('qui', 2),
 ('id', 2),
 ('ea', 2),
 ('inquam,', 2),
 ('quid', 2),
 ('me', 2),
 ('nec', 2),
 ('omnis', 2),
 ('At', 2),
 ('sed', 2),
 ('etiam', 2),
 ('haec', 2),
 ('sit', 2),
 ('verum', 2),
 ('beate', 2),
 ('Quae', 2),
 ('ista', 2),
 ('e', 2),
 ('primae', 2),
 ('dicas,', 2),
 ('An', 2),
 ('virtus', 2),
 ('M.', 2),
 ('Cum', 2),
 ('tamen', 2),
 ('igitur', 2),
 ('quidem.', 2),
 ('parum', 2),
 ('quidem', 2),
 ('reprehendo;', 1),
 ('Lorem', 1),
 ('adipiscing', 1),
 ('Quibus', 1),
 ('aliunde', 1),
 ('adhuc', 1),
 ('protulisti,', 1),
 ('popula

In [27]:
rddsi = rdds.filter(lambda x : x[0] == 'si')
rddsi.collect()

[('si', 4)]

In [33]:
new_rdd = rdd.flatMap(lambda x : x.split(" ")).map(lambda x : (x,1)).reduceByKey(lambda x,y : x+y).sortBy(lambda x : x[1],ascending=False)

In [36]:
new_rdd.take(3)

[('non', 11), ('esse', 8), ('quod', 7)]

In [37]:
new_rdd.count()

413

In [38]:
new_rdd.first()

('non', 11)

In [43]:
new_rdd.foreach(lambda x : print('word'+'='+str(x[0])+'count ='+ str(x[1])))

- reduce vs reducebykey
- reduce is action 
- reducebykey is transformation

In [45]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6])
sum = rdd.reduce(lambda x,y : x+y)
sum

21