## Initialize Spark

In [1]:
import pyspark
#changed

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('pyspark-training').master('local[2]').getOrCreate()

In [3]:
spark.version

'3.2.0'

In [4]:
spark.getActiveSession()

In [5]:
spark.sparkContext.applicationId

'local-1641094265884'

In [6]:
sc = spark.sparkContext

In [8]:
sc.startTime

1641094263342

In [9]:
sc.setLogLevel('WARN')

In [10]:
sc.uiWebUrl

'http://Mohit.mshome.net:4040'

### Create an rdd using range

In [11]:
list(range(11,20,2))

[11, 13, 15, 17, 19]

In [12]:
rdd_range = sc.range(20)

In [13]:
sc.range(10,20,3).collect()

[10, 13, 16, 19]

In [14]:
rdd_range.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

# creating an rdd from a list

In [15]:
emp_detail = [('Adam', 26), ('Jack', 25), ('Rachel', 29)]

In [16]:
rdd_emp = sc.parallelize(emp_detail)

In [17]:
type(rdd_emp)

pyspark.rdd.RDD

# count the number of rows

In [18]:
rdd_emp.count()

3

In [19]:
emp_count = rdd_emp.count()

In [20]:
emp_count

3

# fetch the first row

In [21]:
emp_first = rdd_emp.first()

In [22]:
emp_first

('Adam', 26)

In [23]:
emp_first[0]

'Adam'

# collect the rdd  - collect should be avoided in the case of huge dataset.

In [24]:
rdd_emp.collect()

[('Adam', 26), ('Jack', 25), ('Rachel', 29)]

In [25]:
for i in rdd_emp.collect():
    print(i[0] + '-' + str(i[1]))

Adam-26
Jack-25
Rachel-29


In [26]:
name = [i[0] for i in rdd_emp.collect()]

In [27]:
name

['Adam', 'Jack', 'Rachel']

# get n rows

In [28]:
rdd_emp.take(2)

[('Adam', 26), ('Jack', 25)]

# word count

In [29]:
text = 'Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming'

In [30]:
text

'Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming'

In [31]:
rdd = sc.parallelize([text])

In [32]:
rdd.collect()

['Apache Spark is a fast and general-purpose cluster computing system. It provides high-level APIs in Java, Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set of higher-level tools including Spark SQL for SQL and structured data processing, MLlib for machine learning, GraphX for graph processing, and Spark Streaming']

In [33]:
def row_split(x):
    return x.split(' ')

In [34]:
rdd.map(lambda x : x.split(' ')).collect()

[['Apache',
  'Spark',
  'is',
  'a',
  'fast',
  'and',
  'general-purpose',
  'cluster',
  'computing',
  'system.',
  'It',
  'provides',
  'high-level',
  'APIs',
  'in',
  'Java,',
  'Scala,',
  'Python',
  'and',
  'R,',
  'and',
  'an',
  'optimized',
  'engine',
  'that',
  'supports',
  'general',
  'execution',
  'graphs.',
  'It',
  'also',
  'supports',
  'a',
  'rich',
  'set',
  'of',
  'higher-level',
  'tools',
  'including',
  'Spark',
  'SQL',
  'for',
  'SQL',
  'and',
  'structured',
  'data',
  'processing,',
  'MLlib',
  'for',
  'machine',
  'learning,',
  'GraphX',
  'for',
  'graph',
  'processing,',
  'and',
  'Spark',
  'Streaming']]

In [35]:
flatten_rdd = rdd.map(lambda x : x.split(' ')).flatMap(lambda x : x)

In [36]:
flatten_rdd.collect()

['Apache',
 'Spark',
 'is',
 'a',
 'fast',
 'and',
 'general-purpose',
 'cluster',
 'computing',
 'system.',
 'It',
 'provides',
 'high-level',
 'APIs',
 'in',
 'Java,',
 'Scala,',
 'Python',
 'and',
 'R,',
 'and',
 'an',
 'optimized',
 'engine',
 'that',
 'supports',
 'general',
 'execution',
 'graphs.',
 'It',
 'also',
 'supports',
 'a',
 'rich',
 'set',
 'of',
 'higher-level',
 'tools',
 'including',
 'Spark',
 'SQL',
 'for',
 'SQL',
 'and',
 'structured',
 'data',
 'processing,',
 'MLlib',
 'for',
 'machine',
 'learning,',
 'GraphX',
 'for',
 'graph',
 'processing,',
 'and',
 'Spark',
 'Streaming']

### Assign 1 to each word and use reduceByKey to sum each occurance of a word

In [37]:
#paired rdd -> rdd of key value

In [38]:
flatten_rdd.map(lambda x : (x, 1)).collect()

[('Apache', 1),
 ('Spark', 1),
 ('is', 1),
 ('a', 1),
 ('fast', 1),
 ('and', 1),
 ('general-purpose', 1),
 ('cluster', 1),
 ('computing', 1),
 ('system.', 1),
 ('It', 1),
 ('provides', 1),
 ('high-level', 1),
 ('APIs', 1),
 ('in', 1),
 ('Java,', 1),
 ('Scala,', 1),
 ('Python', 1),
 ('and', 1),
 ('R,', 1),
 ('and', 1),
 ('an', 1),
 ('optimized', 1),
 ('engine', 1),
 ('that', 1),
 ('supports', 1),
 ('general', 1),
 ('execution', 1),
 ('graphs.', 1),
 ('It', 1),
 ('also', 1),
 ('supports', 1),
 ('a', 1),
 ('rich', 1),
 ('set', 1),
 ('of', 1),
 ('higher-level', 1),
 ('tools', 1),
 ('including', 1),
 ('Spark', 1),
 ('SQL', 1),
 ('for', 1),
 ('SQL', 1),
 ('and', 1),
 ('structured', 1),
 ('data', 1),
 ('processing,', 1),
 ('MLlib', 1),
 ('for', 1),
 ('machine', 1),
 ('learning,', 1),
 ('GraphX', 1),
 ('for', 1),
 ('graph', 1),
 ('processing,', 1),
 ('and', 1),
 ('Spark', 1),
 ('Streaming', 1)]

In [39]:
flatten_rdd.map(lambda x : (x, 1)).reduceByKey(lambda x,y : x+y).collect()

[('Apache', 1),
 ('Spark', 3),
 ('is', 1),
 ('general-purpose', 1),
 ('It', 2),
 ('provides', 1),
 ('high-level', 1),
 ('APIs', 1),
 ('in', 1),
 ('Java,', 1),
 ('Scala,', 1),
 ('Python', 1),
 ('an', 1),
 ('optimized', 1),
 ('engine', 1),
 ('supports', 2),
 ('execution', 1),
 ('set', 1),
 ('of', 1),
 ('tools', 1),
 ('SQL', 2),
 ('processing,', 2),
 ('MLlib', 1),
 ('machine', 1),
 ('learning,', 1),
 ('GraphX', 1),
 ('graph', 1),
 ('a', 2),
 ('fast', 1),
 ('and', 5),
 ('cluster', 1),
 ('computing', 1),
 ('system.', 1),
 ('R,', 1),
 ('that', 1),
 ('general', 1),
 ('graphs.', 1),
 ('also', 1),
 ('rich', 1),
 ('higher-level', 1),
 ('including', 1),
 ('for', 3),
 ('structured', 1),
 ('data', 1),
 ('Streaming', 1)]

### kaggle sofia air quality dataset - https://www.kaggle.com/hmavrodiev/sofia-air-quality-dataset
### beta-Mercaptoethanol (bme) bme sensors for measuring pressure,temperature,humidity
### reading a text file
### row_id,sensor_id,location,lat,lon,timestamp,pressure,temperature,humidity

In [40]:
rdd_bme = sc.textFile('D:\\data\\sofia\\bme\\2017-07_bme280sof.csv')

In [41]:
type(rdd_bme)

pyspark.rdd.RDD

In [45]:
rdd_bme.take(5)

['id,sensor_id,location,lat,lon,timestamp,pressure,temperature,humidity',
 '1,2266,1140,42.738,23.272,2017-07-01T00:00:07,95270.27,23.46,62.48',
 '5,2292,1154,42.663000000000004,23.273000000000003,2017-07-01T00:00:08,94355.83,23.06,59.46',
 '7,3096,1558,42.7,23.36,2017-07-01T00:00:10,95155.81,26.53,44.38',
 '9,3428,1727,42.623999999999995,23.406,2017-07-01T00:00:12,94679.57,28.34,38.28']

### Remove header

In [46]:
header = rdd_bme.first()

In [47]:
header

'id,sensor_id,location,lat,lon,timestamp,pressure,temperature,humidity'

In [48]:
rdd_bme = rdd_bme.filter(lambda line: line != header)

In [49]:
rdd_bme.take(5)

['1,2266,1140,42.738,23.272,2017-07-01T00:00:07,95270.27,23.46,62.48',
 '5,2292,1154,42.663000000000004,23.273000000000003,2017-07-01T00:00:08,94355.83,23.06,59.46',
 '7,3096,1558,42.7,23.36,2017-07-01T00:00:10,95155.81,26.53,44.38',
 '9,3428,1727,42.623999999999995,23.406,2017-07-01T00:00:12,94679.57,28.34,38.28',
 '10,3472,1750,42.669,23.318,2017-07-01T00:00:13,94327.88,26.31,46.37']

In [50]:
# count the number of rows
rdd_bme.count()

701548

In [51]:
rdd_bme.take(1)

['1,2266,1140,42.738,23.272,2017-07-01T00:00:07,95270.27,23.46,62.48']

### split the file by ','

In [52]:
rdd_bme_split = rdd_bme.map(lambda row : row.split(','))

### count the number of sensors

In [53]:
rdd_bme_split.map(lambda row : row[1]).distinct().take(5)

['2266', '3096', '3428', '3472', '1846']

In [54]:
rdd_bme_split.map(lambda row : row[1]).distinct().count()

56

In [171]:
#rdd_bme_split.map(lambda row : row[1]).distinct().collect()

### collect the sensors to a list 

In [55]:
sensors = rdd_bme.map(lambda row : row.split(',')[1]).distinct().collect()

In [56]:
sensors[0:10]
# [u'788', u'6385', u'5472', u'12077', u'10075', u'20111', u'6433', u'5564', u'6419', u'6680']

['2266',
 '3096',
 '3428',
 '3472',
 '1846',
 '2228',
 '1954',
 '3620',
 '3436',
 '3092']

# number of records in sensor 2266

In [57]:
rdd_bme_split.filter(lambda row : row[1] == '2266').count()

17708

In [58]:
rdd_bme_split.filter(lambda row : row[1] == '2266').take(5)

[['1',
  '2266',
  '1140',
  '42.738',
  '23.272',
  '2017-07-01T00:00:07',
  '95270.27',
  '23.46',
  '62.48'],
 ['102',
  '2266',
  '1140',
  '42.738',
  '23.272',
  '2017-07-01T00:02:33',
  '95266.66',
  '23.37',
  '63.4'],
 ['201',
  '2266',
  '1140',
  '42.738',
  '23.272',
  '2017-07-01T00:05:00',
  '95258.14',
  '23.37',
  '63.22'],
 ['302',
  '2266',
  '1140',
  '42.738',
  '23.272',
  '2017-07-01T00:07:26',
  '95267.81',
  '23.23',
  '63.07'],
 ['400',
  '2266',
  '1140',
  '42.738',
  '23.272',
  '2017-07-01T00:09:53',
  '95267.84',
  '23.25',
  '63.37']]

### paired rdd (rdd in key/value pair)

In [59]:
rdd_bme_paired = rdd_bme.map(lambda row : row.split(',')).map(lambda row : (row[1], row))

In [60]:
rdd_bme_paired.take(1)

[('2266',
  ['1',
   '2266',
   '1140',
   '42.738',
   '23.272',
   '2017-07-01T00:00:07',
   '95270.27',
   '23.46',
   '62.48'])]

### count the number of records for each sensor

In [61]:
lst = [1,1,1,2,3,3,4,5,6,5,4,3]
dct = {}
for l in lst:
    if l in dct.keys():
        dct[l]+=1
    else:
        dct[l] = 1

In [62]:
dct

{1: 3, 2: 1, 3: 3, 4: 2, 5: 2, 6: 1}

In [63]:
rdd_bme_paired.countByKey()

defaultdict(int,
            {'2266': 17708,
             '2292': 11003,
             '3096': 17201,
             '3428': 15258,
             '3472': 17931,
             '1952': 15383,
             '1846': 18184,
             '3512': 17070,
             '2228': 17893,
             '3438': 14333,
             '1954': 18201,
             '3620': 15696,
             '3436': 13028,
             '3092': 18152,
             '2036': 17982,
             '1962': 18216,
             '3474': 17941,
             '2232': 16145,
             '2607': 18156,
             '2224': 18025,
             '3738': 18132,
             '3102': 17845,
             '2040': 9762,
             '2216': 11661,
             '3432': 18084,
             '2294': 18079,
             '2230': 17617,
             '2264': 18208,
             '1850': 17703,
             '2234': 18008,
             '3558': 6998,
             '2262': 18184,
             '1764': 18198,
             '3836': 18108,
             '2038': 18053,
     

### union

In [122]:
rdd1 =sc.parallelize([('Phoebe', 24), ('Monica', 25), ('Rachel', 26)])
rdd2 = sc.parallelize([('Joey', 27), ('Ross', 28), ('Chandler', 29)])

In [123]:
rdd_union = rdd1.union(rdd2)

In [124]:
rdd_union.collect()

[('Phoebe', 24),
 ('Monica', 25),
 ('Rachel', 26),
 ('Joey', 27),
 ('Ross', 28),
 ('Chandler', 29)]

### join

In [76]:
rdd_emp = sc.parallelize([(1, 'Williams'), (2, 'Mark')])
rdd_dept = sc.parallelize([(1, 'HR'), (2, 'IT')])

In [78]:
rdd_emp.join(rdd_dept).collect()

[(1, ('Williams', 'HR')), (2, ('Mark', 'IT'))]

### cartesian

In [125]:
rdd_orders = sc.parallelize([('o1'), ('o2')])
rdd_products =sc.parallelize([('facewash'), ('handwash'), ('bodywash')])

In [126]:
rdd_orders.cartesian(rdd_products).collect()

[('o1', 'facewash'),
 ('o1', 'handwash'),
 ('o1', 'bodywash'),
 ('o2', 'facewash'),
 ('o2', 'handwash'),
 ('o2', 'bodywash')]

### Persist

In [130]:
rdd_bme.persist(pyspark.StorageLevel.MEMORY_AND_DISK_2)

PythonRDD[80] at RDD at PythonRDD.scala:53

In [131]:
rdd_bme.count()

701548

In [132]:
rdd_bme.count()

701548

In [133]:
rdd_bme.unpersist()

PythonRDD[80] at RDD at PythonRDD.scala:53

### reduceByKey
### find the average pressure per sensor per day
### removing the rows with columns less than 9
### removing the rows with pressure value not defined
### creating key value pair with key as sensor_id and timestamp(date part), and value as pressure and 1

In [134]:
rdd_bme_mapped = rdd_bme_split.filter(lambda row : len(row) == 9)\
                              .filter(lambda row : row[6] != '')\
                              .map(lambda row : ((row[1], row[5][0:10]), (float(row[6]), 1)))

In [136]:
rdd_bme_mapped.take(10)

[(('2266', '2017-07-01'), (95270.27, 1)),
 (('2292', '2017-07-01'), (94355.83, 1)),
 (('3096', '2017-07-01'), (95155.81, 1)),
 (('3428', '2017-07-01'), (94679.57, 1)),
 (('3472', '2017-07-01'), (94327.88, 1)),
 (('1952', '2017-07-01'), (95314.52, 1)),
 (('1846', '2017-07-01'), (93616.77, 1)),
 (('3512', '2017-07-01'), (94962.39, 1)),
 (('2228', '2017-07-01'), (94982.91, 1)),
 (('3438', '2017-07-01'), (95099.81, 1))]

In [138]:
rdd_bme_mapped.reduceByKey(lambda x,y : (x[0]+y[0], x[1]+y[1])).take(5)

[(('2292', '2017-07-01'), (54339274.579999946, 577)),
 (('1952', '2017-07-01'), (51929139.60000006, 546)),
 (('3512', '2017-07-01'), (54215394.09000003, 572)),
 (('3438', '2017-07-01'), (53622333.59000005, 565)),
 (('3474', '2017-07-01'), (54400704.23999998, 576))]

In [102]:
rdd_bme_mapped.reduceByKey(lambda x,y : (x[0]+y[0], x[1]+y[1]))\
              .map(lambda x : (x[0], x[1][0]/ x[1][1])).take(10)

[(('2292', '2017-07-01'), 94175.51920277288),
 (('1952', '2017-07-01'), 95108.3142857144),
 (('3512', '2017-07-01'), 94782.15750000006),
 (('3438', '2017-07-01'), 94906.78511504433),
 (('3474', '2017-07-01'), 94445.6670833333),
 (('2232', '2017-07-01'), 94495.66624548737),
 (('3738', '2017-07-01'), 95039.42874564463),
 (('2040', '2017-07-01'), 94750.76236614858),
 (('3432', '2017-07-01'), 95144.56160558463),
 (('2294', '2017-07-01'), 92483.59339754815)]

### aggregateByKey
### find the max temperature for every date
### seqOp is defined which finds the max value in each partition for each key
### combOp is defined which finds the max value across partition for each key
### zeroVal is defined to store the initial value 

In [140]:
rdd_bme_mapped_temp = rdd_bme_split.filter(lambda row : len(row) == 9)\
                      .filter(lambda row : row[7] != '')\
                      .map(lambda row : ((row[1], row[5][0:10]), float(row[7])))

In [153]:
zeroVal = 0

def seqOp(acc, element):
    if acc > element :
        return acc
    else : 
        return element

def combOp(acc1, acc2):
    if acc1 > acc2:
        return acc1
    else:
        return acc2

In [154]:
rdd_bme_mapped_temp.aggregateByKey(zeroVal, seqOp, combOp).take(5)

[(('2292', '2017-07-01'), 43.38),
 (('1952', '2017-07-01'), 45.6),
 (('3512', '2017-07-01'), 52.07),
 (('3438', '2017-07-01'), 48.96),
 (('3474', '2017-07-01'), 41.09)]

### combineByKey
### find the min humidity for every date
### create combiner is similar to zeroVal in aggregateByKey. It initializes value for each key
### mergeValue does the local aggregations
### mergeCombiner does the aggregations for the keys across partitions

In [155]:
rdd_bme_mapped_hum = rdd_bme_split.filter(lambda row : len(row) == 9)\
                     .filter(lambda row : row[8] != '')\
                     .map(lambda row : ((row[1], row[5][0:10]), float(row[8])))

In [163]:
def createCombiner(humidity):
    return humidity

def mergeValue(acc, element):
    if acc < element:
        return acc
    else:
        return element

def mergeCombiner(acc1, acc2):
    if acc1 < acc2:
        return acc1
    else:
        return acc2


In [164]:
rdd_bme_mapped_hum.combineByKey(createCombiner, mergeValue, mergeCombiner).take(10)

[(('2292', '2017-07-01'), 14.14),
 (('1952', '2017-07-01'), 15.93),
 (('3512', '2017-07-01'), 8.85),
 (('3438', '2017-07-01'), 8.76),
 (('3474', '2017-07-01'), 15.14),
 (('2232', '2017-07-01'), 14.9),
 (('3738', '2017-07-01'), 3.59),
 (('2040', '2017-07-01'), 14.8),
 (('3432', '2017-07-01'), 10.09),
 (('2294', '2017-07-01'), 11.5)]

### broadcast

In [165]:
b = sc.broadcast([1, 2, 3, 4, 5])
b.value

[1, 2, 3, 4, 5]

In [166]:
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()

[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]

In [167]:
b.unpersist()

### repartition

In [168]:
rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
sorted(rdd.glom().collect())

[[1], [2, 3], [4, 5], [6, 7]]

In [169]:
rdd.repartition(2).glom().collect()

[[1, 4, 5, 6, 7], [2, 3]]

In [170]:
len(rdd.repartition(2).glom().collect())

2

In [171]:
len(rdd.repartition(10).glom().collect())

10

In [176]:
rdd_bme2 = rdd_bme.repartition(1000)

In [177]:
rdd_bme2.glom().collect()

KeyboardInterrupt: 