In [1]:
from pyspark.sql import SparkSession

# New API: build (or reuse) the SparkSession for this lecture example.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.251:7077")
    .appName("Lecture1_Example4_wordcount_examples")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext is taken from the session; log level is set to ERROR.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 16:34:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the book from HDFS as an RDD of text lines, then peek at the first ten elements.
book = spark_context.textFile("hdfs://192.168.2.251:9000/data/books/book-1.txt")
book.take(10)

["Harry Potter and the Sorcerer's Stone",
 '',
 '',
 'CHAPTER ONE',
 '',
 'THE BOY WHO LIVED',
 '',
 'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say',
 'that they were perfectly normal, thank you very much. They were the last',
 "people you'd expect to be involved in anything strange or mysterious,"]

In [5]:
# rdd.map(): build a new RDD by applying a function to every element of this RDD.
## Here each line of text becomes the list of tokens obtained by splitting on single spaces.
book_sp = book.map(lambda line: line.split(" "))
book_sp.take(5)

[['Harry', 'Potter', 'and', 'the', "Sorcerer's", 'Stone'],
 [''],
 [''],
 ['CHAPTER', 'ONE'],
 ['']]

In [6]:
# rdd.filter(): keep only the elements for which the predicate is true.
## Here we keep lines with at least two tokens; blank lines (which split into [''])
## and single-token lines are dropped as too short to be useful for analysis.
book_sp_1 = book_sp.filter(lambda tokens: len(tokens) > 1)
book_sp_1.take(5)

[['Harry', 'Potter', 'and', 'the', "Sorcerer's", 'Stone'],
 ['CHAPTER', 'ONE'],
 ['THE', 'BOY', 'WHO', 'LIVED'],
 ['Mr.',
  'and',
  'Mrs.',
  'Dursley,',
  'of',
  'number',
  'four,',
  'Privet',
  'Drive,',
  'were',
  'proud',
  'to',
  'say'],
 ['that',
  'they',
  'were',
  'perfectly',
  'normal,',
  'thank',
  'you',
  'very',
  'much.',
  'They',
  'were',
  'the',
  'last']]

In [7]:
# rdd.flatMap(): apply a function to every element, then flatten the results into one RDD.
## Here the per-line token lists are flattened into a single RDD of words.
book_sw = book_sp_1.flatMap(lambda tokens: tokens)
book_sw.take(20)

['Harry',
 'Potter',
 'and',
 'the',
 "Sorcerer's",
 'Stone',
 'CHAPTER',
 'ONE',
 'THE',
 'BOY',
 'WHO',
 'LIVED',
 'Mr.',
 'and',
 'Mrs.',
 'Dursley,',
 'of',
 'number',
 'four,',
 'Privet']

In [8]:
# rdd.groupBy(): group the elements of the RDD by the value a function returns for them.
## Here the words are grouped by their length, giving (length, words) pairs.
book_sw_fl = book_sw.groupBy(lambda word: len(word))
book_sw_fl.take(2)

[(6, <pyspark.resultiterable.ResultIterable at 0x7f3a2c42e380>),
 (10, <pyspark.resultiterable.ResultIterable at 0x7f3a2c42e470>)]

In [9]:
# The grouped values display only as ResultIterable objects; mapValues(list) converts
# each group to a list so the words inside it become visible.
book_sw_fl.mapValues(list).take(1)

[(6,
  ['Potter',
   'number',
   'Privet',
   'Drive,',
   'people',
   'expect',
   "didn't",
   'called',
   'hardly',
   'blonde',
   'nearly',
   'amount',
   'useful',
   'garden',
   'spying',
   'called',
   'Dudley',
   "didn't",
   'anyone',
   'Potter',
   "hadn't",
   'years;',
   "didn't",
   'sister',
   'reason',
   "didn't",
   'Dudley',
   'mixing',
   'cloudy',
   'things',
   'hummed',
   'picked',
   'boring',
   'Dudley',
   'chair.',
   'large,',
   'eight,',
   'picked',
   'pecked',
   'cheek,',
   'Dudley',
   'Dudley',
   'having',
   'cereal',
   'walls.',
   'tyke,"',
   'house.',
   'backed',
   'number',
   "four's",
   'drive.',
   'corner',
   'street',
   "didn't",
   'jerked',
   'around',
   'again.',
   'corner',
   'Privet',
   'Drive,',
   "wasn't",
   'sight.',
   'light.',
   'stared',
   'stared',
   'around',
   'corner',
   'Privet',
   'signs.',
   'little',
   'toward',
   'except',
   'drills',
   'hoping',
   'drills',
   'driven',
   'see

In [10]:
# rdd.groupByKey(): gather all values that share a key into a single sequence per key.
## NOTE: every element of the RDD must be a (key, value) pair.
## Here each word is first turned into a (word, 1) pair, then the pairs are grouped by word.
book_sw_p = book_sw.map(lambda word: (word, 1))
book_wk = book_sw_p.groupByKey()
book_wk.take(5)

[("Sorcerer's", <pyspark.resultiterable.ResultIterable at 0x7f3a2c20c160>),
 ('ONE', <pyspark.resultiterable.ResultIterable at 0x7f3a2c20c190>),
 ('BOY', <pyspark.resultiterable.ResultIterable at 0x7f3a2c20c1f0>),
 ('WHO', <pyspark.resultiterable.ResultIterable at 0x7f3a2c20c250>),
 ('Mr.', <pyspark.resultiterable.ResultIterable at 0x7f3a2c20c2b0>)]

In [11]:
# rdd.mapValues(): pass the value of every (key, value) pair through a function; the key is kept.
## Here list() expands each group so the individual 1s collected for a word are visible.
book_wk.mapValues(list).take(2)

[("Sorcerer's", [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 ('ONE', [1, 1])]

In [13]:
# rdd.reduceByKey(): merge the values of each key using an associative and commutative reduce function.
## NOTE: every element of the RDD must be a (key, value) pair.
## Here summing the 1s of the (word, 1) pairs per word yields the word count.
## `add` is also used by the pipelined first-letter count later in this notebook.
from operator import add
book_wordcount = book_sw_p.reduceByKey(add)
book_wordcount.take(20)

[("Sorcerer's", 16),
 ('ONE', 2),
 ('BOY', 1),
 ('WHO', 1),
 ('Mr.', 79),
 ('of', 1235),
 ('four,', 3),
 ('Privet', 16),
 ('Drive,', 3),
 ('say', 52),
 ('perfectly', 5),
 ('normal,', 2),
 ('very', 160),
 ('much.', 9),
 ('They', 154),
 ('last', 61),
 ("you'd", 16),
 ('in', 898),
 ('anything', 47),
 ('strange', 16)]

In [18]:
# Frequency of the word "Sorcerer's".
#
# NOTE: book_wordcount comes from reduceByKey(), so RDD.lookup() would pick the partition
# to search by hashing the key in the *driver* process. Python reads PYTHONHASHSEED only at
# interpreter start-up, so assigning it here does NOT change how this already-running kernel
# hashes strings; the driver's hash of a word need not match the hash the executors used when
# partitioning. lookup() could therefore search the wrong partition and return [] for a word
# that is present. Filtering on the key checks every partition and does not rely on hashing.
import os
# Kept from the original cell: it only makes the variable visible to driver-side code that
# checks for its presence; it does not make hashing in this kernel deterministic.
os.environ["PYTHONHASHSEED"] = str(123)
book_wordcount.filter(lambda pair: pair[0] == "Sorcerer's").values().collect()

[16]

In [19]:
# rdd.keys(): keep only the key of each (key, value) pair -- here, the words of the word count.
book_wordcount.keys().take(20)

["Sorcerer's",
 'ONE',
 'BOY',
 'WHO',
 'Mr.',
 'of',
 'four,',
 'Privet',
 'Drive,',
 'say',
 'perfectly',
 'normal,',
 'very',
 'much.',
 'They',
 'last',
 "you'd",
 'in',
 'anything',
 'strange']

In [20]:
# rdd.distinct(): build a new RDD that contains each distinct element of this RDD once.
# Compare the number of words before and after removing duplicates.
print("Before .distinct():", book_sw.count())
print("After  .distinct():", book_sw.distinct().count())

Before .distinct(): 78161
After  .distinct(): 11779


In [21]:
# rdd.keyBy(): turn each element e of this RDD into the tuple (f(e), e).
## Here every word is keyed by its first character -- the first step of a first-letter count.
book_sw.keyBy(lambda word: word[0]).take(5)

[('H', 'Harry'),
 ('P', 'Potter'),
 ('a', 'and'),
 ('t', 'the'),
 ('S', "Sorcerer's")]

In [22]:
# Pipelined operation: first-letter count in a single chain.
# Note: str.split(" ") never returns an empty list (a blank line gives ['']), so filtering
# the per-line token lists by length removes nothing; it is the empty *words* that must be
# dropped before indexing word[0].
sorted(                                           # sort the (character, count) pairs by character
    book.flatMap(lambda line: line.split(" "))    # split each line on single spaces and flatten into words
    .filter(lambda word: len(word) > 0)           # drop empty words (blank lines, repeated spaces)
    .map(lambda word: (word[0].lower(), 1))       # (lower-cased first character, 1) pairs
    .reduceByKey(add)                             # add up the 1s per first character
    .collect()                                    # collect the result on the driver
)

[('\t', 1),
 ('"', 2409),
 ("'", 53),
 ('(', 30),
 (',', 2),
 ('-', 837),
 ('.', 5),
 ('0', 2),
 ('1', 9),
 ('2', 2),
 ('3', 6),
 ('4', 3),
 ('9', 1),
 ('a', 7451),
 ('b', 3616),
 ('c', 2481),
 ('d', 2474),
 ('e', 1189),
 ('f', 2890),
 ('g', 1909),
 ('h', 8382),
 ('i', 3822),
 ('j', 329),
 ('k', 631),
 ('l', 2119),
 ('m', 2403),
 ('n', 1688),
 ('o', 3895),
 ('p', 2074),
 ('q', 358),
 ('r', 1684),
 ('s', 6247),
 ('t', 11247),
 ('u', 919),
 ('v', 476),
 ('w', 5306),
 ('y', 1484),
 ('z', 15)]

In [23]:
# define a function and use it in spark
def key_pair(x):
    """Build a ``(first_character, word, 1)`` triple for a word.

    Args:
        x: The word to describe. A ``str`` is expected; any indexable
            sequence with a length works.

    Returns:
        tuple: ``(x[0], x, 1)`` for a non-empty ``x``. For an empty ``x``
        the first field is ``""`` instead of raising ``IndexError``.
    """
    # Splitting text on single spaces can produce '' tokens; guard the index
    # so that one empty word cannot abort a whole map over the RDD.
    first = x[0] if len(x) > 0 else ""
    return (first, x, 1)
# A named function is passed to map() in place of a lambda; show the first five triples.
book_sw.map(key_pair).take(5)

[('H', 'Harry', 1),
 ('P', 'Potter', 1),
 ('a', 'and', 1),
 ('t', 'the', 1),
 ('S', "Sorcerer's", 1)]

In [24]:
# Stop the Spark session now that all examples have run.
spark_session.stop()