In [1]:
from pyspark import SparkContext, SparkConf
# Point the driver at the standalone master and name the application 'WordCount'.
conf = SparkConf().setMaster('spark://master:7077').setAppName('WordCount')
# getOrCreate() returns the context already running in this kernel if there is one,
# so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once". When a context already exists,
# `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/06 09:50:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Only report errors from here on, to keep the cell outputs readable.
sc.setLogLevel('ERROR')
# A bare `sc.version` in the middle of a cell is evaluated and thrown away (only the
# last expression of a cell is displayed), so print it explicitly.
print('Spark version:', sc.version)
# Last expression of the cell: displayed as the cell result.
sc.getConf().getAll()

[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.app.startTime', '1667728247021'),
 ('spark.master', 'spark://master:7077'),
 ('spark.driver.host', 'master'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.port', '41137'),
 ('spark.rdd.compress', 'True'),
 ('spark.execu

1. Count the number of times a word appears in the file  
First, we remove all special characters using the `re` (regex) library, strip leading and trailing spaces, and lowercase all characters.

In [3]:
import re
import math
import time 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def removePunctuation(text):
    """Normalise one line of text for word counting.

    Every character other than an ASCII letter, a digit or a space is deleted;
    the result is then stripped of leading/trailing whitespace and lower-cased.
    """
    lettersDigitsSpaces = re.sub(r'[^A-Za-z0-9 ]', '', text)
    trimmed = lettersDigitsSpaces.strip()
    return trimmed.lower()

# English stop words from the NLTK 'stopwords' corpus downloaded above; the
# word-count pipeline below drops every word found in this collection.
stopwordsList = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vagrant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Then we load the text file into an RDD.  
Apply flatMap to map each line to a list of words, and filter out all the stop words.  
Map each word to a tuple whose key is the word and whose value is 1.  
Use reduceByKey to sum all the values that share the same key.  

In [4]:
# `word not in stopwordsList` scans the whole stop-word list for every word of the
# corpus; a frozenset gives constant-time membership tests instead. It is built
# here so that `stopwordsList` itself keeps its original type.
stopwordsSet = frozenset(stopwordsList)

wordCountRDD = (
    sc
    # Alternative input that was previously toggled by hand:
    # 'data/1B.language.model.test.txt'
    .textFile('data/1B.language.model.txt', 8)      # 8 = minimum number of partitions
    .map(removePunctuation)                          # normalise each line
    .flatMap(lambda line: line.split())              # line -> individual words
    .filter(lambda word: word not in stopwordsSet)   # drop stop words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)                 # (word, number of appearances)
    # Several later cells run actions on this RDD; keep the reduced counts in
    # memory so they do not have to be rebuilt for every action.
    .cache()
)

In [5]:
# Show the 15 words with the highest counts, then the wall-clock time it took.
start_time = time.time()
topWords = wordCountRDD.takeOrdered(15, key=lambda pair: -pair[1])
print(topWords)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

                                                                                

[('said', 4308248), ('would', 1531729), ('new', 1488263), ('one', 1462455), ('us', 1238052), ('also', 1223178), ('year', 1139189), ('two', 1119560), ('people', 1100102), ('last', 1077763), ('first', 1034435), ('mr', 904485), ('years', 873666), ('could', 868860), ('time', 852404)]
--- 1312.7914786338806 seconds ---


2. Calculate the most common word in the file  
Apply the max method to the RDD to get the tuple with the maximum value.

In [6]:
# Find the (word, count) pair with the largest count and report the elapsed time.
start_time = time.time()
mostCommon = wordCountRDD.max(key=lambda pair: pair[1])
print(mostCommon)
print("--- %s seconds ---" % (time.time() - start_time))



('said', 4308248)
--- 14.639453887939453 seconds ---


                                                                                

3. Calculate the average number of appearances of all words.  
First we calculate the total number of appearances by summing the values of all tuples.  
Then we divide it by the number of distinct words.

In [7]:
start_time = time.time()
# One pass over the counts yields both the total number of appearances and the
# number of distinct words, instead of running a reduce() job and a count() job.
totalAppearances, distinctWords = (
    wordCountRDD
    .values()
    .aggregate(
        (0, 0),
        lambda acc, count: (acc[0] + count, acc[1] + 1),   # fold one count into a partition total
        lambda left, right: (left[0] + right[0], left[1] + right[1]),   # merge partition totals
    )
)
# Guard against an empty RDD, which would otherwise raise ZeroDivisionError.
print(totalAppearances / distinctWords if distinctWords else float('nan'))
end_time = time.time()
# Use the timestamp captured above (it was previously assigned but never read).
print("--- %s seconds ---" % (end_time - start_time))



204.13156518735002
--- 25.568430423736572 seconds ---


                                                                                

4. Calculate the frequency of combination of two words appears in the text file.  
The first step again applies the removePunctuation normalization.  
Then we use flatMap to map each line to a list of strings, each consisting of two consecutive words separated by a space, and map each string to a tuple with value 1.  
Apply reduceByKey to sum up the values.

In [8]:
def _bigrams(line):
    """Return every pair of consecutive words in `line`, each joined by one space.

    A line with fewer than two words yields an empty list.
    """
    words = line.split()   # split once; the original split the line twice
    # zip() stops at the shorter argument, so this pairs words[i] with words[i + 1].
    return [' '.join(pair) for pair in zip(words, words[1:])]


twoWordCountRDD = (
    sc
    # Alternative input that was previously toggled by hand:
    # 'data/1B.language.model.test.txt'
    .textFile('data/1B.language.model.txt', 8)   # 8 = minimum number of partitions
    .map(removePunctuation)                       # normalise each line
    .flatMap(_bigrams)                            # line -> 'word1 word2' strings
    .map(lambda bigram: (bigram, 1))
    .reduceByKey(lambda a, b: a + b)              # (bigram, number of appearances)
)

In [9]:
# Preview 100 (bigram, count) pairs rather than collecting the whole RDD,
# then report the elapsed wall-clock time.
start_time = time.time()
bigramSample = twoWordCountRDD.take(100)
print(bigramSample)
print("--- %s seconds ---" % (time.time() - start_time))

[Stage 9:>                                                          (0 + 1) / 1]

[('force was', 1850), ('fierce rearguard', 6), ('loser doesn', 7), ('cent blaming', 9), ('for interesting', 123), ('the superhero', 294), ('dubai world', 3113), ('left by', 3539), ('egg morning', 1), ('the sector', 8693), ('apartment will', 38), ('eruption of', 636), ('and within', 3020), ('the emea', 122), ('case the', 9009), ('major parties', 892), ('entertainment tonight', 533), ('streets of', 11202), ('with posters', 262), ('the target', 10397), ('of this', 136235), ('a car', 34886), ('conditions and', 6810), ('that doesn', 10807), ('match for', 2579), ('without sacrificing', 443), ('pro seasons', 32), ('his staff', 4678), ('polls and', 2176), ('bbc the', 1169), ('you unnecessarily', 1), ('opportunities to', 5086), ('missile defense', 5305), ('the brink', 7562), ('college though', 27), ('lawsuit was', 1149), ('months or', 2695), ('georgian and', 265), ('smog the', 16), ('a vacancy', 513), ('say the', 55571), ('her partner', 2401), ('exploration agency', 218), ('this phenomenon', 66

                                                                                

5. Print the two combined words that are the median in term of number of the appearances.  
First we map each tuple from the two-word count RDD above to a tuple with the number of appearances as the key and the string as the value.  
Use reduceByKey to concatenate all the strings that have the same number of appearances, then sort by key.  
We use zipWithIndex to pair each tuple with its index in the sorted result.  
Finally, we take the middle element of the resulting RDD.

In [10]:
start_time = time.time()
twoWordByAppearanceRDD = (
    twoWordCountRDD
    .map(lambda pair: (pair[1], pair[0]))     # (count, bigram)
    # Bigrams sharing a count are joined with a separator so they stay readable;
    # plain `a + b` glued them together with nothing in between.
    .reduceByKey(lambda a, b: a + ', ' + b)
    .sortBy(lambda pair: pair[0])             # ascending by number of appearances
    # Consumed by both count() and zipWithIndex() below; keep the sorted result
    # in memory so it does not have to be re-evaluated for each of them.
    .cache()
)
# Position of the middle entry among the distinct appearance counts
# (the upper of the two middle entries when their number is even).
median = twoWordByAppearanceRDD.count() // 2
print(
    twoWordByAppearanceRDD
    .zipWithIndex()                           # ((count, bigrams), position)
    .filter(lambda entry: entry[1] == median)
    .collect()                                # positions are unique: at most one match
)
print("--- %s seconds ---" % (time.time() - start_time))



[((7070, 'a model'), 6768)]
--- 6431.71009349823 seconds ---


                                                                                

The result is 'a model', which appears 7070 times.