## MLSD: Spark Intro
#### Maria Rafaela Alves Abrunhosa 107658
**13th March 2025**

This exercise is an introduction to Spark and has three subtasks:
- count the occurrences of different words in a text file (lusiadas.txt)
- find the most common biowords (ignoring words with less than 3 letters)
- calculate the number of unique words that begin with each letter of the alphabet (also ignoring words with less than 3 letters).

In [6]:
# %pip install pyspark

# imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd

#### Exercise 2
Count the occurrences of different words in a text file (lusiadas.txt).

In [14]:
# create a spark session
spark = SparkSession.builder.appName("Count Distinct Words").getOrCreate()

# create a dataframe using lusiadas.txt
lusiadas = spark.read.text("lusiadas.txt").rdd.map(lambda row: row[0])

# clean text, split in words and convert to lower case
lusiadasWords = lusiadas.flatMap(lambda line: line.lower().split()) # flatMap necessary to split the spark lines in words

# remove punctionation
lusiadasWords = lusiadasWords.map(lambda word: ''.join(filter(str.isalpha, word))) # verify if the characters are alphabetic

# apply map reduce with spark
lusiadasCount = lusiadasWords.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b) # create key-value pairs and sum the same words

# convert to df
lusiadasCountdf = lusiadasCount.toDF(["Word", "Count"])
lusiadasCountdf.head(20)

[Row(Word='luís', Count=2),
 Row(Word='vaz', Count=1),
 Row(Word='de', Count=1423),
 Row(Word='camões', Count=1),
 Row(Word='os', Count=716),
 Row(Word='lusíadas', Count=1),
 Row(Word='canto', Count=24),
 Row(Word='primeiro', Count=34),
 Row(Word='', Count=1102),
 Row(Word='as', Count=504),
 Row(Word='armas', Count=56),
 Row(Word='e', Count=2198),
 Row(Word='barões', Count=9),
 Row(Word='assinalados', Count=1),
 Row(Word='que', Count=2741),
 Row(Word='da', Count=487),
 Row(Word='ocidental', Count=1),
 Row(Word='praia', Count=15),
 Row(Word='lusitana', Count=18),
 Row(Word='por', Count=538)]

#### Exercise 3
Find the most common biowords (ignoring words with less than 3 letters).

In [19]:
# create a spark session
spark2 = SparkSession.builder.appName("Most Common biwords").getOrCreate()

# use lusiadasWords already cleaned from exercise 2
lusiadasWords2 = lusiadas.flatMap(lambda line: line.lower().split()) # flatMap necessary to split the spark lines in words
lusiadasWords2 = lusiadasWords2.map(lambda word: ''.join(filter(str.isalpha, word))) # verify if the characters are alphabetic
lusiadasWords2 = lusiadasWords2.filter(lambda word: len(word) >= 3) # ignore words with less than 3 letters

# get words with index
wordsIndex = lusiadasWords2.zipWithIndex().map(lambda wI: (wI[1], wI[0])) # create tuples like (index, word)
wordsIndex2 = wordsIndex.map(lambda newtup: (newtup[0] - 1, newtup[1])) # create a new tuples, modify the original rdd by reducing the index by 1
biwordsJoin = wordsIndex.join(wordsIndex2) # compare the two rdds and join the elements with the same key, the indexes, the current word with the previous word

biwords = biwordsJoin.map(lambda biword: (biword[1], 1)) # create key-value pairs with biwords
biwords = biwords.reduceByKey(lambda a, b: a + b) # sum the same biwords

# convert to df
lusiadasCountdf2 = biwords.toDF(["Biword", "Count"])
lusiadasCountdf2.head(20)

[Row(Biword=Row(_1='camões', _2='lusíadas'), Count=1),
 Row(Biword=Row(_1='lusitana', _2='por'), Count=2),
 Row(Biword=Row(_1='antes', _2='navegados'), Count=1),
 Row(Biword=Row(_1='passaram', _2='ainda'), Count=1),
 Row(Biword=Row(_1='perigos', _2='guerras'), Count=1),
 Row(Biword=Row(_1='esforçados', _2='mais'), Count=1),
 Row(Biword=Row(_1='que', _2='prometia'), Count=1),
 Row(Biword=Row(_1='sublimaram', _2='também'), Count=1),
 Row(Biword=Row(_1='memórias', _2='gloriosas'), Count=1),
 Row(Biword=Row(_1='que', _2='foram'), Count=3),
 Row(Biword=Row(_1='aqueles', _2='que'), Count=10),
 Row(Biword=Row(_1='valerosas', _2='vão'), Count=1),
 Row(Biword=Row(_1='libertando', _2='cantando'), Count=1),
 Row(Biword=Row(_1='tanto', _2='ajudar'), Count=1),
 Row(Biword=Row(_1='engenho', _2='arte'), Count=1),
 Row(Biword=Row(_1='navegações', _2='grandes'), Count=1),
 Row(Biword=Row(_1='que', _2='fizeram'), Count=3),
 Row(Biword=Row(_1='calese', _2='alexandro'), Count=1),
 Row(Biword=Row(_1='que',

#### Exercise 4
Calculate the number of unique words that begin with each letter of the alphabet (also ignoring words with less than 3 letters).

In [20]:
# create a spark session
spark3 = SparkSession.builder.appName("Unique Words - each letter of the alphabet").getOrCreate()

# use lusiadasWords already cleaned from exercise 2
lusiadasWords3 = lusiadas.flatMap(lambda line: line.lower().split()) # flatMap necessary to split the spark lines in words
lusiadasWords3 = lusiadasWords3.map(lambda word: ''.join(filter(str.isalpha, word))) # verify if the characters are alphabetic
lusiadasWords3 = lusiadasWords3.filter(lambda word: len(word) >= 3) # ignore words with less than 3 letters

# get first letter and map unique words
uniqueWordsLetter = lusiadasWords3.distinct().map(lambda word: (word[0], 1)) # create key-value pairs with unique words per letter
uniqueWordsLetterCount = uniqueWordsLetter.reduceByKey(lambda a, b: a + b) # count unique words per letter

# convert to df
lusiadasCountdf3 = uniqueWordsLetterCount.toDF(["Unique Words Letter Alphabet", "Count"])
lusiadasCountdf3.head(20)

25/03/14 12:25:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Row(Unique Word Letter Alphabet='l', Count=285),
 Row(Unique Word Letter Alphabet='v', Count=398),
 Row(Unique Word Letter Alphabet='c', Count=999),
 Row(Unique Word Letter Alphabet='p', Count=789),
 Row(Unique Word Letter Alphabet='a', Count=1072),
 Row(Unique Word Letter Alphabet='b', Count=227),
 Row(Unique Word Letter Alphabet='q', Count=93),
 Row(Unique Word Letter Alphabet='o', Count=199),
 Row(Unique Word Letter Alphabet='m', Count=516),
 Row(Unique Word Letter Alphabet='n', Count=223),
 Row(Unique Word Letter Alphabet='t', Count=477),
 Row(Unique Word Letter Alphabet='g', Count=232),
 Row(Unique Word Letter Alphabet='e', Count=679),
 Row(Unique Word Letter Alphabet='f', Count=454),
 Row(Unique Word Letter Alphabet='h', Count=145),
 Row(Unique Word Letter Alphabet='r', Count=414),
 Row(Unique Word Letter Alphabet='s', Count=599),
 Row(Unique Word Letter Alphabet='d', Count=700),
 Row(Unique Word Letter Alphabet='i', Count=370),
 Row(Unique Word Letter Alphabet='á', Count=22)]