# Section A: Spark RDD API

In [1]:
from pyspark.sql import SparkSession
from operator import add
import string
import re

In [2]:
# Settings applied to the session builder below. Dynamic allocation is turned on
# with shuffle tracking; the external shuffle service option
# (spark.shuffle.service.enabled) is intentionally not set here.
spark_conf = {
    "spark.dynamicAllocation.enabled": True,
    "spark.dynamicAllocation.shuffleTracking.enabled": True,
    "spark.dynamicAllocation.executorIdleTimeout": "30s",
    "spark.cores.max": 4,
}

builder = SparkSession.builder.master("spark://localhost:7077").appName("MJEnrico_A3_A")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)
spark_session = builder.getOrCreate()

# The SparkContext is the handle used for every RDD operation in this notebook.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/06 08:26:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Distribute a small local list as an RDD (presumably a quick connectivity check).
rdd = spark_context.parallelize([1,2,3,4])

In [4]:
# Bring the RDD's elements back to the driver; the cell displays them as a list.
rdd.collect()

                                                                                

[1, 2, 3, 4]

### Question A.1

In [3]:
# Read the German and English sides of the Europarl de-en corpus from HDFS,
# one RDD element per line of text.
lines_de = spark_context.textFile("hdfs://192.168.2.70:9000/europarl/*de-en.de")
lines_en = spark_context.textFile("hdfs://192.168.2.70:9000/europarl/*de-en.en")

# count() is an action: each call runs a job over the corresponding corpus.
n_lines_de = lines_de.count()
n_lines_en = lines_en.count()

n_parts_de = lines_de.getNumPartitions()
n_parts_en = lines_en.getNumPartitions()

print(
    f"Number of german lines  : {n_lines_de}",
    f"Number of english lines : {n_lines_en}",
    f"Num partition german  : {n_parts_de}",
    f"Num partition english : {n_parts_en}",
    sep="\n",
)



Number of german lines  : 1920209
Number of english lines : 1920209
Num partition german  : 3
Num partition english : 3


                                                                                

### Question A.2

In [4]:
def text_preprocess(sentence):
    """Lowercase a sentence and split it into word tokens.

    Tokens are separated on runs of whitespace rather than on a single " ",
    so repeated, leading or trailing spaces do not produce empty-string
    tokens, and an empty line yields an empty list instead of [''].

    Args:
        sentence: One line of text (str).

    Returns:
        A list of lowercase tokens (list of str); punctuation is left attached.
    """
    return sentence.lower().split()

In [5]:
# Show the first raw English lines, then the same lines after tokenization.
raw_preview = lines_en.take(3)
print(raw_preview)

en_token = lines_en.map(text_preprocess)
token_preview = en_token.take(3)
print(token_preview)
# Optional check of the number of lines (left disabled):
# print(en_token.map(lambda x: 1).reduce(add))

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."]


[Stage 3:>                                                          (0 + 1) / 1]

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.']]


                                                                                

In [6]:
# Count the tokenized lines so the figure can be compared with the raw line count.
n_token_lines = en_token.count()
print(f"Num of lines after tokenization: {n_token_lines}")



Num of lines after tokenization: 1920209


                                                                                

### Question A.3

In [7]:
# Flatten the per-line token lists into a single RDD of words.
en_words = en_token.flatMap(lambda tokens: tokens)

# Classic word count: emit (word, 1), sum per word, then order by frequency.
en_word_totals = en_words.map(lambda word: (word, 1)).reduceByKey(add)
en_word_count = en_word_totals.sortBy(lambda pair: pair[1], ascending=False)

# The ten most frequent English words.
en_word_count.take(10)

                                                                                

[('the', 3663118),
 ('of', 1736975),
 ('to', 1611788),
 ('and', 1345072),
 ('in', 1134025),
 ('that', 835871),
 ('a', 810540),
 ('is', 792564),
 ('for', 557349),
 ('we', 551243)]

### Question A.4

In [8]:
def sentence_preprocessing(tup):
    """Normalize one (sentence, index) pair into (index, cleaned_sentence).

    Cleaning replaces every run of ASCII punctuation with a space, collapses
    every run of whitespace into a single space, lowercases the text and
    strips surrounding spaces.

    Args:
        tup: A pair whose first item is the sentence text and whose second
            item is its identifier (as produced by ``zipWithIndex``).

    Returns:
        A ``(ID, text)`` tuple with the identifier first, so the result can be
        used as a key-value record.
    """
    ID = tup[1]
    # re.escape keeps characters such as "-", "]" and "\" literal inside the
    # character class instead of relying on how they happen to be ordered in
    # string.punctuation.
    text = re.sub(f"[{re.escape(string.punctuation)}]+", " ", tup[0])
    text = re.sub(f"[{string.whitespace}]+", " ", text)
    text = text.lower().strip()
    return (ID, text)

def pair_filter(tup):
    """Tell whether a joined sentence pair is short and word-aligned.

    Args:
        tup: A ``(key, (sentence_a, sentence_b))`` record; only the pair of
            sentences in ``tup[1]`` is inspected.

    Returns:
        True when both sentences have the same number of whitespace-separated
        words and that number is between 1 and 6 inclusive; False otherwise.
    """
    first, second = tup[1]
    n_first = len(first.split())
    n_second = len(second.split())
    # Equal lengths mean the 1..6 bound only needs to be checked once.
    return 0 < n_first < 7 and n_first == n_second

def pair_mapper(tup):
    """Pair up the words of two sentences position by position.

    Args:
        tup: A ``(key, (sentence_a, sentence_b))`` record; only ``tup[1]`` is used.

    Returns:
        An iterator of ``(word_a, word_b)`` tuples, one per word position,
        stopping at the end of the shorter sentence.
    """
    left_sentence, right_sentence = tup[1]
    return zip(left_sentence.split(), right_sentence.split())

In [9]:
def _indexed_sentences(lines):
    """Attach a line index to each sentence and clean it into (index, text)."""
    return lines.zipWithIndex().map(sentence_preprocessing)


english_rdd = _indexed_sentences(lines_en)
german_rdd = _indexed_sentences(lines_de)

# (index, (english_sentence, german_sentence)) for every index present in both RDDs.
sentence_pairs = english_rdd.join(german_rdd)
# Keep only the pairs accepted by pair_filter (short, equal word counts).
short_pairs = sentence_pairs.filter(pair_filter)
# Position-wise (english_word, german_word) tuples from each remaining pair.
word_pairs = short_pairs.flatMap(pair_mapper)
# Count each word pair and order the result by frequency, most common first.
pair_counts = word_pairs.map(lambda pair: (pair, 1)).reduceByKey(add)
join_rdd = pair_counts.sortBy(lambda item: item[1], ascending=False)

                                                                                

In [10]:
# Show the 50 most frequent (english_word, german_word) pairs; join_rdd is sorted by count, descending.
join_rdd.take(50)

[(('is', 'ist'), 9132),
 (('the', 'die'), 6312),
 (('debate', 'aussprache'), 4562),
 (('closed', 'geschlossen'), 4362),
 (('applause', 'beifall'), 3589),
 (('that', 'das'), 2161),
 (('we', 'wir'), 2041),
 (('the', 'der'), 1828),
 (('i', 'ich'), 1618),
 (('you', 'dank'), 1480),
 (('thank', 'vielen'), 1468),
 (('this', 'das'), 1281),
 (('vote', 'abstimmung'), 1266),
 (('not', 'nicht'), 1141),
 (('written', 'schriftliche'), 1048),
 (('mr', 'herr'), 1015),
 (('rule', 'artikel'), 874),
 (('are', 'sind'), 843),
 (('statements', 'erklärungen'), 820),
 (('a', 'ein'), 721),
 (('and', 'und'), 696),
 (('before', 'vor'), 691),
 (('a', 'eine'), 665),
 (('minutes', 'protokoll'), 641),
 (('see', 'siehe'), 632),
 (('the', 'das'), 576),
 (('what', 'was'), 558),
 (('this', 'dies'), 557),
 (('why', 'warum'), 551),
 (('report', 'bericht'), 542),
 (('it', 'es'), 528),
 (('a5', 'a5'), 498),
 (('have', 'haben'), 464),
 (('142', '142'), 453),
 (('you', 'sie'), 450),
 (('it', 'das'), 447),
 (('for', 'für'), 44

### My comment: 

The approach performs reasonably well for such a simple translation method. Only a few word pairs are wrong, such as "you" being paired with "dank" (from "thank you" / "vielen dank", where the word order differs between the two languages). However, the downside of this technique is its inability to translate less frequently mentioned words.

In [11]:
# Shut down the SparkContext now that the notebook's work is done.
spark_context.stop()