In [2]:
from pyspark.sql import SparkSession
from operator import add

spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.250:7077") \
        .appName("AlmaLundbergSparkapplicationA3_A")\
        .config("spark.dynamicAllocation.enabled", False)\
        .config("spark.cores.max", 4)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# RDD API 
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 22:14:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Loading the text file from HDFS 
lines_en = spark_context.textFile("hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en")
num_lines_en = lines_en.count()
print(f"Total lines in English file: {num_lines_en}") 

lines_sv = spark_context.textFile("hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.sv")
num_lines_sv = lines_sv.count()
print(f"Total lines in Swedish file: {num_lines_sv}")

                                                                                

Total lines in English file: 1862234




Total lines in Swedish file: 1862234


                                                                                

In [3]:
# Counting nmr of partitions in the English file
num_partitions_en = lines_en.getNumPartitions()
print(f"Number of partitions in English file: {num_partitions_en}")

# Counting nmr of partitions in the Swedish file
num_partitions_sv = lines_sv.getNumPartitions()
print(f"Number of partitions in Swedish file: {num_partitions_sv}")

Number of partitions in English file: 2
Number of partitions in Swedish file: 3


In [4]:
def preprocess_text(line):
    return line.lower().split(' ')

In [5]:
processed_lines_en = lines_en.map(preprocess_text)
processed_lines_sv = lines_sv.map(preprocess_text)

In [6]:
# Verifying that the line counts still match after the pre-processing:

processed_count_en = processed_lines_en.count()
processed_count_sv = processed_lines_sv.count()
print(f"Processed line count in English: {processed_count_en}")
print(f"Processed line count in Swedish: {processed_count_sv}")



Processed line count in English: 1862234
Processed line count in Swedish: 1862234


                                                                                

In [82]:
# The line count still match after pre-processing

In [7]:
# Check 10 most frequent words in English and Swedish

words_en = processed_lines_en.flatMap(lambda line: line)
words_sv = processed_lines_sv.flatMap(lambda line: line)

word_counts_en = words_en.map(lambda word: (word, 1)).reduceByKey(add)
word_counts_sv = words_sv.map(lambda word: (word, 1)).reduceByKey(add)

top_10_words_en = word_counts_en.takeOrdered(10, key=lambda x: -x[1])
top_10_words_sv = word_counts_sv.takeOrdered(10, key=lambda x: -x[1])

print("\nTop 10 words in English:")
for word, count in top_10_words_en:
    print(f'{word}: {count}')
    
print("\nTop 10 words in Swedish:")
for word, count in top_10_words_sv:
    print(f'{word}: {count}')




Top 10 words in English:
the: 3498375
of: 1659758
to: 1539760
and: 1288401
in: 1085993
that: 797516
a: 773522
is: 758050
for: 534242
we: 522849

Top 10 words in Swedish:
att: 1706293
och: 1344830
i: 1050774
det: 924866
som: 913276
för: 908680
av: 738068
är: 694381
en: 620310
vi: 539797


                                                                                

In [84]:
# The results are reasonable, as the most frequent words are common words used to build up sentences (konjunctions, prepositions etc)
# In the Swedish translation for example, the words also match most of the top 10 most frequent words in the Swedish language like i, och, att, det, som, en, är, av, för 

In [8]:
# Question A4

# Assign a unique index to each line in both RDDs and swap so that the line number is the key
sv_swapped = processed_lines_sv.zipWithIndex().map(lambda x: (x[1], x[0]))
en_swapped = processed_lines_en.zipWithIndex().map(lambda x: (x[1], x[0]))

                                                                                

In [9]:
# See how it looks, so everything was done correctly
sv_swapped.take(10)

[(0, ['återupptagande', 'av', 'sessionen']),
 (1,
  ['jag',
   'förklarar',
   'europaparlamentets',
   'session',
   'återupptagen',
   'efter',
   'avbrottet',
   'den',
   '17',
   'december.',
   'jag',
   'vill',
   'på',
   'nytt',
   'önska',
   'er',
   'ett',
   'gott',
   'nytt',
   'år',
   'och',
   'jag',
   'hoppas',
   'att',
   'ni',
   'haft',
   'en',
   'trevlig',
   'semester.']),
 (2,
  ['som',
   'ni',
   'kunnat',
   'konstatera',
   'ägde',
   '"den',
   'stora',
   'år',
   '2000-buggen"',
   'aldrig',
   'rum.',
   'däremot',
   'har',
   'invånarna',
   'i',
   'ett',
   'antal',
   'av',
   'våra',
   'medlemsländer',
   'drabbats',
   'av',
   'naturkatastrofer',
   'som',
   'verkligen',
   'varit',
   'förskräckliga.']),
 (3,
  ['ni',
   'har',
   'begärt',
   'en',
   'debatt',
   'i',
   'ämnet',
   'under',
   'sammanträdesperiodens',
   'kommande',
   'dagar.']),
 (4,
  ['till',
   'dess',
   'vill',
   'jag',
   'att',
   'vi,',
   'som',
   'ett',
 

In [10]:
# See how it looks, so everything was done correctly
en_swapped.take(10)

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.']),
 (2,
  ['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.']),
 (3,
  ['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course

In [11]:
# Join the two RDDs together on the line number key
sv_en_joined_rdd = sv_swapped.join(en_swapped)

In [12]:
# See how it looks
sv_en_joined_rdd.take(10) 

                                                                                

[(869580,
  (['europeiska',
    'unionen',
    'måste',
    'organisera',
    'återsändandet',
    'av',
    'illegala',
    'invandrare',
    'på',
    'ett',
    'sätt',
    'som',
    'upprätthåller',
    'strängaste',
    'respekt',
    'för',
    'mänskliga',
    'rättigheter',
    'och',
    'mänsklig',
    'värdighet.'],
   ['the',
    'european',
    'union',
    'must',
    'organise',
    'the',
    'return',
    'of',
    'illegal',
    'immigrants',
    'in',
    'a',
    'way',
    'that',
    'maintains',
    'the',
    'strictest',
    'respect',
    'for',
    'human',
    'rights',
    'and',
    'human',
    'dignity.'])),
 (870940,
  (['vår',
    'framtid',
    'är',
    'beroende',
    'av',
    'vår',
    'förmåga',
    'att',
    'bekämpa',
    'terrorismen.'],
   ['our',
    'future',
    'depends',
    'on',
    'our',
    'ability',
    'to',
    'defeat',
    'terrorism.'])),
 (883860,
  (['i',
    'och',
    'med',
    'dagens',
    'debatt',
    'och',
    '

In [13]:
# Filter function for the joined RDD according to lab requirements
def filter(lines):
    #Filter to exclude empty/missing lines
    filtered_rdd = lines.filter(lambda x: len(x[1][0]) > 1 and len(x[1][1]) > 1)

    #Filter sentences with a small number (the threshold) of words
    threshold = 5
    filtered_rdd = filtered_rdd.filter(lambda x: len(x[1][0]) < threshold and len(x[1][1]) < threshold)

    # Filter pairs with the same number of words
    filtered_rdd = filtered_rdd.filter(lambda x: len(x[1][0]) == len(x[1][1]))
    return filtered_rdd

In [14]:
# Call on the filter function to filter the joined RDD
filtered_rdd = filter(sv_en_joined_rdd)

In [15]:
# See how it looks, if it was filtered properly
filtered_rdd.take(10)

                                                                                

[(142910,
  (['jag', 'förklarar', 'debatten', 'avslutad.'],
   ['the', 'debate', 'is', 'closed.'])),
 (37045,
  (['de', 'har', 'ansvaret', 'tillsammans.'],
   ['they', 'are', 'jointly', 'responsible.'])),
 (129940,
  (['jag', 'förklarar', 'debatten', 'avslutad.'],
   ['that', 'concludes', 'the', 'debate.'])),
 (225205, (['krisen', 'på', 'balkan.'], ['the', 'balkan', 'crisis.'])),
 (369620, (['tack,', 'herr', 'minister.'], ['thank', 'you,', 'minister.'])),
 (651850,
  (['men', 'europa', 'har', 'svarat.'],
   ['but', 'europe', 'has', 'responded.'])),
 (684985,
  (['det', 'är', 'deras', 'bidrag.'],
   ['that', 'is', 'their', 'contribution.'])),
 (708435,
  (['den', 'är', 'onödigt', 'komplicerad.'],
   ['it', 'is', 'unnecessarily', 'complicated.'])),
 (708975,
  (['varför', 'hålla', 'det', 'hemligt?'], ['why', 'keep', 'it', 'secret?'])),
 (749190, (['\xa0\xa0', '.'], ['\xa0\xa0', '.']))]

In [16]:
# Pair words in the order they appear in each sentence pair
word_pairs_rdd = filtered_rdd.map(lambda x: zip(x[1][0], x[1][1]))

In [17]:
# See how it looks
print(list(word_pairs_rdd.take(1)[0])) 

[Stage 17:>                                                         (0 + 1) / 1]

[('nej,', 'no,'), ('nej!)', 'no!)')]


                                                                                

In [18]:
# Count the frequency of each word-translation pair
words_rdd = word_pairs_rdd.flatMap(lambda line: line)
word_counts_rdd = words_rdd.map(lambda word: (word, 1))
word_freq_rdd = word_counts_rdd.reduceByKey(lambda a, b: a + b)

# Print 10 of the most frequently occurring pairs of words
top_pairs = word_freq_rdd.takeOrdered(10, key=lambda x: -x[1])
for (word, translation), count in top_pairs:
    print(f'{word} - {translation}: {count}')




avslutad. - closed.: 2534
är - is: 2380
jag - the: 1324
debatten - is: 1324
förklarar - debate: 1317
debatten - the: 1225
härmed - is: 1215
är - debate: 1187
(artikel - (rule: 893
det - that: 852


                                                                                

In [None]:
# The translations are not parfect, and the word pairs doesnt match in translation. However, the first two makes sens, and the last two. 
# The other ones are not translations of each other, but it looks like one of the most frequent short sentences perhaps could be 
# "härmed förklarar jag debatten avslutad", which is a completely different word structure on english - why the word pairs are a bit "scrambled"
# That is my conclusion at least.