In [121]:
from pyspark.sql import SparkSession
from operator import add

# (8 cores, 16gb per machine) x 5 = 40 cores

In [122]:
# Build (or reuse) a session on the standalone master. Dynamic allocation is
# enabled together with the shuffle service, idle executors time out after
# 30s, and each executor is given 8 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("hadoop_punnam_example")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 8)
    .getOrCreate()
)

# Handle to the older RDD-based API, used by every cell below
spark_context = spark_session.sparkContext

In [123]:
# Load the English side of the Europarl sv-en corpus from HDFS as an RDD of text lines
lines_en = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")

In [124]:
# Count the number of lines in the English transcript
lines_en.count()

1862234

In [125]:
# Load the Swedish side of the Europarl sv-en corpus from HDFS as an RDD of text lines
lines_sv = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")

In [126]:
# Count the number of lines in the Swedish transcript (should match the English count)
lines_sv.count()

1862234

In [127]:
# Number of partitions the English transcript RDD was split into
lines_en.getNumPartitions()

2

In [128]:
# Number of partitions the Swedish transcript RDD was split into
lines_sv.getNumPartitions()

3

In [129]:
# Peek at the first two raw English lines
lines_en.take(2)

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.']

In [130]:
# Peek at the first two raw Swedish lines
lines_sv.take(2)

['Återupptagande av sessionen',
 'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.']

In [131]:
# Pre-processing helper: lowercase a line and tokenize it on spaces
def split_lower(line: str) -> list:
    """Lowercase ``line`` and split it into space-separated tokens.

    Empty tokens are dropped. A plain ``split(' ')`` returns ``['']`` for an
    empty line and extra ``''`` entries for repeated, leading or trailing
    spaces, and those would otherwise be counted as words downstream.

    Args:
        line: One raw line of text.

    Returns:
        The non-empty lowercase tokens of ``line`` in their original order;
        an empty list when the line contains no tokens.
    """
    # Still split on a single space (not arbitrary whitespace) so that every
    # non-empty token is exactly what the plain split would have produced.
    return [token for token in line.lower().split(' ') if token]

In [132]:
# Tokenized English transcript: one list of lowercase tokens per input line
lines_lower_en_count = lines_en.map(split_lower)

In [133]:
# Line count after tokenizing; map() is one-to-one, so this should equal the raw line count
lines_lower_en_count.count()

1862234

In [134]:
# Inspect the first two tokenized English lines
lines_lower_en_count.take(2)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [135]:
# Tokenized Swedish transcript: one list of lowercase tokens per input line
lines_lower_sv_count = lines_sv.map(split_lower)

In [137]:
# Line count after tokenizing; map() is one-to-one, so this should equal the raw line count
lines_lower_sv_count.count()

1862234

In [138]:
# Inspect the first two tokenized Swedish lines
lines_lower_sv_count.take(2)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.']]

In [141]:
# Flatten the English transcript into a single RDD of lowercase tokens
# (flatMap concatenates the per-line token lists instead of nesting them)
lines_lower_en = lines_en.flatMap(split_lower)

In [142]:
# Inspect the first 10 English tokens of the flattened RDD
lines_lower_en.take(10)

['resumption',
 'of',
 'the',
 'session',
 'i',
 'declare',
 'resumed',
 'the',
 'session',
 'of']

In [143]:
# Flatten the Swedish transcript into a single RDD of lowercase tokens
# (flatMap concatenates the per-line token lists instead of nesting them)
lines_lower_sv = lines_sv.flatMap(split_lower)

In [144]:
# Inspect the first 10 Swedish tokens of the flattened RDD
lines_lower_sv.take(10)

['återupptagande',
 'av',
 'sessionen',
 'jag',
 'förklarar',
 'europaparlamentets',
 'session',
 'återupptagen',
 'efter',
 'avbrottet']

In [146]:
# The 10 most frequent English tokens: pair every token with 1, sum the ones
# per token, then take the 10 entries with the largest totals (the negated
# key makes takeOrdered sort by descending count).
word_count_en = (
    lines_lower_en
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda token_total: -token_total[1])
)

In [147]:
# Display the 10 most frequent English tokens with their counts
word_count_en

[('the', 3498375),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288401),
 ('in', 1085993),
 ('that', 797516),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522849)]

In [148]:
# The 10 most frequent Swedish tokens: pair every token with 1, sum the ones
# per token, then take the 10 entries with the largest totals (the negated
# key makes takeOrdered sort by descending count).
word_count_sv = (
    lines_lower_sv
    .map(lambda token: (token, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda token_total: -token_total[1])
)

In [149]:
# Display the 10 most frequent Swedish tokens with their counts
word_count_sv

[('att', 1706293),
 ('och', 1344830),
 ('i', 1050774),
 ('det', 924866),
 ('som', 913276),
 ('för', 908680),
 ('av', 738068),
 ('är', 694381),
 ('en', 620310),
 ('vi', 539797)]

In [150]:
# Key every English sentence by its 0-based line number. zipWithIndex yields
# (sentence, index), so the pair is swapped to get (index, sentence).
zipped_lines_en = (
    lines_en
    .zipWithIndex()
    .map(lambda sentence_idx: (sentence_idx[1], sentence_idx[0]))
)

In [151]:
# Key every Swedish sentence by its 0-based line number. zipWithIndex yields
# (sentence, index), so the pair is swapped to get (index, sentence).
zipped_lines_sv = (
    lines_sv
    .zipWithIndex()
    .map(lambda sentence_idx: (sentence_idx[1], sentence_idx[0]))
)

In [152]:
# Inspect the first five (index, sentence) records of the English transcript
zipped_lines_en.take(5)

[(0, 'Resumption of the session'),
 (1,
  'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'),
 (2,
  "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."),
 (3,
  'You have requested a debate on this subject in the course of the next few days, during this part-session.'),
 (4,
  "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.")]

In [153]:
# Inspect the first five (index, sentence) records of the Swedish transcript
zipped_lines_sv.take(5)

[(0, 'Återupptagande av sessionen'),
 (1,
  'Jag förklarar Europaparlamentets session återupptagen efter avbrottet den 17 december. Jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester.'),
 (2,
  'Som ni kunnat konstatera ägde "den stora år 2000-buggen" aldrig rum. Däremot har invånarna i ett antal av våra medlemsländer drabbats av naturkatastrofer som verkligen varit förskräckliga.'),
 (3,
  'Ni har begärt en debatt i ämnet under sammanträdesperiodens kommande dagar.'),
 (4,
  'Till dess vill jag att vi, som ett antal kolleger begärt, håller en tyst minut för offren för bl.a. stormarna i de länder i Europeiska unionen som drabbats.')]

In [86]:
# Join the two indexed RDDs on the line number, producing
# (index, (english_sentence, swedish_sentence)) records
paired_lines = zipped_lines_en.join(zipped_lines_sv)

In [154]:
# Inspect two joined pairs (the join does not return records in index order)
paired_lines.take(2)

[(0, ('Resumption of the session', 'Återupptagande av sessionen')),
 (5,
  ("Please rise, then, for this minute' s silence.",
   'Jag ber er resa er för en tyst minut.'))]

In [155]:
# Keep only the pairs in which both the English and the Swedish sentence
# contain something other than whitespace.
paired_lines_1 = paired_lines.filter(
    lambda record: all(len(sentence.strip()) > 0 for sentence in record[1])
)

In [156]:
# Keep only the sentence pairs where both sides have fewer than 10
# space-separated words and the same number of words.
def _short_and_aligned(record):
    """Return True when both sentences of ``record`` are short and equally long.

    ``record`` is an ``(index, (english_sentence, swedish_sentence))`` tuple.
    Length is measured in space-separated words.
    """
    english_len = len(record[1][0].split(' '))
    swedish_len = len(record[1][1].split(' '))
    return swedish_len < 10 and english_len < 10 and swedish_len == english_len


paired_lines_2 = paired_lines_1.filter(_short_and_aligned)

In [157]:
# Align the words position by position: every sentence pair becomes a list of
# (swedish_word, english_word) tuples, flattened into one RDD of word pairs.
# Words keep their original casing and punctuation here.
paired_lines_3 = paired_lines_2.flatMap(
    lambda record: list(zip(record[1][1].split(' '), record[1][0].split(' ')))
)

In [158]:
# Count how often each (swedish_word, english_word) pair occurs and keep the
# 10 most frequent ones (the negated key makes takeOrdered sort descending).
paired_lines_4 = (
    paired_lines_3
    .map(lambda word_pair: (word_pair, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda pair_total: -pair_total[1])
)

In [159]:
# The 10 most frequently occurring (Swedish, English) word pairs with their counts
paired_lines_4

[(('är', 'is'), 9766),
 (('Jag', 'I'), 4562),
 (('Vi', 'We'), 4183),
 (('avslutad.', 'closed.'), 2964),
 (('en', 'a'), 2734),
 (('och', 'and'), 2695),
 (('inte', 'not'), 2601),
 (('(Applåder)', '(Applause)'), 2543),
 (('Det', 'That'), 2359),
 (('Det', 'It'), 2314)]

In [160]:
# Shut down the SparkContext now that the analysis is finished
spark_context.stop()