In [1]:
import urllib.request as urllib
# Download the two small text files used in this notebook:
# HP.txt feeds the word-count section, GoT.txt the inverted-index section.
HP_URL = "https://kannu.csc.fi/index.php/s/2FKIuCYWfzdc2hn/download"
GOT_URL = "https://kannu.csc.fi/index.php/s/oU72rsNmnDqJn0d/download"

urllib.urlretrieve(HP_URL, "HP.txt")
urllib.urlretrieve(GOT_URL, "GoT.txt")

('GoT.txt', <http.client.HTTPMessage at 0x7f54089677b8>)

In [2]:
from pyspark import SparkContext
# Reuse the already-running SparkContext if there is one. Calling SparkContext()
# a second time in the same kernel (for example when this cell is re-run) raises
# an error, whereas getOrCreate() returns the active context or creates a new one.
sc = SparkContext.getOrCreate()

## Word Count

#### Load the RDD from the text file

In [4]:
# Read HP.txt into an RDD with one element per line of text
hp_path = 'HP.txt'
rdd_input = sc.textFile(hp_path)
rdd_input.collect()

['The place where things are hidden',
 'If you have to ask you will never know',
 'If you know you need only ask',
 'The hidden room of requirement']

In [5]:
# Split the text into words and flatten the results. Why?
# Splitting a line yields a list of words; flatMap merges those per-line lists
# into a single RDD of words instead of an RDD of lists.
def split_line(line):
    """Return the whitespace-separated tokens of one line."""
    return line.split()

words = rdd_input.flatMap(split_line)
words.collect()

['The',
 'place',
 'where',
 'things',
 'are',
 'hidden',
 'If',
 'you',
 'have',
 'to',
 'ask',
 'you',
 'will',
 'never',
 'know',
 'If',
 'you',
 'know',
 'you',
 'need',
 'only',
 'ask',
 'The',
 'hidden',
 'room',
 'of',
 'requirement']

#### Map each word to a (word, 1) pair so that the counts can be aggregated later

In [9]:
# Attach the number one to every word so that occurrences can be counted
def pair_with_one(word):
    """Return the (word, 1) pair for a single word."""
    return (word, 1)

words_map = words.map(pair_with_one)
words_map.collect()

[('The', 1),
 ('place', 1),
 ('where', 1),
 ('things', 1),
 ('are', 1),
 ('hidden', 1),
 ('If', 1),
 ('you', 1),
 ('have', 1),
 ('to', 1),
 ('ask', 1),
 ('you', 1),
 ('will', 1),
 ('never', 1),
 ('know', 1),
 ('If', 1),
 ('you', 1),
 ('know', 1),
 ('you', 1),
 ('need', 1),
 ('only', 1),
 ('ask', 1),
 ('The', 1),
 ('hidden', 1),
 ('room', 1),
 ('of', 1),
 ('requirement', 1)]

#### Apply reduceByKey to complete the counting

In [10]:
# Perform the aggregation: sum the ones of all pairs that share the same word
def add_counts(left, right):
    """Combine two partial counts of the same key."""
    return left + right

words_count = words_map.reduceByKey(add_counts)
words_count.collect()

[('where', 1),
 ('ask', 2),
 ('only', 1),
 ('hidden', 2),
 ('things', 1),
 ('room', 1),
 ('are', 1),
 ('never', 1),
 ('know', 2),
 ('of', 1),
 ('have', 1),
 ('The', 2),
 ('requirement', 1),
 ('you', 4),
 ('If', 2),
 ('place', 1),
 ('will', 1),
 ('to', 1),
 ('need', 1)]

#### Sort the words by their counts (most frequent first)

In [12]:
# Order the (word, count) pairs by count, highest first: negating the count
# turns sortBy's ascending order into a descending one.
# (Use x[1] itself as the key to sort in ascending order instead.)
def negated_count(pair):
    """Sort key: the count of a (word, count) pair with its sign flipped."""
    return -pair[1]

words_count_sorted_freq = words_count.sortBy(negated_count)
words_count_sorted_freq.collect()

[('you', 4),
 ('ask', 2),
 ('hidden', 2),
 ('know', 2),
 ('The', 2),
 ('If', 2),
 ('where', 1),
 ('only', 1),
 ('things', 1),
 ('room', 1),
 ('are', 1),
 ('never', 1),
 ('of', 1),
 ('have', 1),
 ('requirement', 1),
 ('place', 1),
 ('will', 1),
 ('to', 1),
 ('need', 1)]

## Inverted Index

In [19]:
# Input for the inverted index: the file is named GoT.txt.
# Data cleaning and preparation (carried out in the next cell): remove the full
# stop at the end of each sentence and convert everything to lowercase.

rdd_got = sc.textFile('GoT.txt')
rdd_got.collect()

['A Lannister always pays his debts.',
 'The night is dark and full of terrors.',
 'The next time you raise a hand to me will be the last time you have hands.',
 'When you play the Game of Thrones you win or you die.',
 'You know nothing Jon Snow.']

In [20]:
# Clean every sentence: remove the full stops, then lowercase the text
def normalise_line(line):
    """Drop every '.' from the line and return the rest in lowercase."""
    without_stops = line.replace('.', '')
    return without_stops.lower()

rdd_got = rdd_got.map(normalise_line)
rdd_got.collect()

['a lannister always pays his debts',
 'the night is dark and full of terrors',
 'the next time you raise a hand to me will be the last time you have hands',
 'when you play the game of thrones you win or you die',
 'you know nothing jon snow']

In [21]:
# Pair every sentence with its position in the RDD, giving (sentence, index)
# records; the position serves as the document index in the cells below.
# NOTE(review): rdd_got is rebound to the indexed RDD here, so re-running this
# cell on its own would apply zipWithIndex to the already-indexed pairs.
rdd_got = rdd_got.zipWithIndex()  # Generate the positional index
rdd_got.collect()

[('a lannister always pays his debts', 0),
 ('the night is dark and full of terrors', 1),
 ('the next time you raise a hand to me will be the last time you have hands',
  2),
 ('when you play the game of thrones you win or you die', 3),
 ('you know nothing jon snow', 4)]

In [22]:
# Split the words: (sentence, index) becomes (list of words, index)
def split_sentence(record):
    """Split the sentence of a (sentence, index) record on whitespace."""
    sentence, doc_index = record[0], record[1]
    return (sentence.split(), doc_index)

rdd_parsed = rdd_got.map(split_sentence)
print(rdd_parsed.collect())

[(['a', 'lannister', 'always', 'pays', 'his', 'debts'], 0), (['the', 'night', 'is', 'dark', 'and', 'full', 'of', 'terrors'], 1), (['the', 'next', 'time', 'you', 'raise', 'a', 'hand', 'to', 'me', 'will', 'be', 'the', 'last', 'time', 'you', 'have', 'hands'], 2), (['when', 'you', 'play', 'the', 'game', 'of', 'thrones', 'you', 'win', 'or', 'you', 'die'], 3), (['you', 'know', 'nothing', 'jon', 'snow'], 4)]


In [23]:
# Pair every word of a document with that document's index
def add_index_to_words(item):
    """Expand one (words, doc_index) record into (word, doc_index) pairs.

    item[0] is iterated as the sequence of words and item[1] is attached to
    each of them; the pairs are returned as a list in the original word order.
    """
    return [(word, item[1]) for word in item[0]]

In [24]:
# Alternative one-liner that uses nested lambdas instead of the named helper:
# rdd_parsed_indexed = rdd_parsed.flatMap(lambda x: map(lambda y: (y, x[1]), x[0]))
# flatMap merges the per-document lists returned by add_index_to_words into a
# single RDD of (word, doc_index) pairs.
rdd_parsed_indexed = rdd_parsed.flatMap(add_index_to_words)
rdd_parsed_indexed.collect()

[('a', 0),
 ('lannister', 0),
 ('always', 0),
 ('pays', 0),
 ('his', 0),
 ('debts', 0),
 ('the', 1),
 ('night', 1),
 ('is', 1),
 ('dark', 1),
 ('and', 1),
 ('full', 1),
 ('of', 1),
 ('terrors', 1),
 ('the', 2),
 ('next', 2),
 ('time', 2),
 ('you', 2),
 ('raise', 2),
 ('a', 2),
 ('hand', 2),
 ('to', 2),
 ('me', 2),
 ('will', 2),
 ('be', 2),
 ('the', 2),
 ('last', 2),
 ('time', 2),
 ('you', 2),
 ('have', 2),
 ('hands', 2),
 ('when', 3),
 ('you', 3),
 ('play', 3),
 ('the', 3),
 ('game', 3),
 ('of', 3),
 ('thrones', 3),
 ('you', 3),
 ('win', 3),
 ('or', 3),
 ('you', 3),
 ('die', 3),
 ('you', 4),
 ('know', 4),
 ('nothing', 4),
 ('jon', 4),
 ('snow', 4)]

In [25]:
# Same idea as in the word count: give every (word, doc_index) key an
# initial count of 1
def with_initial_count(key):
    """Return (key, 1) for one (word, doc_index) key."""
    return (key, 1)

rdd_doc_countmap = rdd_parsed_indexed.map(with_initial_count)
rdd_doc_countmap.collect()

[(('a', 0), 1),
 (('lannister', 0), 1),
 (('always', 0), 1),
 (('pays', 0), 1),
 (('his', 0), 1),
 (('debts', 0), 1),
 (('the', 1), 1),
 (('night', 1), 1),
 (('is', 1), 1),
 (('dark', 1), 1),
 (('and', 1), 1),
 (('full', 1), 1),
 (('of', 1), 1),
 (('terrors', 1), 1),
 (('the', 2), 1),
 (('next', 2), 1),
 (('time', 2), 1),
 (('you', 2), 1),
 (('raise', 2), 1),
 (('a', 2), 1),
 (('hand', 2), 1),
 (('to', 2), 1),
 (('me', 2), 1),
 (('will', 2), 1),
 (('be', 2), 1),
 (('the', 2), 1),
 (('last', 2), 1),
 (('time', 2), 1),
 (('you', 2), 1),
 (('have', 2), 1),
 (('hands', 2), 1),
 (('when', 3), 1),
 (('you', 3), 1),
 (('play', 3), 1),
 (('the', 3), 1),
 (('game', 3), 1),
 (('of', 3), 1),
 (('thrones', 3), 1),
 (('you', 3), 1),
 (('win', 3), 1),
 (('or', 3), 1),
 (('you', 3), 1),
 (('die', 3), 1),
 (('you', 4), 1),
 (('know', 4), 1),
 (('nothing', 4), 1),
 (('jon', 4), 1),
 (('snow', 4), 1)]

In [26]:
# The key is the whole (word, doc_index) tuple, so the ones are summed
# separately for every word within every document.
rdd_doc_wordcount = rdd_doc_countmap.reduceByKey(lambda a,b : a+b)
rdd_doc_wordcount.collect()

[(('last', 2), 1),
 (('always', 0), 1),
 (('and', 1), 1),
 (('game', 3), 1),
 (('lannister', 0), 1),
 (('his', 0), 1),
 (('thrones', 3), 1),
 (('the', 3), 1),
 (('know', 4), 1),
 (('play', 3), 1),
 (('you', 3), 3),
 (('the', 1), 1),
 (('or', 3), 1),
 (('jon', 4), 1),
 (('have', 2), 1),
 (('of', 3), 1),
 (('to', 2), 1),
 (('night', 1), 1),
 (('the', 2), 2),
 (('a', 0), 1),
 (('me', 2), 1),
 (('you', 2), 2),
 (('hands', 2), 1),
 (('is', 1), 1),
 (('time', 2), 2),
 (('full', 1), 1),
 (('raise', 2), 1),
 (('next', 2), 1),
 (('win', 3), 1),
 (('debts', 0), 1),
 (('a', 2), 1),
 (('of', 1), 1),
 (('be', 2), 1),
 (('terrors', 1), 1),
 (('snow', 4), 1),
 (('dark', 1), 1),
 (('when', 3), 1),
 (('die', 3), 1),
 (('you', 4), 1),
 (('hand', 2), 1),
 (('will', 2), 1),
 (('nothing', 4), 1),
 (('pays', 0), 1)]

In [None]:
# Re-key every ((word, doc_index), count) record by the word alone, keeping
# [doc_index, count] as the value (one posting of the inverted index).
rdd_wordcount_doc = rdd_doc_wordcount.map(lambda item: (item[0][0], [item[0][1], item[1]]))

# Collect all postings of a word. groupByKey gives no ordering guarantee for the
# values inside a group, so sort them to get a posting list that is ordered by
# document index and identical from run to run.
rdd_grouped = rdd_wordcount_doc.groupByKey().mapValues(sorted)

# Present the index alphabetically by word.
rdd_sorted = rdd_grouped.sortByKey()
rdd_sorted.collect()