DATA 603 - Platforms for Big Data Processing  
Assignment using MapReduce  
Student: Abdul Faisal Hussain Mohammed  
DOB: 06/24/2003



In [1]:
pip install pyspellchecker




In [2]:
import re
from multiprocessing import Pool
from collections import Counter
from spellchecker import SpellChecker

In [3]:
def mapper_word_count(line):
    words = re.findall(r'\b\w+\b', line.lower())
    return [(word, 1) for word in words]

def reducer_word_count(mapped_data):
    counter = Counter()
    for sublist in mapped_data:
        for word, count in sublist:
            counter[word] += count
    return counter


def mapper_non_english(line, english_words):
    words = re.findall(r'\b\w+\b', line.lower())
    return [(word, 1) for word in words if word not in english_words]

def reducer_non_english(mapped_data):
    counter = Counter()
    for sublist in mapped_data:
        for word, count in sublist:
            counter[word] += count
    return counter

In [4]:
if __name__ == "__main__":

    # --- File 1 word count ---
    with open("file1.txt", "r", encoding="utf-8") as f:
        lines1 = f.readlines()

    with Pool() as pool:
        mapped1 = pool.map(mapper_word_count, lines1)
    reduced1 = reducer_word_count(mapped1)

    print("=== Word Count in file1.txt ===")
    with open("file1_output.txt", "w", encoding="utf-8") as f_out:
        for word, count in reduced1.most_common(20):
            line = f"{word}: {count}"
            print(line)
            f_out.write(line + "\n")


=== Word Count in file1.txt ===
the: 157
i: 97
to: 93
you: 87
a: 80
of: 72
and: 64
he: 63
her: 47
that: 42
dark: 42
snape: 41
not: 40
in: 39
him: 37
was: 35
lord: 35
had: 31
it: 30
said: 30


In [7]:
with open("file2.txt", "r", encoding="utf-8") as f:
      lines2 = f.readlines()

spell = SpellChecker()
english_words = set(spell.word_frequency.words())

with Pool() as pool:
  mapped2 = pool.starmap(mapper_non_english, [(line, english_words) for line in lines2])
reduced2 = reducer_non_english(mapped2)

print("\n=== Non-English Word Count in file2.txt ===")
with open("file2_output.txt", "w", encoding="utf-8") as f_out:
  for word, count in reduced2.most_common(20):
      line = f"{word}: {count}"
      print(line)
      f_out.write(line + "\n")


=== Non-English Word Count in file2.txt ===
hermione: 22
ll: 19
malfoy: 19
weasley: 15
borgin: 10
hagrid: 5
wouldn: 4
muggle: 4
ve: 4
isn: 3
knockturn: 3
malkin: 2
narcissa: 2
eeylops: 2
weasleys: 2
didn: 2
twilfitt: 1
ev: 1
rything: 1
malfoys: 1
