In [1]:
# Input file handed to the job script when it is launched from a shell cell.
DATA_PATH = "reviews_devset.json"
# Interpreter command used by the shell cells to start the job script.
PYTHON = "python3"

In [2]:
! pip3 install mr3px mrjob



In [4]:
%%file chiSquaredReduce.py
from mrjob.job import MRJob
from mrjob.step import MRStep
# from mr3px.csvprotocol import CsvProtocol
import re

# Token delimiter pattern: any run of characters other than ASCII letters and
# the symbols < > ^ | separates two tokens.  It is compiled once at import
# time; calling re.split() here would need a `line` that does not exist at
# module level and would raise NameError as soon as the script is loaded.
WORD_RE = re.compile('[^a-zA-Z<>^|]+')

class ChiSquaredProcessor(MRJob):
    """Two-step job: count tokens, then keep the largest count per group.

    Step 1 splits every input line into lower-cased tokens and sums the
    occurrences of each key.  Step 2 regroups those totals by their first
    field and emits, for each group, the entry with the largest total.

    NOTE(review): method and parameter names still use author/title/checkouts
    vocabulary and are kept unchanged for interface compatibility.  In this
    job the "title" is a token; the mapper supplies no "author", so every
    token lands in the ``None`` group.  Confirm whether a real grouping field
    is intended.
    """

    # Any run of characters other than ASCII letters and < > ^ | separates tokens.
    _TOKEN_DELIMITERS = re.compile('[^a-zA-Z<>^|]+')

    def mapper_count_title(self, _, line):
        """Emit ``(token, 1)`` for every non-empty, lower-cased token of ``line``."""
        for word in self._TOKEN_DELIMITERS.split(line):
            # re.split yields '' when the line starts or ends with a
            # delimiter; those are not tokens and must not be counted.
            if word:
                yield (word.lower(), 1)

    def reducer_title_reducer(self, authorTitle, checkouts):
        """Sum the counts of one key and emit ``None, (group, item, total)``.

        ``authorTitle`` is either a two-element ``(group, item)`` pair or a
        bare item such as the plain token emitted by ``mapper_count_title``.
        A bare item is placed in the ``None`` group; unpacking it directly
        would fail, or split a two-character token into its characters.
        """
        if isinstance(authorTitle, (list, tuple)) and len(authorTitle) == 2:
            author, title = authorTitle
        else:
            author, title = None, authorTitle

        count = sum(int(rec) for rec in checkouts)

        # The key is None; all information travels in the value tuple.
        yield None, (author, title, count)

    def mapper_top_title(self, _, line):
        """Re-key a ``(group, item, total)`` record by its group."""
        # The value produced by the previous step is already a sequence.
        author, title, nCheckouts = line[0], line[1], line[2]
        yield author, (title, nCheckouts)

    @staticmethod
    def _pick_top(titleCheckouts):
        """Return ``(item, total)`` with the largest total; ``("", 0)`` if none is positive.

        The comparison is strict, so the first item seen wins a tie.
        """
        best_title, best_count = "", 0
        for title, nCheckouts in titleCheckouts:
            total = int(nCheckouts)
            if total > best_count:
                best_title, best_count = title, total
        return best_title, best_count

    def combiner_top_title(self, author, titleCheckouts):
        """Pre-select the best ``(item, total)`` of a group on the map side.

        Emits the same ``key, (item, total)`` shape as ``mapper_top_title`` so
        the reducer can consume combiner and mapper output alike.
        """
        best_title, best_count = self._pick_top(titleCheckouts)
        if best_count > 0:
            yield author, (best_title, best_count)

    def reducer_top_title(self, author, titleCheckouts):
        """Emit ``None, (group, item)`` for the item with the largest total."""
        best_title, best_count = self._pick_top(titleCheckouts)
        if best_count > 0:
            # The key is None; the result is carried in the value tuple.
            yield None, (author, best_title)

    def steps(self):
        """Wire the counting step and the per-group maximum step together."""
        return [
            MRStep(mapper   = self.mapper_count_title,
                   reducer  = self.reducer_title_reducer),
            MRStep(mapper   = self.mapper_top_title,
                   combiner = self.combiner_top_title,
                   reducer  = self.reducer_top_title)
        ]

if __name__ == '__main__':
    import sys

    # Runners that launch tasks in separate processes (e.g. -r hadoop,
    # -r local) re-invoke this same file with one of these switches.  Such a
    # task invocation must execute its step through MRJob.run(); building a
    # runner there would try to start the whole job again.
    task_switches = ('--mapper', '--combiner', '--reducer')
    if any(arg in task_switches for arg in sys.argv[1:]):
        ChiSquaredProcessor.run()
    else:
        # Driver invocation: run the job and print each result pair as
        # "<key> <value> " followed by a newline.
        myjob1 = ChiSquaredProcessor()
        with myjob1.make_runner() as runner:
            runner.run()

            for key, value in myjob1.parse_output(runner.cat_output()):
                print(key, value, "\n", end='')

Writing chiSquaredReduce.py


In [13]:
%%file chiSquaredReduce.py
from mrjob.job import MRJob
from mrjob.step import MRStep
# from mr3px.csvprotocol import CsvProtocol
import re
import json

class ChiSquaredProcessor(MRJob):
    """Count how often each token occurs in the ``reviewText`` of the input.

    Every non-blank input line is parsed as one JSON object; its
    ``reviewText`` value is split into tokens and the job emits
    ``(token, total occurrences)`` pairs.

    NOTE(review): tokens are counted case-sensitively ("Good" and "good" are
    different keys) - confirm this is intended.
    """

    # Any run of characters other than ASCII letters and < > ^ | separates
    # tokens.  Compiled once instead of on every record.
    _DELIMITERS = re.compile('[^a-zA-Z<>^|]+')

    def mapper(self, _, line):
        """Emit ``(token, 1)`` for every non-empty token of the review text.

        Raises whatever ``json.loads`` raises for a non-blank line that is
        not valid JSON, so corrupt input is not silently ignored.
        """
        # A blank line carries no record; json.loads('') would raise.
        if not line.strip():
            return

        data = json.loads(line)

        # `or ''` also covers an explicit null reviewText, which .get()'s
        # default would not replace and re.split could not handle.
        review_text = data.get('reviewText') or ''

        for word in self._DELIMITERS.split(review_text):
            # re.split yields '' when the text starts or ends with a
            # delimiter (or is empty); that is not a token.
            if word:
                yield (word, 1)

    def reducer_count(self, word, counts):
        """Emit ``(word, sum of counts)``; also usable as the combiner."""
        yield (word, sum(counts))

    def steps(self):
        """Single step: tokenize, pre-aggregate per mapper, then total."""
        return [
            MRStep(mapper   = self.mapper,
                   # Summation is associative, so the reducer doubles as a
                   # combiner and shrinks the data shuffled to the reducers.
                   combiner = self.reducer_count,
                   reducer  = self.reducer_count)
        ]
   
if __name__ == '__main__':
    import sys

    # Runners that launch tasks in separate processes (e.g. -r hadoop,
    # -r local) re-invoke this same file with one of these switches.  Such a
    # task invocation must execute its step through MRJob.run(); building a
    # runner there would try to start the whole job again.
    task_switches = ('--mapper', '--combiner', '--reducer')
    if any(arg in task_switches for arg in sys.argv[1:]):
        ChiSquaredProcessor.run()
    else:
        # Driver invocation: run the job and print each result pair as
        # "<key> <value> " followed by a newline.
        myjob1 = ChiSquaredProcessor()
        with myjob1.make_runner() as runner:
            runner.run()

            for key, value in myjob1.parse_output(runner.cat_output()):
                print(key, value, "\n", end='')


Overwriting chiSquaredReduce.py


Running the MRJob locally

In [14]:
! $PYTHON ./chiSquaredReduce.py $DATA_PATH > output.txt

No configs specified for inline runner


In [24]:
! $PYTHON ./chiSquaredReduce.py --hadoop-streaming-jar /opt/homebrew/opt/hadoop/libexec/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
-r hadoop hdfs:///user/maxkleinegger/reviews/reviews_devset.json --file ./chiSquaredReduce.py > output.txt

No configs specified for hadoop runner
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  JobTracker metrics system already initialized!
  Error launching job , bad input path : File does not exist: /tmp/hadoop/mapred/staging/maxkleinegger1200901391/.staging/job_local1200901391_0001/files/chiSquaredReduce-1.py#chiSquaredReduce-1.py
Can't fetch history log; missing job ID
No counters found
Can't fetch history log; missing job ID
Can't fetch task logs; missing application ID
Traceback (most recent call last):
  File "/Users/maxkleinegger/Documents/TUWien/2024SS/DIC/tu-dic-ss24/./chiSquaredReduce.py", line 37, in <module>
    runner.run()
  File "/Users/maxkleinegger/anaconda3/lib/python3.10/site-packages/mrjob/runner.py", line 503, in run
    self._run()
  File "/Users/maxkleinegger/anaconda3/lib/python3.10/site-packages/mrjob/hadoop.py", line 329, in _run
    self._run_job_in_hadoop()
  File "/Users/maxkleinegger/anaconda3/lib/pytho