In [1]:
# Interpreter name used by the shell cells that launch the MRJob script.
PYTHON = 'python3'
# Input file handed to the job when it is run locally.
DATA_PATH = 'reviews_devset.json'

In [2]:
# Install through the same interpreter ($PYTHON) that later runs the job, so the
# packages cannot end up in a different Python than the one executing the script.
! $PYTHON -m pip install mr3px mrjob



In [4]:
%%file chiSquaredReduce.py
from mrjob.job import MRJob
from mrjob.step import MRStep
# from mr3px.csvprotocol import CsvProtocol
import re

# Delimiter pattern for tokenizing text: matches every run of characters that is
# NOT an ASCII letter or one of '<', '>', '^', '|'. Use as WORD_RE.split(text).
# Compiled once at import time; calling re.split(..., line) here instead would
# fail with NameError because no `line` exists at module level.
WORD_RE = re.compile('[^a-zA-Z<>^|]+')

class ChiSquaredProcessor(MRJob):
    """Two-step MRJob.

    Step 1 (``mapper_count_title`` -> ``reducer_title_reducer``) emits one
    ``(token, 1)`` pair per token of every input line and sums the values per key.
    Step 2 (``mapper_top_title`` -> ``combiner_top_title`` -> ``reducer_top_title``)
    re-keys the step-1 records by author and keeps, per author, the title with
    the highest count.
    """

    def mapper_count_title(self, _, line):
        """Tokenize one input line and emit ``(lower-cased token, 1)`` per token.

        :param _: ignored input key.
        :param line: raw text line.
        """
        # Delimiters are runs of anything other than ASCII letters, '<', '>',
        # '^' and '|'. Empty tokens produced at the line boundaries are emitted
        # as well.
        for word in re.split('[^a-zA-Z<>^|]+', line):
            yield (word.lower(), 1)

    def reducer_title_reducer(self, authorTitle, checkouts):
        """Sum all values received for one key and emit ``(author, title, total)``.

        :param authorTitle: key, unpacked into exactly two elements (author, title).
        :param checkouts: iterable of values convertible with ``int``.

        NOTE(review): ``mapper_count_title`` emits plain word strings as keys,
        while this reducer unpacks the key into two elements; any key that is
        not a 2-element sequence raises ValueError here. Confirm the intended
        key shape.
        """
        author, title = authorTitle
        count = sum(int(rec) for rec in checkouts)

        # None key kept for CsvProtocol-style output (the CsvProtocol import at
        # the top of the file is currently commented out).
        yield None, (author, title, count)

    def mapper_top_title(self, _, line):
        """Re-key a step-1 record by author.

        :param _: ignored key (step 1 emits ``None``).
        :param line: step-1 value, indexed as ``[author, title, count]``.
        """
        author, title, nCheckouts = line[0], line[1], line[2]
        yield author, (title, nCheckouts)

    @staticmethod
    def _most_checked_out(titleCheckouts):
        """Return ``(title, count)`` of the pair with the largest positive count.

        Returns ``("", 0)`` when no pair has a count above zero; on ties the
        first pair seen wins.
        """
        best_title, best_count = "", 0
        for title, nCheckouts in titleCheckouts:
            n = int(nCheckouts)
            if n > best_count:
                best_title, best_count = title, n
        return best_title, best_count

    def combiner_top_title(self, author, titleCheckouts):
        """Pre-select the best ``(title, count)`` pair for an author.

        Emits the same ``author, (title, count)`` shape as ``mapper_top_title``
        so the reducer can consume combiner and mapper output alike. Nothing is
        emitted when no count is above zero.
        """
        title, count = self._most_checked_out(titleCheckouts)
        if count > 0:
            yield author, (title, count)

    def reducer_top_title(self, author, titleCheckouts):
        """Emit ``(author, title)`` for the author's title with the highest count.

        Nothing is emitted when no count is above zero.
        """
        title, count = self._most_checked_out(titleCheckouts)
        if count > 0:
            # None key kept for CsvProtocol-style output.
            yield None, (author, title)

    def steps(self):
        """Return the two MRSteps described in the class docstring."""
        return [
            MRStep(mapper   = self.mapper_count_title,
                   reducer  = self.reducer_title_reducer),
            MRStep(mapper   = self.mapper_top_title,
                   combiner = self.combiner_top_title,
                   reducer  = self.reducer_top_title)
        ]

if __name__ == '__main__':
    # NOTE(review): mrjob's runner guide advises against calling make_runner()
    # from the job file's own __main__ block when the job is executed on a
    # cluster runner - confirm before using this script with `-r hadoop`.
    job = ChiSquaredProcessor()
    with job.make_runner() as job_runner:
        job_runner.run()

        # Print each (key, value) pair of the final step on its own line.
        for pair in job.parse_output(job_runner.cat_output()):
            out_key, out_value = pair
            print(out_key, out_value, "\n", end='')

Writing chiSquaredReduce.py


In [97]:
%%file chiSquaredReduce.py
from mrjob.job import MRJob
from mrjob.step import MRStep
# from mr3px.csvprotocol import CsvProtocol
import re
import json

# Stop words filtered out by mapper_category_contains_term. Empty by default;
# the __main__ block rebinds it to the lines of ./stopwords.txt.
stopwords = set()

class ChiSquaredProcessor(MRJob):
    """Multi-step MRJob counting, per category, how many reviews contain each term.

    Step 1 tokenizes every JSON review and groups the per-review term lists by
    category. Step 2 emits, for every ``(category, term)`` pair, the number of
    reviews of that category containing the term together with the category's
    total review count. ``mapper_3`` / ``reducer_3`` are not wired into
    ``steps()`` yet.
    """

    def mapper_category_contains_term(self, _, line):
        """Emit ``(category, [unique terms])`` for one JSON-encoded review.

        :param _: ignored input key.
        :param line: one JSON document; its ``category`` and ``reviewText``
            fields are read (missing fields are treated as ``''``).
        """
        data = json.loads(line)
        category = data.get('category', '')
        # `or ''` also covers an explicit JSON null, which would otherwise
        # crash on .lower().
        reviewText = data.get('reviewText') or ''

        # Delimiters are runs of anything other than ASCII letters, '<', '>',
        # '^' and '|'.
        tokens = re.split('[^a-zA-Z<>^|]+', reviewText.lower())
        # A set keeps each term once per review. The length check also removes
        # the empty strings re.split yields at the text boundaries.
        terms = {tok for tok in tokens if len(tok) > 1 and tok not in stopwords}
        yield category, list(terms)

    def reducer_count_categories_contains_term(self, category, compromised_reviewText):
        """Emit ``(category, (number of reviews, [term list per review]))``.

        :param category: category key from the first mapper.
        :param compromised_reviewText: iterable of per-review term lists.

        NOTE(review): every term list of a category is held in memory at once
        and emitted as a single record.
        """
        compromised_reviews = list(compromised_reviewText)
        yield category, (len(compromised_reviews), compromised_reviews)

    def mapper_term_for_categories(self, category, count_reviews):
        """Emit ``((category, term), (1, category review count))`` per term occurrence.

        :param category: category key.
        :param count_reviews: ``(count, reviews)`` pair produced by
            ``reducer_count_categories_contains_term``.
        """
        count, reviews = count_reviews
        for review in reviews:
            for term in review:
                yield (category, term), (1, count)

    def reducer_term_for_categories(self, category, counts):
        """Emit ``(key, (reviews containing the term, category review count))``.

        :param category: the key emitted by ``mapper_term_for_categories``,
            i.e. a ``(category, term)`` pair.
        :param counts: iterable of ``(1, category review count)`` pairs.
        """
        number = 0
        count = None
        # Stream the values instead of materializing them; the category total
        # is taken from the first pair.
        for n, category_total in counts:
            number += n
            if count is None:
                count = category_total
        yield category, (number, count)

    def mapper_3(self, term, category_counts):
        """Re-key ``(term, [(category, count), ...])`` records by category.

        NOTE(review): not used by ``steps()``; the expected input shape does
        not match what ``reducer_term_for_categories`` emits - confirm before
        wiring it in.
        """
        for cat, cnt in category_counts:
            # Emit the loop variable `cnt`; the name `count` is not defined in
            # this method.
            yield cat, (term, cnt)

    def reducer_3(self, term, category_counts):
        """Collect all values of a key into a list and emit ``(key, list)``.

        Not used by ``steps()``.
        """
        token_counts = list(category_counts)
        # TODO: the placeholders `A =` and `C =` were left unfinished here.
        yield term, token_counts

    def steps(self):
        """Return the two active MRSteps (the third step is still disabled)."""
        return [
            MRStep(
                mapper   = self.mapper_category_contains_term,
                reducer  = self.reducer_count_categories_contains_term
            ),
            MRStep(
                mapper   = self.mapper_term_for_categories,
                reducer  = self.reducer_term_for_categories
            )
            # Disabled third step:
            #MRStep(
            #    mapper   = self.mapper_3,
            #    reducer  = self.reducer_3
            #)
        ]
   
if __name__ == '__main__':
    # Rebind the module-level stop-word set that the first mapper filters against.
    with open('./stopwords.txt', 'r') as stopword_file:
        stopwords = set(stopword_file.read().splitlines())

    # NOTE(review): mrjob's runner guide advises against calling make_runner()
    # from the job file's own __main__ block when the job is executed on a
    # cluster runner - confirm before using this script with `-r hadoop`.
    job = ChiSquaredProcessor()
    with job.make_runner() as job_runner:
        job_runner.run()

        # Print each (key, value) pair of the final step on its own line.
        for pair in job.parse_output(job_runner.cat_output()):
            out_key, out_value = pair
            print(out_key, out_value, "\n", end='')


Overwriting chiSquaredReduce.py


Running the MRJob locally

In [98]:
# Run the job on the local input file and capture its stdout in output.txt.
! {PYTHON} ./chiSquaredReduce.py {DATA_PATH} > output.txt

No configs specified for inline runner


In [None]:
# Run the job on Hadoop and capture its stdout in output.txt.
# NOTE(review): the streaming-jar path and the HDFS input path are specific to
# one machine/user - adjust before running elsewhere.
# NOTE(review): the script opens ./stopwords.txt at start-up, but only
# chiSquaredReduce.py is passed via --file - confirm stopwords.txt is available
# to the Hadoop tasks.
! $PYTHON ./chiSquaredReduce.py -r hadoop \
--hadoop-streaming-jar /opt/homebrew/opt/hadoop/libexec/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
hdfs:///user/maxkleinegger/reviews/reviews_devset.json --file ./chiSquaredReduce.py > output.txt