In [8]:
# We will need these so we can reload modules as we modify them
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
%%writefile MRJob5_4.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class stripes(MRJob):
    """
    Build stripes
    - Read in basis words from basisWords.txt
    - Emit stripes where the key and each value's key is in the basis
    - A stripe only lists basis words that sort after its key, so each
      unordered pair of words is counted under a single key
    - Counts are per input line: a pair contributes 1 for every line on
      which both words appear
    """

    def mapper_buildStripe_init(self):
        """Load the basis vocabulary (one word per line) into self.vocab."""
        with open('basisWords.txt', 'r') as myfile:
            self.vocab = set(word.strip() for word in myfile)

    def mapper_buildStripe(self, _, line):
        """
        Emit one stripe per basis word found on the input line.

        The line is split on tabs and only the first field is used; it is
        lower-cased and split on whitespace to obtain the words.

        Yields (word, stripe) pairs, where stripe maps every basis word on
        the same line that sorts after `word` to 1. Words with no such
        partner are not emitted.
        """
        fields = line.strip().split('\t')
        words = fields[0].lower().split()
        # De-duplicate and sort first, then keep only basis words: neither a
        # key nor a stripe entry may be outside the basis, and filtering a
        # sorted list keeps it sorted.
        basisWords = [word for word in sorted(set(words)) if word in self.vocab]
        # The last basis word has nothing after it, hence the [:-1].
        for index, key in enumerate(basisWords[:-1]):
            yield key, dict.fromkeys(basisWords[index + 1:], 1)

    @staticmethod
    def _merge_stripes(partials):
        """Sum an iterable of {word: count} stripes into a single stripe."""
        merged = {}
        for partial in partials:
            for word, count in partial.items():
                merged[word] = merged.get(word, 0) + count
        return merged

    def combiner_buildStripe(self, key, values):
        """Pre-aggregate the stripes emitted for `key` before the shuffle."""
        yield key, self._merge_stripes(values)

    def reducer_buildStripe(self, key, values):
        """Emit the final stripe for `key`: the element-wise sum of `values`."""
        yield key, self._merge_stripes(values)

    def steps(self):
        """
        Multi-step pipeline definitions

        A single step: load the basis, build stripes in the mapper, then sum
        them in the combiner and reducer.
        """
        return [
            MRStep(mapper_init=self.mapper_buildStripe_init,
                   mapper=self.mapper_buildStripe,
                   combiner=self.combiner_buildStripe,
                   reducer=self.reducer_buildStripe,
                   # Hadoop job property: request two reduce tasks.
                   jobconf={'mapred.reduce.tasks': 2})
        ]
    

# When this file is executed as a script, hand control to the run() entry
# point that stripes inherits from MRJob.
if __name__ == '__main__':
    stripes.run()

Overwriting MRJob5_4.py


In [16]:
from MRJob5_4 import stripes

def runJob5_4(filename):
    """
    Run the stripes job on `filename` and collect its results in memory.

    basisWords.txt is passed to the job with --file.

    Returns a list with one parsed output record per line of job output
    (each record is whatever mr_job.parse_output_line returns for it).
    """
    job_args = [filename, '--file', 'basisWords.txt']
    # To run on Hadoop instead, use:
    # job_args = [filename, '-r', 'hadoop', '--hadoop-home', '/usr/', '--file', 'basisWords.txt']
    mr_job = stripes(args=job_args)

    with mr_job.make_runner() as runner:
        runner.run()
        # Parse the output while the runner is still open.
        output = [mr_job.parse_output_line(line)
                  for line in runner.stream_output()]

    return output

In [23]:
myfile = './filtered-5Grams/googlebooks-eng-all-5gram-20090715-0-filtered.txt'
# Smaller input for quick experiments:
#myfile = './filtered-5Grams/short-5gram.txt'

# Run the job here so `output` is defined on a fresh kernel (Restart & Run
# All) instead of depending on a value left over from an earlier execution.
output = runJob5_4(myfile)

# One record per key: the word, its stripe, then a blank separator line.
# A single formatted string prints the same text as the former Python 2
# print statement and is also valid Python 3.
for item in output:
    print('%s %s \n' % (item[0], item[1]))

among {'human': 2, 'brought': 1, 'something': 1, 'want': 1, 'sense': 1, 'subject': 1, 'god': 2, 'least': 3, 'better': 1, 'going': 1, 'interest': 1, 'themselves': 9, 'ever': 3, 'body': 1, 'took': 2, 'known': 5, 'law': 1, 'states': 2, 'kind': 1, 'whether': 2, 'value': 1, 'common': 7, 'become': 1, 'side': 1} 

another {'body': 1, 'often': 2, 'certain': 2, 'nothing': 1, 'done': 1, 'want': 2, 'sense': 1, 'subject': 2, 'god': 1, 'away': 1, 'question': 2, 'going': 1, 'themselves': 3, 'ever': 3, 'development': 2, 'human': 3, 'effect': 2, 'here': 2, 'took': 1, 'cannot': 1, 'known': 1, 'law': 1, 'change': 1, 'kind': 5, 'necessary': 1, 'study': 1, 'value': 1, 'person': 10, 'common': 1, 'become': 1, 'went': 2, 'side': 4, 'once': 1} 

away {'often': 2, 'certain': 1, 'mind': 2, 'states': 1, 'done': 3, 'want': 1, 'subject': 1, 'god': 4, 'question': 1, 'enough': 3, 'going': 5, 'ever': 1, 'body': 1, 'took': 6, 'hand': 2, 'water': 1, 'cannot': 2, 'nothing': 2, 'off': 1, 'whether': 1, 'went': 10, 'fact':