In [1]:
%config IPCompleter.greedy=True

In [1]:
%%writefile cargaFicheros.py
#!/usr/bin/python
# Copyright 2009-2010 Yelp
# Copyright 2013 David Marin
# Copyright 2018 Yelp
# Copyright 2019 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Determine the most used word in the input, ignoring common "stop" words.
Shows how to do a multi-step job, and how to load a support file
from the same directory.
"""
import re

from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol
from mrjob.step import MRStep

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWord(MRJob):
    FILES = ['stop_words.txt']

    OUTPUT_PROTOCOL = JSONValueProtocol

    def configure_args(self):
        super(MRMostUsedWord, self).configure_args()

        # allow for alternate stop words file
        self.add_file_arg(
            '--stop-words-file',
            dest='stop_words_file',
            default=None,
            help='alternate stop words file. lowercase words, one per line',
        )

    def mapper_init(self):
        stop_words_path = self.options.stop_words_file or 'stop_words.txt'

        with open(stop_words_path) as f:
            self.stop_words = set(line.strip() for line in f)

    def mapper_get_words(self, _, line):
        # yield each word in the line
        for word in WORD_RE.findall(line):
            word = word.lower()
            if word not in self.stop_words:
                yield (word, 1)

    def combiner_count_words(self, word, counts):
        # sum the words we've seen so far
        yield (word, sum(counts))

    def reducer_count_words(self, word, counts):
        # send all (num_occurrences, word) pairs to the same reducer.
        # num_occurrences is so we can easily use Python's max() function.
        yield None, (sum(counts), word)

    # discard the key; it is just None
    """def reducer_find_max_word(self, _, word_count_pairs):
        # each item of word_count_pairs is (count, word),
        # so yielding one results in key=counts, value=word
        try:
            yield max(word_count_pairs)
        except ValueError:
            pass"""

    def steps(self):
        return [
            MRStep(mapper_init=self.mapper_init,
                   mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            #MRStep(reducer=self.reducer_find_max_word)
        ]


if __name__ == '__main__':
    MRMostUsedWord.run()

Overwriting cargaFicheros.py


In [2]:
!python cargaFicheros.py ingles.txt

[57,"'"]
[1,"'as"]
[1,"'benedick"]
[1,"'em"]
[1,"'king"]
[1,"'li"]
[1,"'n"]
[1,"'never"]
[1,"'no"]
[1,"'none"]
[1,"'nus"]
[1,"'o"]
[1,"'ri"]
[1,"'scape"]
[1,"'sleep"]
[2,"'t"]
[1,"'thrice"]
[11,"'tis"]
[2,"'twas"]
[1,"'when'"]
[1,"'you"]
[2,"0"]
[1,"000"]
[81,"1"]
[1,"101"]
[1,"1039"]
[3,"105"]
[1,"108"]
[1,"10th"]
[1,"111"]
[1,"115"]
[1,"117"]
[1,"119"]
[1,"122"]
[1,"127"]
[1,"129"]
[1,"131"]
[1,"135"]
[1,"137"]
[3,"141"]
[4,"1430"]
[1,"145"]
[1,"15"]
[1,"150"]
[1,"1500"]
[1,"151"]
[2,"153"]
[1,"154"]
[1,"1564"]
[1,"1568"]
[1,"157"]
[1,"1582"]
[1,"1589"]
[1,"159"]
[1,"1596"]
[1,"1602"]
[1,"1607"]
[2,"1608"]
[2,"1616"]
[1,"1623"]
[1,"163"]
[1,"168"]
[1,"170"]
[1,"175"]
[1,"178"]
[1,"181"]
[2,"183"]
[1,"187"]
[1,"188"]
[1,"1887"]
[2,"19"]
[1,"191"]
[1,"192"]
[2,"194"]
[1,"197"]
[1,"1998"]
[4,"1st"]
[42,"2"]
[2,"20"]
[1,"200"]
[1,"2001"]
[1,"2008"]
[1,"2018"]
[1,"204"]
[2,"211"]
[1,"213"]
[1,"218"]
[1,"22"]
[1,"222"]
[1,"225"]
[16,"226"]
[2,"228"]
[52,"229"]
[1,"232"]
[42,"234"]
[11,"235

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\MRSANC~1\AppData\Local\Temp\cargaFicheros.mrsanchez.20200901.155932.761009
Running step 1 of 1...
job output is in C:\Users\MRSANC~1\AppData\Local\Temp\cargaFicheros.mrsanchez.20200901.155932.761009\output
Streaming final output from C:\Users\MRSANC~1\AppData\Local\Temp\cargaFicheros.mrsanchez.20200901.155932.761009\output...
Removing temp directory C:\Users\MRSANC~1\AppData\Local\Temp\cargaFicheros.mrsanchez.20200901.155932.761009...
