# Exercise 3 (MapReduce)

In [108]:
%%file favbrand.py
from mrjob.job import MRJob
from collections import Counter
from mr3px.csvprotocol import CsvProtocol

class MRFavouriteBrand(MRJob):
    """Determine each user's most frequently purchased brand.

    Input lines are comma-separated event records. The mapper reads the
    event type, product id, brand and user id from columns 1, 2, 5 and 7.
    The job emits one ``(user_id, brand)`` CSV row per user that has at
    least one purchase row with a non-empty brand and product id.
    """

    OUTPUT_PROTOCOL = CsvProtocol

    # The mapper indexes up to column 7 (user_id), so a usable row needs
    # at least this many cells.
    MIN_COLUMNS = 8

    def mapper(self, _, line):
        """Yield ``(user_id, brand)`` for every usable purchase row.

        The header row, rows with too few columns, non-purchase events and
        rows lacking a brand or product id are skipped.
        """
        # Skip the CSV header row.
        if line.startswith("event_time"):
            return
        cells = line.split(",")
        # Truncated or malformed rows would otherwise raise IndexError and
        # abort the whole job.
        if len(cells) < self.MIN_COLUMNS:
            return
        event_type = cells[1]
        product_id = cells[2]
        brand = cells[5]
        user_id = cells[7]
        if event_type != "purchase" or not brand or not product_id:
            return
        yield user_id, brand

    def reducer(self, user_id, brands):
        """Yield ``(None, (user_id, favourite_brand))`` for one user.

        The favourite is the brand with the highest purchase count. Ties
        are broken by picking the alphabetically first brand, so the result
        does not depend on the order in which values reach the reducer.
        """
        brand_counts = Counter(brands)
        fav_brand = min(
            brand_counts,
            key=lambda brand: (-brand_counts[brand], brand),
        )
        yield None, (user_id, fav_brand)


# Run the job only when this file is executed as a script.
if __name__ == "__main__":
    MRFavouriteBrand.run()


Overwriting favbrand.py


In [119]:
# Local run on the short October sample. `>` truncates the CSV so that
# re-running this cell does not append duplicate rows to a previous result.
! python favbrand.py /home/adbs20/shared/ecommerce/2019-Oct-short.csv > output-favbrand-short.csv

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/favbrand.e0test.20200421.160325.770096
job output is in /tmp/favbrand.e0test.20200421.160325.770096/output
Streaming final output from /tmp/favbrand.e0test.20200421.160325.770096/output...
Removing temp directory /tmp/favbrand.e0test.20200421.160325.770096...


In [None]:
# Run favbrand.py with the Hadoop runner (-r hadoop) on the full October
# file in HDFS; the job's stdout is written to output-favbrand.csv.
! HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/ python favbrand.py -r hadoop hdfs:///home/adbs20/shared/ecommerce/2019-Oct.csv > output-favbrand.csv

In [115]:
%%file brandcount.py
from mrjob.job import MRJob
from mrjob.step import MRStep
from collections import Counter
from datetime import datetime
from operator import itemgetter
from mr3px.csvprotocol import CsvProtocol
        
class MRBrandCountCombined(MRJob):
    """Count, per brand, the users whose single favourite brand it is.

    Step 1 groups purchase rows by user and keeps only users that have
    purchases in every month of ``REQUIRED_MONTHS`` and an unambiguous most
    purchased brand. Step 2 inverts the pairs and counts users per brand.
    Output is one ``(brand, user_count)`` CSV row per brand.
    """

    OUTPUT_PROTOCOL = CsvProtocol

    # The mapper indexes up to column 7 (user_id), so a usable row needs
    # at least this many cells.
    MIN_COLUMNS = 8

    # A user must have at least one purchase in each of these calendar
    # months (October and November) to be counted.
    REQUIRED_MONTHS = frozenset((10, 11))

    def mapper_brand_users(self, _, line):
        """Yield ``(user_id, (month, brand))`` for every usable purchase row.

        The header row, rows with too few columns, rows whose timestamp
        does not start with a ``YYYY-MM-DD`` date, non-purchase events and
        rows lacking a brand or product id are skipped.
        """
        # Skip the CSV header row.
        if line.startswith("event_time"):
            return
        cells = line.split(",")
        # Truncated or malformed rows would otherwise raise IndexError and
        # abort the whole job.
        if len(cells) < self.MIN_COLUMNS:
            return
        event_time = cells[0]
        if not event_time:
            return
        try:
            # Only the date part (before the first whitespace) is parsed.
            event_date = datetime.strptime(event_time.split()[0], "%Y-%m-%d")
        except (ValueError, IndexError):
            # ValueError: not a YYYY-MM-DD date.
            # IndexError: the cell contained only whitespace.
            return
        event_type = cells[1]
        product_id = cells[2]
        brand = cells[5]
        user_id = cells[7]
        if event_type != "purchase" or not brand or not product_id:
            return
        yield user_id, (event_date.month, brand)

    def reducer_fav_brand(self, user_id, values):
        """Yield ``(user_id, favourite_brand)`` for qualifying users.

        A user qualifies when they purchased in every required month and
        one brand has a strictly higher purchase count than all others.
        """
        values = list(values)
        # Build a set: under Python 3 ``map`` returns a one-shot iterator,
        # so two successive ``in`` tests on it would miss a month that was
        # already consumed while searching for the other one.
        months = set(map(itemgetter(0), values))
        if not self.REQUIRED_MONTHS <= months:
            return
        brand_counts = Counter(map(itemgetter(1), values))
        most_common_brands = brand_counts.most_common(2)
        # Drop users whose top two brands are tied: no unique favourite.
        if (len(most_common_brands) > 1
                and most_common_brands[0][1] == most_common_brands[1][1]):
            return
        fav_brand = most_common_brands[0][0]
        yield user_id, fav_brand

    def inverse_mapper(self, key, value):
        """Swap key and value so the next reducer groups users by brand."""
        yield value, key

    def reducer_brand_count(self, brand, users):
        """Yield ``(None, (brand, number_of_users))`` for one brand."""
        # Count by streaming instead of materialising every user id.
        yield None, (brand, sum(1 for _ in users))

    def steps(self):
        """Return the two-step pipeline: favourite per user, then count per brand."""
        return [
            MRStep(mapper=self.mapper_brand_users,
                   reducer=self.reducer_fav_brand),
            MRStep(mapper=self.inverse_mapper,
                   reducer=self.reducer_brand_count),
        ]

# Run the job only when this file is executed as a script.
if __name__ == "__main__":
    MRBrandCountCombined.run()


Overwriting brandcount.py


In [116]:
# Local run on the short October and November samples. `>` truncates the CSV
# so that re-running this cell does not append duplicate rows.
! python brandcount.py /home/adbs20/shared/ecommerce/2019-Oct-short.csv /home/adbs20/shared/ecommerce/2019-Nov-short.csv > output-brandcount-short.csv

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/brandcount.e0test.20200421.154503.834139
Running step 2 of 2...
job output is in /tmp/brandcount.e0test.20200421.154503.834139/output
Streaming final output from /tmp/brandcount.e0test.20200421.154503.834139/output...
"apple",3
"samsung",2
Removing temp directory /tmp/brandcount.e0test.20200421.154503.834139...


Running a Hadoop job

In [118]:
# Run brandcount.py with the Hadoop runner (-r hadoop) on the full October
# and November files in HDFS; the job's stdout is written to
# output-brandcount.csv.
! HADOOP_HOME=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/ python brandcount.py -r hadoop hdfs:///home/adbs20/shared/ecommerce/2019-Oct.csv hdfs:///home/adbs20/shared/ecommerce/2019-Nov.csv > output-brandcount.csv

No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/bin...
Looking for hadoop binary in $PATH...
Found hadoop binary: /usr/bin/hadoop
Using Hadoop version 3.0.0
Looking for Hadoop streaming jar in /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/...
Found Hadoop streaming jar: /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar
Creating temp directory /tmp/favbrand.e0test.20200421.155801.981212
Copying local files to hdfs:///user/e0test/tmp/mrjob/favbrand.e0test.20200421.155801.981212/files/...
Running step 1 of 1...
  Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
  packageJobJar: [] [/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/jars/hadoop-streaming-3.0.0-cdh6.3.2.jar] /tmp/streamjob2757712408007014417.jar tmpDir=null
  Connecting to ResourceManager at c100.local/10.7.0.100:8032
  Connecting t