Write a WordCount program that counts all the names in the dataset. A name is a word with the following properties:

The first character is not a digit (other characters can be digits).
The first character is uppercase, all the other characters that are letters are lowercase.
Across all occurrences of this word in the dataset, compared case-insensitively, fewer than 0.5% fail condition (2).
Sort by count in descending order (most popular first). Output format:

name <tab> count

The result is the 5th line in the output.

In [1]:
%%writefile mapper.py

import sys
import re

# Python 2 only: make UTF-8 the default codec for implicit str <-> unicode
# conversions, which this script relies on when it calls unicode() on input
# lines without naming an encoding.
# NOTE(review): reload(sys) is presumably needed because setdefaultencoding
# is removed from the sys module at interpreter start-up -- confirm.
reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode

def is_name(s):
    """Return True when `s` is non-empty and does not start with a digit.

    This is only the cheap pre-filter for candidate words; the
    capitalisation rule is checked separately.
    """
    return bool(s) and not s[0].isdigit()

def is_cond2_ch(ch):
    """Return True when a non-leading character is acceptable in a name.

    Letters must be lowercase; anything that is not a letter (digits,
    punctuation, ...) is accepted as-is.
    """
    return not ch.isalpha() or ch.islower()

def is_cond2(s):
    """Return True when `s` is capitalised like a name.

    The first character must be uppercase and every remaining character
    must pass `is_cond2_ch`. `s` must be non-empty; callers filter empty
    words out beforehand.
    """
    leading, rest = s[0], s[1:]
    if leading.isupper():
        return all(is_cond2_ch(ch) for ch in rest)
    return False

# Words are separated by whitespace; any non-word characters touching that
# whitespace are consumed together with it.
re_split = re.compile(r"\W*\s+\W*", flags=re.UNICODE)

for raw_line in sys.stdin:
    try:
        # Both the decoding and the two-field unpacking may raise ValueError
        # (UnicodeDecodeError is a subclass); such lines are skipped.
        _article_id, body = unicode(raw_line).split('\t', 1)
    except ValueError:
        continue
    # A leading space lets the separator pattern also strip punctuation in
    # front of the first word; the resulting empty first token is rejected
    # by is_name().
    for token in re_split.split(' ' + body):
        if is_name(token):
            # key = lowercased word, then an occurrence count of 1, then
            # 1/0 for whether this occurrence is capitalised like a name.
            print("%s\t%d\t%d" % (token.lower(), 1, is_cond2(token)))

Overwriting mapper.py


In [2]:
%%writefile reducer.py

import sys

def reset_count(key):
    """Begin a new accumulation group for `key`.

    Sets the module-level `current_key` to `key` and zeroes both running
    totals, `name_sum` and `cond2_sum`.
    """
    global current_key, name_sum, cond2_sum
    current_key = key
    name_sum = cond2_sum = 0


# Create the module-level state before any input is processed.
reset_count(None)

def commit_current():
    """Print the finished group if it qualifies as a name.

    Reads the module-level `current_key`, `name_sum` (all occurrences of
    the key) and `cond2_sum` (occurrences flagged by the mapper as
    capitalised like a name). When more than 99.5% of the occurrences are
    flagged, writes "<key> TAB <cond2_sum>" to stdout; otherwise prints
    nothing.
    """
    # No group has been started yet, or the group carries no occurrences.
    # The second test also protects the division below from a zero total.
    if not current_key or not name_sum:
        return
    if float(cond2_sum) / name_sum > 0.995:
        print("%s\t%d" % (current_key, cond2_sum))
                
# The loop treats a change of key as the end of a group, i.e. it assumes
# that records sharing a key reach this script on adjacent lines.
for record in sys.stdin:
    try:
        word, occurrences, cond2_hits = record.strip().split('\t', 2)
        occurrences, cond2_hits = int(occurrences), int(cond2_hits)
    except ValueError:
        # Wrong number of fields or non-integer counts: ignore the record.
        continue
    if word != current_key:
        commit_current()
        reset_count(word)
    name_sum += occurrences
    cond2_sum += cond2_hits

# The last group has no following key to trigger its commit, so flush it.
commit_current()

Overwriting reducer.py


In [3]:
%%bash

# Clear the output directory left by a previous run.
hdfs dfs -rm -r -skipTrash namecount_step1 > /dev/null
# Step 1: per-word totals, filtered down to names by reducer.py.
# Job stdout is discarded; the job log (stderr) is saved to ./yarn1 and
# replayed on stderr once the job has finished.
yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="name count step 1" \
    -D mapreduce.job.reduces=8 \
    -files mapper.py,reducer.py \
    -mapper "python2 mapper.py" \
    -reducer "python2 reducer.py" \
    -input /data/wiki/en_articles_part \
    -output namecount_step1 > /dev/null 2>yarn1
cat yarn1 >/dev/stderr


19/06/22 11:12:01 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:12:01 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:12:02 INFO mapred.FileInputFormat: Total input files to process : 1
19/06/22 11:12:02 INFO mapreduce.JobSubmitter: number of splits:2
19/06/22 11:12:02 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1561045447920_0039
19/06/22 11:12:02 INFO impl.YarnClientImpl: Submitted application application_1561045447920_0039
19/06/22 11:12:02 INFO mapreduce.Job: The url to track the job: http://0de65cec5c1c:8088/proxy/application_1561045447920_0039/
19/06/22 11:12:02 INFO mapreduce.Job: Running job: job_1561045447920_0039
19/06/22 11:12:08 INFO mapreduce.Job: Job job_1561045447920_0039 running in uber mode : false
19/06/22 11:12:08 INFO mapreduce.Job:  map 0% reduce 0%
19/06/22 11:12:25 INFO mapreduce.Job:  map 43% reduce 0%
19/06/22 11:12:31 INFO mapreduce.Job:  map 61% reduce 0%
19/06/22 11:12:37 INFO 

In [4]:
%%writefile swapper.py

import sys
def swap_fields(line):
    """Exchange the two tab-separated fields of `line`.

    Surrounding whitespace is stripped and the line is split on the first
    tab only. Returns "<second> TAB <first>" without a trailing newline, or
    None when the line has no tab and therefore cannot be split.
    """
    try:
        key, val = line.strip().split('\t', 1)
    except ValueError:
        return None
    return "%s\t%s" % (val, key)


if __name__ == "__main__":
    for line in sys.stdin:
        swapped = swap_fields(line)
        # Skip malformed lines instead of crashing the whole task on them.
        if swapped is not None:
            print(swapped)

Overwriting swapper.py


In [5]:
%%bash

# Clear the output directory left by a previous run.
hdfs dfs -rm -r -skipTrash namecount_step2 > /dev/null

# Step 2: order the step-1 result by count, largest first.
# The mapper swaps each line to "count TAB name" so the count becomes the
# sort key; the key comparator is given "-nr" to sort the keys numerically in
# reverse; the single reducer swaps the fields back, yielding one output file.
# Job stdout is discarded; the job log (stderr) is saved to ./yarn2.
yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="name count step 2" \
    -D mapreduce.job.reduces=1 \
    -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
    -D mapreduce.partition.keycomparator.options=-nr \
    -files swapper.py \
    -mapper "python2 swapper.py" \
    -reducer "python2 swapper.py" \
    -input namecount_step1 \
    -output namecount_step2 > /dev/null 2>yarn2
cat yarn2 >/dev/stderr


19/06/22 11:13:00 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:13:00 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:13:01 INFO mapred.FileInputFormat: Total input files to process : 8
19/06/22 11:13:01 INFO mapreduce.JobSubmitter: number of splits:8
19/06/22 11:13:01 INFO Configuration.deprecation: mapred.text.key.comparator.options is deprecated. Instead, use mapreduce.partition.keycomparator.options
19/06/22 11:13:01 INFO Configuration.deprecation: mapred.output.key.comparator.class is deprecated. Instead, use mapreduce.job.output.key.comparator.class
19/06/22 11:13:01 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1561045447920_0040
19/06/22 11:13:01 INFO impl.YarnClientImpl: Submitted application application_1561045447920_0040
19/06/22 11:13:01 INFO mapreduce.Job: The url to track the job: http://0de65cec5c1c:8088/proxy/application_1561045447920_0040/
19/06/22 11:13:01 INFO mapreduce.Job: Running job: 

In [6]:
%%bash

# Replay both saved job logs on stderr.
for job_log in yarn1 yarn2; do
    cat "$job_log" >/dev/stderr
done
# Print only the 5th line of the sorted result, then stop reading.
hdfs dfs -cat namecount_step2/part-00000 | sed -n '5{p;q;}'

french	5742


19/06/22 11:12:01 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:12:01 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
19/06/22 11:12:02 INFO mapred.FileInputFormat: Total input files to process : 1
19/06/22 11:12:02 INFO mapreduce.JobSubmitter: number of splits:2
19/06/22 11:12:02 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1561045447920_0039
19/06/22 11:12:02 INFO impl.YarnClientImpl: Submitted application application_1561045447920_0039
19/06/22 11:12:02 INFO mapreduce.Job: The url to track the job: http://0de65cec5c1c:8088/proxy/application_1561045447920_0039/
19/06/22 11:12:02 INFO mapreduce.Job: Running job: job_1561045447920_0039
19/06/22 11:12:08 INFO mapreduce.Job: Job job_1561045447920_0039 running in uber mode : false
19/06/22 11:12:08 INFO mapreduce.Job:  map 0% reduce 0%
19/06/22 11:12:25 INFO mapreduce.Job:  map 43% reduce 0%
19/06/22 11:12:31 INFO mapreduce.Job:  map 61% reduce 0%
19/06/22 11:12:37 INFO 