## Compute tf per word+article_id pair

In [78]:
%%writefile mapper.py

from collections import defaultdict

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode

stop_words = set()
with open("stop_words_en.txt") as f:
    stop_words.update(word.strip() for word in f.readlines())

for line in sys.stdin:
    try:
        article_id, text = unicode(line.strip()).split('\t', 1)
    except ValueError as e:
        continue

    words = [word.lower() for word in re.split("\W*\s+\W*", text, flags=re.UNICODE)
             if word.lower() not in stop_words]

    word_counts = defaultdict(int)
    for word in words:
        word_counts[word] += 1
    total_words = len(words)

    for word in word_counts:
        tf = word_counts[word] / total_words
        print '%s\t%s,%d,%d' % (word, article_id, word_counts[word], total_words)

Overwriting mapper.py


In [86]:
%%writefile reducer.py

import sys
import math

# Word whose records are currently being accumulated (None until the first
# valid record has been read).
prev_word = None
# article_id -> (word_count, total_words) for the current word; both values
# are kept as the strings read from the input.
articles = {}

def print_tf_idf(word, articles):
    """Print the tf*idf score of `word` for every article that contains it.

    Args:
        word: the term being scored.
        articles: mapping article_id -> (word_count, total_words), where
            word_count is the number of occurrences of `word` in the article
            and total_words is the article length. Values may be numeric
            strings; they are converted with float().

    Output (one line per article, written to stdout):
        <word>,<article_id><TAB><tf * idf with 10 decimals>
    where tf = word_count / total_words and
    idf = 1 / ln(1 + number of articles containing the word).
    """
    if not articles:
        # Nothing to emit; also keeps the hoisted idf below from evaluating
        # 1 / log(1), which would be a division by zero.
        return

    # idf depends only on how many articles contain the word, so it is
    # computed once rather than once per article.
    idf = 1 / math.log(1 + len(articles))

    for article_id, (word_count, total_words) in articles.items():
        tf = float(word_count) / float(total_words)
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and is also valid Python 3.
        print('%s,%s\t%.10f' % (word, article_id, tf * idf))

# Input records are "<word>\t<article_id>,<word_count>,<total_words>".
# The loop relies on all records of one word arriving consecutively: a change
# of word means the previous group is complete and can be scored.
for line in sys.stdin:
    try:
        word, value = line.strip().split('\t', 1)
        article_id, word_count, total_words = value.split(',')
    except ValueError:
        # Missing tab or wrong number of comma-separated fields: skip it.
        continue

    if prev_word != word:
        if prev_word is not None:
            print_tf_idf(prev_word, articles)
            articles = {}
        prev_word = word

    articles[article_id] = (word_count, total_words)

# Flush the final group: without this the last word seen on stdin would be
# accumulated but never printed.
if prev_word is not None:
    print_tf_idf(prev_word, articles)

Overwriting reducer.py


## Hadoop jobs

In [89]:
%%bash

NUM_REDUCERS=8

# The timestamp suffix (seconds plus microseconds) gives every run its own
# output directory name.
OUT_DIR="td_idf_result"$(date +"%s%6N")

# Remove any existing output directory of that name; stdout is discarded.
hdfs dfs -rm -r -skipTrash "${OUT_DIR}" > /dev/null

# Streaming job: mapper.py emits per-article word counts, reducer.py turns
# them into tf*idf scores. The stop-word list is shipped next to the scripts.
# The job-name property was previously misspelled ("mapred.jab.name"), so the
# name was never applied.
yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="Streaming td*idf" \
    -D mapreduce.job.reduces="${NUM_REDUCERS}" \
    -D mapreduce.partition.keycomparator.options=-k1,1 \
    -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
    -files mapper.py,reducer.py,/datasets/stop_words_en.txt \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -input /data/wiki/en_articles_part \
    -output "${OUT_DIR}" > /dev/null

# Print only the score for the word "labor" in article 12.
hdfs dfs -cat "${OUT_DIR}"/* | grep -E "^labor,12$(printf '\t').*" | cut -f 2

0.0003504690


18/05/13 08:46:40 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/05/13 08:46:41 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/05/13 08:46:41 INFO mapred.FileInputFormat: Total input files to process : 1
18/05/13 08:46:41 INFO mapreduce.JobSubmitter: number of splits:2
18/05/13 08:46:41 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1526123822060_0050
18/05/13 08:46:41 INFO impl.YarnClientImpl: Submitted application application_1526123822060_0050
18/05/13 08:46:41 INFO mapreduce.Job: The url to track the job: http://034ad37a251b:8088/proxy/application_1526123822060_0050/
18/05/13 08:46:41 INFO mapreduce.Job: Running job: job_1526123822060_0050
18/05/13 08:46:48 INFO mapreduce.Job: Job job_1526123822060_0050 running in uber mode : false
18/05/13 08:46:48 INFO mapreduce.Job:  map 0% reduce 0%
18/05/13 08:47:04 INFO mapreduce.Job:  map 40% reduce 0%
18/05/13 08:47:10 INFO mapreduce.Job:  map 60% reduce 0%
18/05/13 08:47:16 INFO 