# Relative Frequencies example
## Relative Frequencies
__Data Intensive Text Processing__   
Lin and Dyer   
3.3 COMPUTING RELATIVE FREQUENCIES   
(ported to Spark RDD API)

In [1]:
import re
import ast
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1]:
# start Spark Session (RUN THIS CELL AS IS)
from pyspark.sql import SparkSession
app_name = "relative_frequencies"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [4]:
%%time
DATA_toy = sc.parallelize(['dog aardvark pig banana','bear zebra pig'])
DATA_toy.glom().collect()
# using glom, we can see that there were as many partitions created as there are cores on this machine

CPU times: user 10 ms, sys: 10 ms, total: 20 ms
Wall time: 168 ms


In [6]:
DATA_F1 = sc.textFile("/media/notebooks/Assignments/HW3/master/data/googlebooks-eng-all-5gram-20090715-0-filtered.txt")\
            .map(lambda x: x.split('\t')[0]).cache()
DATA_F1.take(5)

['A BILL FOR ESTABLISHING RELIGIOUS',
 'A Biography of General George',
 'A Case Study in Government',
 'A Case Study of Female',
 'A Case Study of Limited']

In [5]:
DATA = sc.textFile("/media/notebooks/Assignments/HW3/master/data").map(lambda x: x.split('\t')[0]).cache()
DATA.take(5)

['A CATALOGUE OF THE PATHOLOGICAL',
 'A CT scan shows a',
 'A Case of Frustrated Take',
 'A Catalogue of Books and',
 'A Celebration of the First']

## Version 1

In [45]:
from itertools import combinations
from operator import add
from collections import defaultdict

def makePairs(row):
    words = row.split(' ')
    for w1, w2 in combinations(words, 2):
        yield((w1,"*"),1)
        yield((w1,w2),1)
        
        
def partitionByWord(x):
    return hash(x[0][0])

def partMapper(seq):
    currPair, currWord, pairTotal, wordTotal = None, None, 0, 0
    for r in list(seq):
        w1, w2 = r[0][0], r[0][1]
        if w2 == "*":
            if w1 != currWord: 
                wordTotal = 0
                currWord = w1
            wordTotal += r[1]    
        else:
            pairTotal += r[1]
        
            if currPair != r[0]: 
                yield(w1+" - "+w2, pairTotal/wordTotal)
                pairTotal = 0
                currPair = r[0]

In [36]:
RDD_v1_sm = DATA_toy.flatMap(makePairs)\
          .repartitionAndSortWithinPartitions(numPartitions=2, 
                                                  ascending=True, 
                                                  partitionFunc=partitionByWord,
                                                  keyfunc=lambda x: (x[0],x[1]))\
          .mapPartitions(partMapper)

In [43]:
%%time
RDD_v1_sm.glom().collect()

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 110 ms


[[('bear - pig', 0.5),
  ('bear - zebra', 0.5),
  ('dog - aardvark', 0.3333333333333333),
  ('dog - banana', 0.3333333333333333),
  ('dog - pig', 0.3333333333333333),
  ('pig - banana', 1.0)],
 [('aardvark - banana', 0.5), ('aardvark - pig', 0.5), ('zebra - pig', 1.0)]]

In [46]:
RDD_v1_med = DATA_F1.flatMap(makePairs)\
          .repartitionAndSortWithinPartitions(numPartitions=2, 
                                                  ascending=True, 
                                                  partitionFunc=partitionByWord,
                                                  keyfunc=lambda x: (x[0],x[1]))\
          .mapPartitions(partMapper)

In [47]:
%%time
RDD_v1_med.take(5)

CPU times: user 10 ms, sys: 10 ms, total: 20 ms
Wall time: 11.7 s


[('C - a', 0.02),
 ('C - activation', 0.04),
 ('C - and', 0.02),
 ('C - by', 0.1),
 ('C - circumference', 0.02)]

<img src="RelFreq_repartition.png">

## Version 2

In [38]:
from itertools import combinations
from operator import add
from collections import defaultdict


def makePairsWithinPartition(seq):
    pairsDict = defaultdict(int)
    for row in seq:
        words = row.split(' ')
        for w1, w2 in combinations(words, 2):
            pairsDict[(w1,"*")]+=1
            pairsDict[(w1,w2)] += 1       
    for k,v in pairsDict.items():
        yield(k,v)
    
def partitionByWord(x):
    return hash(x[0][0])


def calcRelFreq(seq):    
    
    seq = sorted(seq, key=lambda tup: (tup[0][0], tup[0][1]))
    
    currPair, currWord, pairTotal, wordTotal = None, None, 0, 0
    for r in list(seq):
        w1, w2 = r[0][0], r[0][1]
        if w2 == "*":
            if w1 != currWord: 
                wordTotal = 0
                currWord = w1
            wordTotal += r[1]    
        else:
            pairTotal += r[1]
        
            if currPair != r[0]: 
                yield(w1+" - "+w2, pairTotal/wordTotal)
                pairTotal = 0
                currPair = r[0]

In [39]:
RDD_sm = DATA_toy.mapPartitions(makePairsWithinPartition)\
              .reduceByKey(add, numPartitions=2, partitionFunc=partitionByWord)\
              .mapPartitions(calcRelFreq, True)\

In [40]:
%%time
RDD_sm.glom().collect()

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 169 ms


[[('bear - pig', 0.5),
  ('bear - zebra', 0.5),
  ('dog - aardvark', 0.3333333333333333),
  ('dog - banana', 0.3333333333333333),
  ('dog - pig', 0.3333333333333333),
  ('pig - banana', 1.0)],
 [('aardvark - banana', 0.5), ('aardvark - pig', 0.5), ('zebra - pig', 1.0)]]

In [41]:
RDD_med = DATA_F1.mapPartitions(makePairsWithinPartition)\
              .reduceByKey(add,numPartitions=2, partitionFunc=partitionByWord)\
              .mapPartitions(calcRelFreq, True)\

In [42]:
%%time
RDD_med.take(5)

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 5 s


[('C - a', 0.04),
 ('C - activation', 0.02),
 ('C - and', 0.1),
 ('C - by', 0.02),
 ('C - circumference', 0.02)]

<img src="RelFreq_reduceByKey.png">

In [17]:
RDD_lg = DATA.flatMap(makePairs)\
          .reduceByKey(add,partitionFunc=partitionByWord)\
          .mapPartitions(calcRelFreq, True)\
          .cache()

In [18]:
%%time
RDD_lg.take(5)

CPU times: user 100 ms, sys: 50 ms, total: 150 ms
Wall time: 10min 49s


[('o - A', 0.0006691201070592171),
 ('o - ARNO', 0.00033456005352960856),
 ('o - Adrianne', 0.00033456005352960856),
 ('o - Affairs', 0.00033456005352960856),
 ('o - America', 0.00033456005352960856)]

## Version 3
Can we get the best of both worlds? Using the framework to combine as well as do secondary sort for us?   
TODO

In [None]:
composite keys!

In [None]:
# TODO: calculate averages example with aggragateByKey
aTuple = (0,0) # As of Python3, you can't pass a literal sequence to a function.
rdd1 = rdd1.aggregateByKey(aTuple, lambda a,b: (a[0] + b,    a[1] + 1),
                                       lambda a,b: (a[0] + b[0], a[1] + b[1]))
finalResult = rdd1.mapValues(lambda v: v[0]/v[1]).collect()

'''
First lambda expression for Within-Partition Reduction Step::
a: is a TUPLE that holds: (runningSum, runningCount).
b: is a SCALAR that holds the next Value

Second lambda expression for Cross-Partition Reduction Step::
a: is a TUPLE that holds: (runningSum, runningCount).
b: is a TUPLE that holds: (nextPartitionsSum, nextPartitionsCount).
'''