# Map-reduce type operations

In [70]:
import pprint
from functools import partial
import numpy as np

In [71]:
# Five draws from a standard normal distribution. No seed is set, so the
# values (and the outputs below) change on every kernel restart.
data = np.random.randn(5)
# Keyword arguments for every parameter of complicated_function except `field`.
h = { 'x': 1, 'y': 10, 'flim': 100, 'flam': 1000, 'foo': 1e5 }

def complicated_function(field, x, y, flim, flam, foo):
    """Add ``field``, ``x``, ``y``, ``flim`` and ``flam`` and scale by ``foo``.

    Returns the NumPy sum of the first five arguments divided by ``foo``.
    """
    terms = (field, x, y, flim, flam)
    total = np.sum(terms)
    return total / foo

In [72]:
# door 1
# Mapping the bare function over `data` supplies only `field`; the other five
# required arguments are missing, so the call raises TypeError once the lazy
# map is consumed by list().
try:
    new_data = map(complicated_function, data) # doesn't work!
    list(new_data)
except TypeError:
    # Catch only the failure being demonstrated so unrelated errors still surface.
    print("Oops! That didn't work.")

Oops! That didn't work.


In [73]:
# door 2
# The lambda forwards each element as `field` and unpacks `h` to supply the
# remaining keyword arguments.
new_data = map(lambda a: complicated_function(a, **h), data) # ugly!
list(new_data)

[0.011117855565115006,
 0.011126473422785671,
 0.011090272343825841,
 0.011098345393296936,
 0.011126237298225599]

In [74]:
# door 3
# partial() pre-binds the keyword arguments in `h`, leaving a callable that
# only needs the positional `field` value.
does_the_right_thing = partial(complicated_function, **h)
new_data = map(does_the_right_thing, data) # works and it's readable!
list(new_data)

[0.011117855565115006,
 0.011126473422785671,
 0.011090272343825841,
 0.011098345393296936,
 0.011126237298225599]

# Operators all have functional equivalents

In [75]:
from operator import add, mul, truediv, floordiv
from functools import reduce

In [76]:
print(reduce(lambda a, b: a * b, [1, 2, 3]))

6


In [77]:
print(reduce(mul, [1, 2, 3]))

6


# (And a quick reminder)

## lambda \*args: *f* (\*args) <==> *f*

In [78]:
def f(x):
    """Return ``x`` raised to the second power."""
    squared = x ** 2
    return squared

print(list(map(lambda x: f(x), [1,2,3])))

[1, 4, 9]


In [79]:
print(list(map(f, [1,2,3])))

[1, 4, 9]


# Takeaways

### Lambdas should be readable
* if they aren't, what to use instead?

### Use partial to freeze parts of a function

### Use named functions (like operator.mul) whenever possible

# DIY NLP

In [80]:
import re
from collections import Counter

# Use a dedicated handle name: binding the file to `f` would overwrite the
# squaring function `f` defined earlier in the notebook.
with open('t8.shakespeare.txt') as corpus_file:
    shakes = corpus_file.read()
# A 200-character excerpt used as a small worked example in the cells below.
sample = shakes[10300:10500]

In [81]:
def normalize(txt):
    """Return a lower-cased copy of ``txt``."""
    lowered = txt.lower()
    return lowered
def tokenize(txt):
    """Surround every run of punctuation marks in ``txt`` with spaces.

    A run is one or more of the characters ; . ! ? , ' " : or the
    two-character sequence --. Everything else is left untouched, so a
    later ``str.split()`` yields words and punctuation runs as separate tokens.
    """
    punctuation_run = r"""(([;.!?,'":]|--)+)"""
    padded_run = r" \1 "
    return re.sub(punctuation_run, padded_run, txt)

In [82]:
print(sample)

  With means more blessed than my barren rhyme?
  Now stand you on the top of happy hours,
  And many maiden gardens yet unset,
  With virtuous wish would bear you living flowers,
  Much liker than yo


In [83]:
print(tokenize(sample))

  With means more blessed than my barren rhyme ? 
  Now stand you on the top of happy hours , 
  And many maiden gardens yet unset , 
  With virtuous wish would bear you living flowers , 
  Much liker than yo


In [84]:
print(normalize(tokenize(sample)))

  with means more blessed than my barren rhyme ? 
  now stand you on the top of happy hours , 
  and many maiden gardens yet unset , 
  with virtuous wish would bear you living flowers , 
  much liker than yo


In [85]:
print(normalize(tokenize(sample)).split()[:20])

['with', 'means', 'more', 'blessed', 'than', 'my', 'barren', 'rhyme', '?', 'now', 'stand', 'you', 'on', 'the', 'top', 'of', 'happy', 'hours', ',', 'and']


Tokenize everything:

In [86]:
# Pad punctuation with spaces, lower-case, then split on whitespace to get a
# flat list of tokens for the whole corpus.
all_tokens = normalize(tokenize(shakes)).split()

Fast vocabulary extraction:

In [87]:
# Rank tokens by frequency, skip the 50 most common ones, and keep the
# (up to) 5000 that follow as the vocabulary.
vocabulary = [
    token
    for token, _count in Counter(all_tokens).most_common(5050)[50:]
]
vocabulary[:10]

['our', 'lord', 'o', 'king', 'good', 'now', 'sir', 'from', 'come', 'at']

In [88]:
vocabulary[-10:]

['say-',
 'froward',
 'veil',
 'conspirators',
 'weaker',
 'preferr',
 'whispers',
 'its',
 'jerusalem',
 'dissemble']

In [89]:
# Read each play into memory as a single string. The handle is named
# `play_file` so that the function `f` defined earlier is not overwritten.
# NOTE(review): "richardiiii.txt" is spelled with four i's — confirm this
# matches the file name on disk.
with open("richardiiii.txt") as play_file:
    richard3 = play_file.read()
with open("hamlet.txt") as play_file:
    hamlet = play_file.read()
with open("asyoulikeit.txt") as play_file:
    asyoulikeit = play_file.read()

In [90]:
# Run each play through the same tokenize -> normalize -> split pipeline
# that was applied to the full corpus.
richard3_tokens = normalize(tokenize(richard3)).split()
hamlet_tokens = normalize(tokenize(hamlet)).split()
asyoulikeit_tokens = normalize(tokenize(asyoulikeit)).split()

In [91]:
def bow(tokens, vocabulary):
    """Build a bag-of-words count vector for ``tokens``.

    Parameters
    ----------
    tokens : iterable of hashable
        The tokens of one document.
    vocabulary : iterable of hashable
        The terms to count; it fixes the order and length of the result.

    Returns
    -------
    numpy.ndarray
        Integer array whose i-th entry is the number of times the i-th
        vocabulary term occurs in ``tokens`` (0 for terms that never occur).
    """
    # Counter consumes any iterable directly; no wrapping generator is needed.
    counts = Counter(tokens)
    # An explicit dtype keeps the result integral even for an empty
    # vocabulary, where NumPy would otherwise default to float64.
    return np.array([counts[term] for term in vocabulary], dtype=int)

richard3_bow = bow(richard3_tokens, vocabulary)
richard3_bow[:5]

array([119, 242,  98, 267, 119])

In [92]:
hamlet_bow = bow(hamlet_tokens, vocabulary)
asyoulikeit_bow = bow(asyoulikeit_tokens, vocabulary)

In [93]:
def norm(a):
    """Return the Euclidean (L2) length of ``a``."""
    sum_of_squares = np.sum(a ** 2)
    return np.sqrt(sum_of_squares)
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product
    of their lengths as given by ``norm``.
    """
    dot_product = a @ b
    scale = norm(a) * norm(b)
    return dot_product / scale

print("Hamlet vs. As You Like It: ", cosine_similarity(hamlet_bow, asyoulikeit_bow))
print("Hamlet vs. Richard III: ", cosine_similarity(hamlet_bow, richard3_bow))

Hamlet vs. As You Like It:  0.499966873206
Hamlet vs. Richard III:  0.623168364997


In [94]:
print("As You Like It vs. Richard III: ", cosine_similarity(asyoulikeit_bow, richard3_bow))

As You Like It vs. Richard III:  0.586101120429


# Mini-batching!



In [95]:
from itertools import islice

# Split the corpus into lines and shuffle them in place. The shuffle is
# unseeded, so the line order differs between runs.
shakespeare_lines = shakes.split('\n')
np.random.shuffle(shakespeare_lines)


How to get Shakespeare lines five at a time?

In [96]:
minibatch_size = 5

In [97]:
islice(shakespeare_lines, minibatch_size)

<itertools.islice at 0x10f048368>

In [98]:
list(islice(shakespeare_lines, minibatch_size))

['    Dare he presume to scorn us in this manner?',
 "    Were both extermin'd.",
 '  ANTONY. Fulvia is dead.',
 '     To match you where I hate; therefore beseech you',
 "  Come, go with us; we'll look to that anon."]

In [99]:
list(islice(shakespeare_lines, minibatch_size))

['    Dare he presume to scorn us in this manner?',
 "    Were both extermin'd.",
 '  ANTONY. Fulvia is dead.',
 '     To match you where I hate; therefore beseech you',
 "  Come, go with us; we'll look to that anon."]

Why do we always get the same five lines?

In [100]:
lines_iter = iter(shakespeare_lines)

### What is this doing?

In [101]:
list(islice(lines_iter, minibatch_size))

['    Dare he presume to scorn us in this manner?',
 "    Were both extermin'd.",
 '  ANTONY. Fulvia is dead.',
 '     To match you where I hate; therefore beseech you',
 "  Come, go with us; we'll look to that anon."]

In [102]:
list(islice(lines_iter, minibatch_size))

["    Thou gav'st me thine not to give back again.",
 "  POMPEY. We'll feast each other ere we part, and let's",
 "  CLOWN. I would you had been by the ship-side, to have help'd her;",
 '  TIMON. Mine honest friend,',
 '  The Ghost of Banquo and other Apparitions']

In [103]:
list(islice(lines_iter, minibatch_size))

['    The lustre in your eye, heaven in your cheek,',
 '    Will live as maids and widows. Come, away;',
 '    And his achievements of no less account.',
 "    Till that the conquering wine hath steep'd our sense",
 '    hanging of the thieves and so become a rare hangman.']

## So...
### If you have an iterator over individual examples, you have mini-batching in one line

### Even if they are out-of-core, infinite, or being hand-entered in real time by an army of rats...

# I Except

## Erring out your grievances

In [104]:
import sys
import importlib

In [105]:
import logging
# NOTE(review): the reload presumably resets logging state left over from
# earlier runs so that basicConfig() below takes effect — confirm.
importlib.reload(logging)
logging.basicConfig(level=logging.DEBUG)
# Emit the same text once through logging and once through print for comparison.
logging.debug("PROBLEM")
print("PROBLEM")

DEBUG:root:PROBLEM


PROBLEM


## Why not print?

In [106]:
logger = logging.getLogger("main_logger")
# Start from an empty handler list so re-running this cell does not stack up
# additional console handlers.
logger.handlers = []
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
console.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(console)

try:
    1/0
except ZeroDivisionError:
    # Catch only the expected error; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    logger.debug("Barf")

2016-10-14 18:30:58,852 - main_logger - DEBUG - Barf
DEBUG:main_logger:Barf


In [107]:
new_logger = logging.getLogger("main_logger.new_logger")

In [108]:
# Start the child logger with no handlers of its own and let its records
# propagate to its parent loggers.
new_logger.handlers = []
new_logger.propagate = True

In [109]:
print(logger.name)
print(new_logger.name)

main_logger
main_logger.new_logger


In [110]:
logger.getChild("new_logger") is new_logger

True

In [111]:
# A second console handler, attached to the child logger only: it wraps the
# message in *** and emits records at INFO level or above.
new_console = logging.StreamHandler()
new_console.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - *** %(message)s ***'))
new_console.setLevel(logging.INFO)
new_logger.addHandler(new_console)

In [112]:
new_logger.debug("Barf")
new_logger.info("Useful message")

2016-10-14 18:30:58,929 - main_logger.new_logger - DEBUG - Barf
DEBUG:main_logger.new_logger:Barf
2016-10-14 18:30:58,936 - main_logger.new_logger - INFO - *** Useful message ***
2016-10-14 18:30:58,936 - main_logger.new_logger - INFO - Useful message
INFO:main_logger.new_logger:Useful message


In [113]:
# With propagation switched off, records stop at the child logger's own
# handler (new_console, INFO and above), so the DEBUG message is not shown.
new_logger.propagate = False
new_logger.debug("Barf")
new_logger.info("Useful message")

2016-10-14 18:30:58,946 - main_logger.new_logger - INFO - *** Useful message ***
