In [1]:
import pprint
from functools import partial
import numpy as np

# Functional programming with fewer lambdas

# `map`

In [2]:
a = range(10)
b = list(map(np.exp, a))
b

[1.0,
 2.7182818284590451,
 7.3890560989306504,
 20.085536923187668,
 54.598150033144236,
 148.4131591025766,
 403.42879349273511,
 1096.6331584284585,
 2980.9579870417283,
 8103.0839275753842]

Say we have some fictitious data and a complicated function to transform each one of the observations, which takes a lot of parameters. Since we can apply this function to each of the observations independently, we can **map** the function to the data, potentially in a parallel fashion.

Map operations can be done with `map` in Python, though a list comprehension or generator expression is usually the more idiomatic choice.

In [3]:
data = np.random.randn(5)

def complicated_function(field, x, y, flim, flam, foo):
    """Combine one observation with five settings into a single scaled value.

    ``field``, ``x``, ``y``, ``flim`` and ``flam`` are added together with
    ``np.sum`` and the total is divided by ``foo``.
    """
    terms = (field, x, y, flim, flam)
    total = np.sum(terms)
    return total / foo

Naively `map`ping the function to the data doesn't work because it has the wrong number of parameters (i.e., it has the wrong arity).

In [4]:
try:
    new_data = map(complicated_function, data) # doesn't work!
    list(new_data)
except TypeError as e:
    print("Oops! That didn't work:")
    print(e.args[0])

Oops! That didn't work:
complicated_function() missing 5 required positional arguments: 'x', 'y', 'flim', 'flam', and 'foo'


Creating an anonymous function (a "lambda") to encapsulate the call is a popular option, though many believe it comes with significant readability problems.

#### Ugly

In [5]:
settings = { 'x': 1, 'y': 10, 'flim': 100, 'flam': 1000, 'foo': 1e5 }

In [6]:
new_data = map(lambda a: complicated_function(a, **settings), data) 
list(new_data)

[0.011110338835955892,
 0.011110420525203427,
 0.011101023068607803,
 0.011111630074300927,
 0.011108426697075145]

#### Even uglier

In [7]:
new_data = map(lambda a: 
     complicated_function(a, x=1, y=10, flim=100, flam=1000, foo=1e5), data)
list(new_data)

[0.011110338835955892,
 0.011110420525203427,
 0.011101023068607803,
 0.011111630074300927,
 0.011108426697075145]

# A better way

If you need to "freeze" certain settings in place, `functools.partial` creates a new callable, here with only one argument:

In [8]:
# door 3
simple_function = partial(complicated_function, **settings)
new_data = map(simple_function, data) # works and it's readable!
list(new_data)

[0.011110338835955892,
 0.011110420525203427,
 0.011101023068607803,
 0.011111630074300927,
 0.011108426697075145]

# Operators all have named functional equivalents

Some of the most common functions you need to pass to distributed systems are binary arithmetic operations. The `operator` library has a number of named functions for just this purpose.

In [9]:
from operator import add, mul, truediv, floordiv
from functools import reduce

In [10]:
print(reduce(lambda a, b: a * b, [1, 2, 3]))

6


In [11]:
print(reduce(mul, [1, 2, 3]))

6


# (And a quick reminder)

## lambda \*args: *f* (\*args) <==> *f*

In [12]:
def f(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)

list(map(lambda x: f(x), [2,3,4]))

[4, 9, 16]

In [13]:
list(map(f, [2,3,4]))

[4, 9, 16]

# DIY natural language processing

In [14]:
# Read the complete works into memory. The file handle gets a descriptive
# name so that it does not rebind `f`, the squaring function defined in an
# earlier cell (which would break re-running the cells that call it).
with open('t8.shakespeare.txt') as shakespeare_file:
    shakes = shakespeare_file.read()
# 200-character excerpt used by the tokenization examples.
sample = shakes[10300:10500]

In [15]:
import re

def normalize(txt):
    """Return ``txt`` converted to lower case."""
    lowered = txt.lower()
    return lowered
def tokenize(txt):
    """Surround every run of punctuation in ``txt`` with spaces.

    A run is one or more consecutive occurrences of any of the characters
    ``; . ! ? , ' : "`` or of a double hyphen ``--``. Each run is replaced
    by itself with one space added on either side, so that a later
    ``str.split()`` yields the punctuation as separate tokens. The rest of
    the text is left untouched.
    """
    punctuation_run = re.compile(r"""(([;.!?,'":]|--)+)""")
    return punctuation_run.sub(r" \1 ", txt)

# Tokenization

In [16]:
print(sample)

  With means more blessed than my barren rhyme?
  Now stand you on the top of happy hours,
  And many maiden gardens yet unset,
  With virtuous wish would bear you living flowers,
  Much liker than yo


In [17]:
print(tokenize(sample))

  With means more blessed than my barren rhyme ? 
  Now stand you on the top of happy hours , 
  And many maiden gardens yet unset , 
  With virtuous wish would bear you living flowers , 
  Much liker than yo


In [18]:
print(normalize(tokenize(sample)))

  with means more blessed than my barren rhyme ? 
  now stand you on the top of happy hours , 
  and many maiden gardens yet unset , 
  with virtuous wish would bear you living flowers , 
  much liker than yo


In [19]:
print(normalize(tokenize(sample)).split()[:20])

['with', 'means', 'more', 'blessed', 'than', 'my', 'barren', 'rhyme', '?', 'now', 'stand', 'you', 'on', 'the', 'top', 'of', 'happy', 'hours', ',', 'and']


Tokenize everything:

In [20]:
all_tokens = normalize(tokenize(shakes)).split()

# `collections.Counter`
Fast vocabulary extraction:

In [21]:
from collections import Counter

# Rank tokens by frequency, take the 5050 most frequent, then discard the
# first 50 of those; the remaining words (ranks 51-5050) form the vocabulary.
vocabulary = [
    word
    for word, _count in Counter(all_tokens).most_common(5050)[50:]
]
print("Most common words:")
vocabulary[:10]

Most common words:


['our', 'lord', 'o', 'king', 'good', 'now', 'sir', 'from', 'come', 'at']

In [22]:
print("Least common:")
vocabulary[-10:]

Least common:


['son-',
 'favor',
 'proudly',
 'glories',
 'birnam',
 'folks',
 'britons',
 'inches',
 'pupil',
 'ending']

In [23]:
# Load three individual plays. Descriptive handle names keep these reads from
# rebinding `f`, the squaring function defined in an earlier cell.
# NOTE(review): "richardiiii.txt" contains four i's; confirm that it matches
# the name of the file on disk.
with open("richardiiii.txt") as richard3_file:
    richard3 = richard3_file.read()
with open("hamlet.txt") as hamlet_file:
    hamlet = hamlet_file.read()
with open("asyoulikeit.txt") as asyoulikeit_file:
    asyoulikeit = asyoulikeit_file.read()

In [24]:
richard3_tokens = normalize(tokenize(richard3)).split()
hamlet_tokens = normalize(tokenize(hamlet)).split()
asyoulikeit_tokens = normalize(tokenize(asyoulikeit)).split()

# Bags of words

In [63]:
def bow(tokens, vocabulary):
    """Build a bag-of-words count vector for ``tokens``.

    Parameters
    ----------
    tokens : iterable of hashable
        The tokens of one document.
    vocabulary : iterable of hashable
        The words to count, in the order their counts should appear.

    Returns
    -------
    numpy.ndarray
        One entry per vocabulary word holding how many times that word
        occurs in ``tokens``; words that never occur get 0.
    """
    # Counter consumes any iterable directly, so no wrapping generator is
    # needed. Missing keys read back as 0 instead of raising KeyError.
    counts = Counter(tokens)
    return np.array([counts[word] for word in vocabulary])

richard3_bow = bow(richard3_tokens, vocabulary)
richard3_bow[:5]

array([119, 242,  98, 267, 119])

In [64]:
hamlet_bow = bow(hamlet_tokens, vocabulary)
asyoulikeit_bow = bow(asyoulikeit_tokens, vocabulary)

In [27]:
def norm(a):
    """Return the square root of the sum of the squared entries of ``a``."""
    sum_of_squares = np.sum(a ** 2)
    return np.sqrt(sum_of_squares)


def cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    Computed as the dot product ``a @ b`` divided by the product of the
    two vectors' norms.
    """
    dot_product = a @ b
    length_product = norm(a) * norm(b)
    return dot_product / length_product

print("Hamlet vs. As You Like It: ", cosine_similarity(hamlet_bow, asyoulikeit_bow))
print("Hamlet vs. Richard III: ", cosine_similarity(hamlet_bow, richard3_bow))

Hamlet vs. As You Like It:  0.499961805901
Hamlet vs. Richard III:  0.623157590376


In [28]:
print("As You Like It vs. Richard III: ", cosine_similarity(asyoulikeit_bow, richard3_bow))

As You Like It vs. Richard III:  0.586104685068


In [29]:
# Binary BOW digression
# Reduce each count vector to presence/absence (1 if the word occurs at all,
# otherwise 0). The results are bound to new names so the count-based vectors
# stay intact; overwriting them would make the count-based similarity cell
# report binary results if it were re-run after this one.
hamlet_binary_bow = (hamlet_bow != 0).astype(int)
asyoulikeit_binary_bow = (asyoulikeit_bow != 0).astype(int)
richard3_binary_bow = (richard3_bow != 0).astype(int)
print("Hamlet vs. As You Like It: ", cosine_similarity(hamlet_binary_bow, asyoulikeit_binary_bow))
print("Hamlet vs. Richard III: ", cosine_similarity(hamlet_binary_bow, richard3_binary_bow))
print("As You Like It vs. Richard III: ", cosine_similarity(asyoulikeit_binary_bow, richard3_binary_bow))

Hamlet vs. As You Like It:  0.642760985573
Hamlet vs. Richard III:  0.656169857146
As You Like It vs. Richard III:  0.632834763221


# Persistence

## `pickle`... right?

In [30]:
import shelve

In [31]:
# Store the raw text of each play under a human-readable key. The handle is
# named `shelf` because it is a dict-like shelf rather than a file, and so
# that it does not rebind the name `f` used for a function earlier on.
with shelve.open("shakespeare") as shelf:
    shelf['richard iii'] = richard3
    shelf['hamlet'] = hamlet
    shelf['as you like it'] = asyoulikeit

In [32]:
# Re-open the shelf read-only ("r") and print a 100-character slice of the
# stored Hamlet text. The handle is named `shelf` (it is dict-like, not a
# file) rather than reusing the name `f`.
with shelve.open("shakespeare", "r") as shelf:
    print(shelf['hamlet'][5100:5200])

  Dar'd to the combat; in which our valiant Hamlet
    (For so this side of our known world esteem'd


### cf. `h5py` for storing data, `redis-py` etc. for other key-value store interfaces

# Mini-batching


Another common task is taking a large number of observations and parceling them out one fixed-size batch at a time, for example in order to maximally exploit limited GPU memory.

# `itertools.islice`

In [65]:
shakespeare_lines = shakes.split('\n')
np.random.shuffle(shakespeare_lines)
minibatch_size = 5

In [66]:
from itertools import islice
islice(shakespeare_lines, minibatch_size)

<itertools.islice at 0x10eb9eb88>

In [35]:
list(islice(shakespeare_lines, minibatch_size))

['    Yoke-fellow to his honour-owing wounds,',
 '    of hair; they call themselves Saltiers, and they have dance which',
 "    I do retort the 'solus' in thy bowels;",
 "    Have left me issueless; and your father's blest,",
 '']

In [36]:
list(islice(shakespeare_lines, minibatch_size))

['    Yoke-fellow to his honour-owing wounds,',
 '    of hair; they call themselves Saltiers, and they have dance which',
 "    I do retort the 'solus' in thy bowels;",
 "    Have left me issueless; and your father's blest,",
 '']

#### Pop quiz!

In [37]:
lines_iter = iter(shakespeare_lines)

### What is this doing?

In [38]:
list(islice(lines_iter, minibatch_size))

['    Yoke-fellow to his honour-owing wounds,',
 '    of hair; they call themselves Saltiers, and they have dance which',
 "    I do retort the 'solus' in thy bowels;",
 "    Have left me issueless; and your father's blest,",
 '']

In [39]:
list(islice(lines_iter, minibatch_size))

['  PETRUCHIO. Nay, I will win my wager better yet,',
 '',
 '    That whiles Verona by that name is known,',
 '    Farewell, my blood; which if to-day thou shed,',
 '  IAGO.                Should you do so, my lord,']

In [40]:
list(islice(lines_iter, minibatch_size))

['  IMOGEN. But that you shall not say I yield, being silent,',
 "    Take each man's censure, but reserve thy judgment.",
 '    I will not taint my mouth with.',
 '  Bene. Gallants, I am not as I have been.',
 '  Then gentle cheater urge not my amiss,']

## So...
### If you have an iterator over individual examples, you have basic mini-batching in one line

### Even if they are out-of-core, infinite, being hand-entered in real time by an army of rats... 

In [41]:
class MinibatchIterator:
    """Iterate over any iterable in consecutive batches of ``batch_size`` items.

    Parameters
    ----------
    x : iterable
        Source of individual examples. It is wrapped with ``iter`` once and
        consumed lazily, one batch at a time.
    batch_size : int
        Maximum number of items per batch. The final batch may be shorter.
        A ``batch_size`` of 0 ends the iteration immediately.

    Each step yields a ``list`` holding the next batch. Iteration stops once
    the source is exhausted.
    """

    def __init__(self, x, batch_size):
        self.x = iter(x)
        self.batch_size = batch_size

    def __iter__(self):
        return self

    def __next__(self):
        # Materialise the batch right away: a lazy islice would share the
        # underlying iterator with every later batch, so batches could
        # interleave if they were not consumed strictly in order.
        batch = list(islice(self.x, self.batch_size))
        # An empty batch means the source is exhausted. Without this check
        # the iterator would never terminate and would keep producing empty
        # batches forever.
        if not batch:
            raise StopIteration
        return batch

# When something doesn't work

## (Or, Erring out your grievances)

In [None]:
import sys
import importlib
import logging
importlib.reload(logging)
logging.basicConfig(level=logging.DEBUG, format='(Root Logger) %(levelname)s - %(message)s')

In [69]:
logging.debug("PROBLEM")
print("PROBLEM")

PROBLEM


## Why not print?

A logger's **level** affects which messages it handles:

In [68]:
import sys
# For each threshold from DEBUG up to CRITICAL: announce the numeric level on
# stderr, apply it to the root logger, then emit one message per severity.
for level in (logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL):
    print("Logging level: {}".format(level), file=sys.stderr)
    # Set level of root logger
    logging.getLogger().setLevel(level)
    for _emit, _text in zip(
        (logging.debug, logging.info, logging.warning, logging.error, logging.critical),
        ("1", "2", "3", "4", "5"),
    ):
        _emit(_text)
    # Blank line on stderr to separate the output of consecutive levels.
    print("", file=sys.stderr)

Logging level: 10
(Root Logger) DEBUG - 1
(Root Logger) INFO - 2
(Root Logger) WARNING - 3
(Root Logger) ERROR - 4
(Root Logger) CRITICAL - 5

Logging level: 20
(Root Logger) INFO - 2
(Root Logger) WARNING - 3
(Root Logger) ERROR - 4
(Root Logger) CRITICAL - 5

Logging level: 30
(Root Logger) WARNING - 3
(Root Logger) ERROR - 4
(Root Logger) CRITICAL - 5

Logging level: 40
(Root Logger) ERROR - 4
(Root Logger) CRITICAL - 5

Logging level: 50
(Root Logger) CRITICAL - 5



In [45]:
# Set level of root logger back to 10
logging.getLogger().setLevel(logging.DEBUG)

# Handlers and Formatters

In [46]:
# Build a console handler that accepts records from DEBUG upward and formats
# them with timestamp, logger name and level, then make it the only handler
# attached to the logger named "main_logger".
console = logging.StreamHandler()
console.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
console.setLevel(logging.DEBUG)
logger = logging.getLogger("main_logger")
logger.handlers = []  # start from an empty handler list (e.g. when the cell is re-run)
logger.addHandler(console)

In [47]:
try:
    1/0
except ZeroDivisionError:
    # Catch only the error this demo provokes. A bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    logger.debug("Barf")

2016-10-17 17:12:15,425 - main_logger - DEBUG - Barf
(Root Logger) DEBUG - Barf


### Levels belong to Handlers as well as Loggers

# Ancestors and Children

In [48]:
new_logger = logging.getLogger("main_logger.new_logger")

In [49]:
new_logger.handlers = []
new_logger.propagate = True

In [50]:
print(logger.name)
print(new_logger.name)

main_logger
main_logger.new_logger


In [51]:
logger.getChild("new_logger") is new_logger

True

# Propagation

In [52]:
# Give the child logger its own console handler. This handler only accepts
# records at INFO or above and wraps the message text in asterisks.
new_console = logging.StreamHandler()
new_console.setLevel(logging.INFO)
new_console.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - *** %(message)s ***'))
new_logger.addHandler(new_console)

In [53]:
new_logger.debug("Barf")

2016-10-17 17:12:15,475 - main_logger.new_logger - DEBUG - Barf
(Root Logger) DEBUG - Barf


In [54]:
new_logger.info("Useful message")

2016-10-17 17:12:15,495 - main_logger.new_logger - INFO - *** Useful message ***
2016-10-17 17:12:15,495 - main_logger.new_logger - INFO - Useful message
(Root Logger) INFO - Useful message


In [55]:
new_logger.propagate = False

In [56]:
new_logger.debug("Barf")

In [57]:
new_logger.info("Useful message")

2016-10-17 17:12:15,514 - main_logger.new_logger - INFO - *** Useful message ***


# Other handlers

In [58]:
logging.getLogger("file_logger").handlers=[]

In [59]:
formatter = logging.Formatter('%(asctime)s / %(name)s / %(levelname)s:  %(message)s')
file_handler = logging.FileHandler("log.txt", "w")
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)
logger = logging.getLogger("file_logger")
logger.addHandler(file_handler)

In [60]:
# Emit one record at each of three severities through the file logger.
logger.debug("This won't show up")
logger.info("FYI")
logger.warning("This is a warning")  # message text matches the WARNING severity it is logged at

(Root Logger) DEBUG - This won't show up
(Root Logger) INFO - FYI


In [61]:
with open("log.txt") as log:
    print(log.read())

2016-10-17 17:12:15,550 / file_logger / INFO:  FYI



## Logstash handlers

###  3rd-party library:  
```pip install python-logstash```

### Python:  
```python
import logstash
logger.addHandler(logstash.LogstashHandler(host, port, version=1))
```

In [62]:
import importlib
import pdb
import raise_error
# Reload the module so that edits made to it since the first import take effect.
importlib.reload(raise_error)
# NOTE(review): this call raises (see the recorded "Exception: Error condition!"
# output). Presumably that is intentional, to set up a debugging demonstration
# with pdb; confirm, since `pdb` is imported but not used in this cell.
raise_error.a()

Exception: Error condition!