In [9]:
import pandas as pd
import numpy as np
from itertools import product

from typing import List, Tuple, Union


In [2]:
# Small demo frame: two cats and two dogs with height and weight measurements.
animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'],
                         'height': [9.1, 6.0, 9.5, 34.0],
                         'weight': [7.9, 7.5, 9.9, 198.0]})

# Named aggregation: each keyword becomes an output column built from a
# (source column, aggregation) pair.  The aggregation is given by name
# ('mean') rather than as the np.mean callable, which recent pandas versions
# warn about when it is passed to groupby().agg().
animals.groupby("kind").agg(
        min_height=('height', 'min'),
        max_height=('height', 'max'),
        average_weight=('weight', 'mean'),
    )

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [5]:
import random

# A dedicated, seeded generator keeps the sample data identical on every
# Restart & Run All, without touching the global `random` module state.
rng = random.Random(42)

# Toy data: two randomly chosen categorical columns around a simple counter.
dct = {
    "A": [rng.choice(["animal", "vegetable", "mineral"]) for _ in range(10)],
    "B": list(range(10)),
    "C": [rng.choice(["alice", "bob", "charlie"]) for _ in range(10)],
}

In [9]:
# Hand-written sample frame with three columns; "B" carries exactly one
# missing value (fifth row) so the aggregation below has a NaN to deal with.
df = pd.DataFrame(
    dict(
        A=["animal", "vegetable", "vegetable", "vegetable", "vegetable",
           "mineral", "vegetable", "animal", "animal", "animal"],
        B=[0, 1, 2, 3, np.nan, 5, 6, 7, 8, 9],
        C=["charlie", "bob", "alice", "alice", "charlie",
           "alice", "bob", "charlie", "bob", "bob"],
    )
)

In [10]:
# Mean of "B" within each "A" group.  The NaN in "B" is skipped rather than
# propagated: "vegetable" averages its four remaining values (1, 2, 3, 6).
df.groupby("A")["B"].mean()

A
animal       6.0
mineral      5.0
vegetable    3.0
Name: B, dtype: float64

In [6]:
# Display the generated dictionary as the cell output.
dct

{'A': ['animal',
  'vegetable',
  'vegetable',
  'vegetable',
  'vegetable',
  'mineral',
  'vegetable',
  'animal',
  'animal',
  'animal'],
 'B': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 'C': ['charlie',
  'bob',
  'alice',
  'alice',
  'charlie',
  'alice',
  'bob',
  'charlie',
  'bob',
  'bob']}

# Functional Programming for Data Scientists

## Why FP?

### Managing State

Chances are, you spend time in Notebooks

One of the biggest drawbacks of notebooks is managing hidden state

Functional Programming habits make it a lot easier to manage state!

Hrm, this was working last night! How come it's not working now that I've restarted the notebook?

In [11]:
# Intentional failure: `y` is not assigned until a later cell, so on a fresh
# kernel this cell raises NameError -- the hidden-state problem in miniature.
x = 5 
x + y

NameError: name 'y' is not defined

In [12]:
# `y` is only defined here, after the cell above that already tries to use it.
y = 2

### Lower Cognitive Load

>"There are only two hard things in Computer Science: cache invalidation and naming things." 
-- Phil Karlton

Anonymous Functions

Piping

### Serialization

Easier parallelization

Easier Caching

## How to incorporate FP

### Method Chaining

Stolen from this fantastic article: https://tomaugspurger.github.io/method-chaining.html

In [None]:
# Nested-call style: every step needs its own intermediate name, and each
# call consumes the name produced on the previous line.
on_hill = went_up(jack_jill, 'hill')
with_water = fetch(on_hill, 'water')
fallen = fell_down(with_water, 'jack')
broken = broke(fallen, 'jack')
after = tumble_after(broken, 'jill')

vs

In [None]:
# Method-chaining style: each method is invoked on the result of the previous
# call, so the steps read top to bottom with no intermediate names.
jack_jill = JackAndJill()
(jack_jill.went_up('hill')
    .fetch('water')
    .fell_down('jack')
    .broke('crown')
    .tumble_after('jill')
)

### toolz
Based on the Clojure library

Composing functions

Declarative operations on sequences & dictionaries

In [None]:
def format_new_rows(new_rows: List[gspread.models.Cell]) -> List[List[str]]:
    """Turn a flat sequence of cells into a list of non-empty rows.

    The pipeline reads ``.value`` from every cell, groups the values into
    chunks of up to 12, discards chunks in which every entry is the empty
    string, and returns the surviving chunks as lists.

    Args:
        new_rows: Flat sequence of cell objects exposing a ``value`` attribute.

    Returns:
        A list of rows, each row being a list of cell values.
    """

    def cell_value(cell):
        return cell.value

    def has_content(row):
        # Keep the row as soon as one entry is not the empty string.
        return any(entry != '' for entry in row)

    # thread_last feeds the running result in as the LAST argument of each
    # step, which is where map/filter/partition_all expect their iterable.
    return tz.functoolz.thread_last(
        new_rows,
        (map, cell_value),
        (tz.itertoolz.partition_all, 12),
        (filter, has_content),
        (map, list),
        list,
    )

### Building an interface for yourself

In [3]:
# Sample data: one row per animal, with its kind and two measurements.
animals = pd.DataFrame(
    dict(
        kind=['cat', 'dog', 'cat', 'dog'],
        height=[9.1, 6.0, 9.5, 34.0],
        weight=[7.9, 7.5, 9.9, 198.0],
    )
)
                         

In [7]:
# Named aggregation: keyword = (source column, aggregation).  The mean is
# requested by name ('mean') instead of passing the np.mean callable, which
# recent pandas versions warn about inside groupby().agg().
animals.groupby("kind").agg(
        min_height=('height', 'min'),
        max_height=('height', 'max'),
        average_weight=('weight', 'mean'),
    )

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [None]:
# Sample data: one row per animal, with its kind and two measurements.
animals = pd.DataFrame(
    dict(
        kind=['cat', 'dog', 'cat', 'dog'],
        height=[9.1, 6.0, 9.5, 34.0],
        weight=[7.9, 7.5, 9.9, 198.0],
    )
)
                         

In [6]:
def add_multiple_aggs(
    df: pd.DataFrame, groupbys: Union[str, List[str]], aggs: List[Tuple[str, str]]
) -> pd.DataFrame:
    """Group ``df`` and apply several named aggregations in one call.

    Args:
        df: Frame to aggregate.
        groupbys: Column name, or list of column names, to group by.
        aggs: ``(column, aggregation)`` pairs, e.g. ``("height", "min")``.

    Returns:
        One row per group and one column per pair, each column named by
        joining the pair with an underscore (``"height_min"``).
    """
    grouped = df.groupby(groupbys)

    # Build the keyword arguments for named aggregation: the pair itself is
    # the spec, and its underscore-joined form is the output column name.
    named_aggs = {}
    for spec in aggs:
        named_aggs["_".join(spec)] = spec

    return grouped.agg(**named_aggs)

# Aggregations expressed as plain (column, function) pairs; the resulting
# columns are named "<column>_<function>".
animals.pipe(
    add_multiple_aggs,
    groupbys="kind",
    aggs=[("height", "min"), ("height", "max"), ("weight", "mean")],
)

Unnamed: 0_level_0,height_min,height_max,weight_mean
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [10]:
# The Cartesian product yields every (column, statistic) pair:
# 2 columns x 2 statistics = 4 aggregated output columns.
animals.pipe(
    add_multiple_aggs,
    groupbys="kind",
    aggs=list(product(["height", "weight"], ["mean", "std"])),
)

Unnamed: 0_level_0,height_mean,height_std,weight_mean,weight_std
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cat,9.3,0.282843,8.9,1.414214
dog,20.0,19.79899,102.75,134.703842


# Summary

## Why?

Managing state in Notebooks

Serialization

Cognitive Load

## How?

Method Chaining

`toolz`

`itertools`

## Thanks!

Blog: https://hackersandslackers.com/

Twitter: @MattAlhonte

Email: mattalhonte@gmail.com