# Functional Programming for Data Science
---
PyData Dublin, 30$^\mathsf{th}$ August 2018

Neal Ó Riain

# ``` $ whoami```
---

<img src="imgs/me.jpg" width="35%" align="right"> 
 
 * Former Astrophysicist (🔭, 🚀, 🌝)
 
 
<br> 
 
 
 * Current Data Scientist at Amazon.
 
 <br> 
 
 
 * ((Semi-) Pragmatic) Functional Programmer.

# Outline
---

* What is FP and why would I use it?
 
 
<br> 
 
* Some FP primitives in Python

<br> 
 
* Example



<center>
<H1> What is Functional Programming? </H1>
</center>

<center>
<H1> What is a <em>Function</em>? </H1>
</center>

```C
#include <stdio.h>

main()
{
        printf("hello, world\n");
}
```

<center>
<H1>Structured Programming</H1>
</center>

# Structured Data

![combine](imgs/data-structures.png)

# Structured Code

```C
static void release_callchain_buffers_rcu(struct rcu_head *head)
{
	struct callchain_cpus_entries *entries;
	int cpu;

	entries = container_of(head, struct callchain_cpus_entries, rcu_head);

	for_each_possible_cpu(cpu)
		kfree(entries->cpu_entries[cpu]);

	kfree(entries);
}

static void release_callchain_buffers(void)
{
	struct callchain_cpus_entries *entries;

	entries = callchain_cpus_entries;
	RCU_INIT_POINTER(callchain_cpus_entries, NULL);
	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}

```

<center>
<H1>Modularity!</H1>
</center>

<center>
Re-usable
</center>

<center>
Easier to code
</center>

<center>
Debug-able
</center>

<center>
<H1> What is <u>Functional</u> Programming? </H1>
</center>

<img src="imgs/glue.jpg" width="100%">

# Modularity
---

* Purity

<br>

* Laziness

<br>

* Higher Order Functions


<center>
<h2>FP Ideas in Python</h2>
</center>

In [5]:
def cumsum(lst):
    """Return the running totals of ``lst`` as a new list.

    Element ``k`` of the result is the ``+``-combination of the first
    ``k + 1`` elements of ``lst``, so any element type supporting ``+``
    works (numbers, strings, ...).

    Args:
        lst: Iterable of values to accumulate.

    Returns:
        A list with one running total per input element; an empty input
        gives an empty list.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import accumulate

    # accumulate combines elements with ``a + b`` (not ``+=``), so mutable
    # elements such as lists are neither modified in place nor aliased across
    # the output, and an empty input simply produces no items.
    return list(accumulate(lst))
    
cumsum([1, 2, 3, 4])

[1, 3, 6, 10]

In [2]:
cumsum(['1', '2', '3', '4'])

['1', '12', '123', '1234']

In [3]:
from typing import List

# Alias used to annotate the integer sequences accepted by ``cumsum``.
Vector = List[int]


def cumsum(lst: Vector) -> Vector:
    """Return the running totals of ``lst`` as a new list.

    The annotations document the intended element type; they are not
    enforced at runtime.

    Args:
        lst: Values to accumulate.

    Returns:
        A list with one running total per input element; an empty input
        gives an empty list.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import accumulate

    # accumulate combines elements with ``a + b`` (not ``+=``), so nothing in
    # the input is modified in place, and an empty input produces no items
    # instead of failing on ``lst[0]``.
    return list(accumulate(lst))

<center>
    <H1> Higher Order Functions</H1>
</center>

In [7]:
names = ['alice', 'bob', 'eve'] 

capitalised = []
for name in names:
    capitalised.append(str.capitalize(name))

print(capitalised)

['Alice', 'Bob', 'Eve']


```python
data = [values] 

output = []
for value in data:
        output.append(function(value))
```

<center>
<pre>loop_and_append(function, data)</pre>
</center>

<center>
$g(f,\;[x_1, \dots, x_n]) \rightarrow [f(x_1), \dots, f(x_n)]$
</center>

```python
map(str.capitalize, ['alice', 'bob', 'eve'])
```

# Reductions


<center>
$g(f,\; [x_1, x_2, x_3],\;i) \rightarrow f(f(f(i,\;x_1),\;x_2),\;x_3)$
</center>

In [10]:
from functools import reduce
from operator import add

reduce(add, [1, 2, 3, 4], 0) #sum

10

In [25]:
from functools import reduce
from operator import mul

reduce(mul, [1, 2, 3, 4], 1) #factorial

24

In [26]:
from functools import reduce
from operator import mul, add

reduce(add, map(mul, [1, 2, 3, 4], [2, 3, 4, 5])) #dot product

40

# Filtering

```python
data = [values] 

output = []
for value in data:
    if predicate(value):
        output.append(value)
```

In [29]:

list(filter(lambda x: x > 10, [2, 57, 41, 5, 92, 84, 2.3]))

[57, 41, 92, 84]

# Currying

<center>
$f(x, y, z) \rightarrow f(x)(y)(z)$
</center>

In [30]:
from toolz import curry

def add_and_scale(x, y, z):
    """Add ``x`` and ``y``, then multiply that sum by ``z``."""
    total = x + y
    return total * z

add_and_scale = curry(add_and_scale)

add_10yz = add_and_scale(10)

add_10_20_z = add_10yz(20)

print(add_10_20_z(2))

60


<center>
<h2>Laziness</h2>
</center>

```python
for char in 'python':
    
for value in [1, 2, 3, 4]:

for key in {'A': 1, 'B': 2}:
```


In [44]:
def numbers():
    """Generate 0, 1, 2, ... without end, one value per ``next`` call."""
    current = 0
    while True:
        # Execution pauses here until the consumer asks for another value.
        yield current
        current = current + 1
        
n = numbers()

print(next(n))
print(next(n))
print(next(n))
print(next(n))

0
1
2
3


In [54]:
def newton(n, a=2, steps=10):
    """Approximate the square root of ``n`` with repeated Newton updates.

    Args:
        n: Number whose square root is sought.
        a: Starting estimate.
        steps: How many times the update is applied.

    Returns:
        The estimate after ``steps`` updates (``a`` itself when ``steps`` is 0).
    """
    estimate = a
    for _ in range(steps):
        # Average the current estimate with n divided by it.
        estimate = (estimate + n/estimate)/2
    return estimate

newton(10)

3.162277660168379

In [55]:
from toolz import curry, nth, iterate

def newton_f(n, guess=2, step=10):
    """Approximate the square root of ``n`` by indexing into a lazy sequence.

    Args:
        n: Number whose square root is sought.
        guess: Starting estimate (element 0 of the sequence).
        step: Index of the estimate to return.

    Returns:
        Element ``step`` of the sequence produced by repeatedly applying the
        update to ``guess``.
    """
    def refine(target, estimate):
        # One update: average the estimate with target / estimate.
        return (estimate + target/estimate)/2

    # Fix the first argument so the update becomes a one-argument function.
    refine_for_n = curry(refine, n)
    estimates = iterate(refine_for_n, guess)
    return nth(step, estimates)

newton_f(10)

3.162277660168379

<center>
<H1>Example</H1>
</center>

In [59]:
!ls lyrics/billboard/

03_bonnie__clyde.txt
0_to_100__the_catch_up.txt
100_pure_love.txt
100_years.txt
123.txt
18_and_life.txt
1979.txt
1999.txt
19_somethin.txt
19th_nervous_breakdown.txt
1_2_3_4_sumpin_new.txt
1_2_3_red_light.txt
1_2_step.txt
1_thing.txt
1st_of_tha_month.txt
21_guns.txt
21_questions.txt
22.txt
23.txt
247.txt
25_or_6_to_4.txt
2_become_1.txt
2_legit_2_quit.txt
2_on.txt
2_step.txt
3.txt
3_am_eternal.txt
4_minutes.txt
4_seasons_of_loneliness.txt
50_ways_to_leave_your_lover.txt
50_ways_to_say_goodbye.txt
5_oclock.txt
6345789_soulsville_usa.txt
65_love_affair.txt
679.txt
6_foot_7_foot.txt
7.txt
711.txt
7_days.txt
7_things.txt
808.txt
8675309jenny.txt
96_tears.txt
986.txt
99_luftballons.txt
9_to_5.txt
TRUE.txt
a_bay_bay.txt
a_beautiful_morning.txt
a_boy_named_sue.txt
a_cowboys_work_is_never_done.txt
a_different_corner.txt
a_fifth_of_beethoven.txt
a_groovy_kind_of_love.txt
a_hazy_shade_of_winter.txt
a_horse_with_no_name.txt
a_lesson_in_leavin.

In [60]:
from toolz.curried import *
from toolz.dicttoolz import merge_with
from string import punctuation
from collections import defaultdict
from glob import glob

# Translation table that deletes every ASCII punctuation character.
punc = str.maketrans('', '', punctuation)

In [67]:
def wordcount_imp(directory):
    """Count word occurrences across every file matching a glob pattern.

    Each line is split on whitespace; every token is lower-cased and passed
    through the module-level ``punc`` translation table before being counted.

    Args:
        directory: Glob pattern handed to ``glob`` to select the files.

    Returns:
        A dict mapping each normalised word of at least four characters to
        the number of times it occurred.
    """
    counts = defaultdict(int)
    for path in glob(directory):
        # The context manager closes each file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open(path, 'r') as handle:
            for line in handle:
                for token in line.split():
                    counts[token.lower().translate(punc)] += 1

    # Short words are counted above but left out of the result.
    return {word: n for word, n in counts.items() if len(word) >= 4}
    
words = wordcount_imp('lyrics/billboard/*')
print(words)



In [72]:
def stem(word: str) -> str:
    """Lower-case ``word`` and pass it through the ``punc`` translation table."""
    lowered = word.lower()
    return lowered.translate(punc)

def drop_word(word: str) -> bool:
    """Return True when ``word`` has at least four characters.

    Note: despite the name, a True result marks a word that passes the
    length test; shorter words give False.
    """
    return not len(word) < 4
    
# Word-count pipeline, listed in the order the data flows through it:
#   glob                  - expand the pattern into matching file paths
#   mapcat(open)          - open every path and chain the lines together
#                           (the file objects are never closed explicitly)
#   mapcat(str.split)     - split every line on whitespace and chain the words
#   map(stem)             - normalise each word with stem()
#   frequencies           - count occurrences of each normalised word
#   keyfilter(drop_word)  - keep only entries whose key satisfies drop_word()
# map/mapcat/keyfilter are the curried toolz versions, so giving them only
# the function returns a new function still waiting for the data.
workflow = (glob,
            mapcat(open),
            mapcat(str.split),
            map(stem),
            frequencies,
            keyfilter(drop_word))

# compose applies its arguments right-to-left, hence the reversal.
wordcount_f = compose(*reversed(workflow))
words = wordcount_f('lyrics/billboard/*')
print(words)



In [74]:
def normalise(d: dict, sign: int = 1) -> dict:
    """Rescale the values of ``d`` by dividing each by the sum of all values.

    Args:
        d: Mapping from key to numeric value.
        sign: Factor applied to every value before the division.

    Returns:
        A new dict with the same keys and the rescaled values.
    """
    total = sum(d.values())
    scaled = {}
    for key, value in d.items():
        scaled[key] = sign * value/total
    return scaled
    
# Word counts per corpus. ``wordcount_f`` is the composed pipeline defined in
# this file; no function named plain ``wordcount`` exists here.
billboard = wordcount_f('lyrics/billboard/*')
dylan = wordcount_f('lyrics/dylan/*')

# Billboard shares are negated before merging, so summing per word gives
# (Dylan share - Billboard share): positive values lean Dylan, negative
# values lean Billboard.
m = merge_with(sum, normalise(dylan),
                    normalise(billboard, sign=-1))


In [76]:
def col_print(l, cols=5, width=12):
    """Print the strings in ``l`` as rows of left-aligned, padded columns.

    Items are laid out row by row in their given order. When ``len(l)`` is
    not a multiple of ``cols``, the leftover items are printed as a final,
    shorter row rather than being dropped.

    Args:
        l: Sequence of strings to print.
        cols: Number of items per row.
        width: Width each item is padded to with ``str.ljust``.
    """
    if cols < 1:
        # No columns to lay out: print nothing.
        return

    for start in range(0, len(l), cols):
        row = l[start:start + cols]
        print(''.join(word.ljust(width) for word in row))


In [77]:
# Rank the words once by their value in ``m`` (ascending); the top of the
# ranking goes under the Dylan heading, the bottom under Billboard.
ranked = sorted(m, key=m.get)
print('\nDylan:')
col_print(ranked[-50:])
print('\nBillboard:')
col_print(ranked[:50])


Dylan:
walkin      young       land        came        babe        
father      some        knows       while       bound       
hand        more        gone        many        woman       
blues       poor        wind        looked      them        
long        train       river       broken      home        
behind      hard        says        might       mama        
people      lonesome    theres      town        door        
will        must        dead        road        their       
went        been        said        lord        where       
there       from        down        they        well        

Billboard:
love        yeah        baby        know        want        
cause       dont        like        girl        wanna       
make        this        what        need        right       
your        take        feel        youre       just        
give        life        cant        gotta       lets        
thats       dance       really      keep        stop        
real