In [1]:
# writing a python function

In [1]:
def my_square(x):
    """Return ``x`` raised to the second power.

    Works for anything that supports the power operator, e.g. plain
    numbers or a pandas Series (element-wise).
    """
    return pow(x, 2)

In [2]:
my_square(4)

16

In [3]:
assert my_square(4) == 16

In [4]:
# applying functions to dataframes

In [5]:
import pandas as pd
# Small two-column frame of integers used by the examples below.
df = pd.DataFrame(
    {'a': [10, 20, 30], 'b': [20, 30, 40]},
)

In [7]:
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [8]:
df['a'].apply(my_square)

0    100
1    400
2    900
Name: a, dtype: int64

In [9]:
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``.

    ``e`` can be supplied as a keyword argument, which is how
    ``Series.apply(my_exp, e=...)`` forwards it.
    """
    return pow(x, e)

In [10]:
df['a'].apply(my_exp, e=4)

0     10000
1    160000
2    810000
Name: a, dtype: int64

In [11]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [14]:
import numpy as np

In [15]:
def avg_3_apply(col):
    """Return the arithmetic mean of the values in ``col``.

    Despite the ``3`` in the name, any number of values is accepted:
    the whole sequence is handed to ``np.mean``.
    """
    column_mean = np.mean(col)
    return column_mean

In [16]:
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

In [17]:
# df.apply(avg_3_apply, axis='columns')  # axis='columns' passes each row (not each column) to the function, so this returns row-wise means (15.0, 25.0, 35.0) rather than raising an error

In [18]:
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    Written for scalar inputs: the ``if`` needs a single truth value, so
    passing whole arrays/Series directly would fail on the comparison.

    Parameters
    ----------
    x, y : number
        The two values to average.

    Returns
    -------
    float
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [21]:
# vectorized functions

In [23]:
# np.vectorize wraps the scalar function so it can be called with array-like
# arguments and is applied element by element.
avg_2_mod_vec = np.vectorize(avg_2_mod)

In [24]:
avg_2_mod_vec(df['a'], df['b'])

array([15., nan, 35.])

In [25]:
# shortcut version (decorators)

In [26]:
@np.vectorize
def v_avg_2_mod_v3(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is scalar logic; the ``np.vectorize`` decorator applies it to
    each pair of elements of array-like inputs and returns a NumPy array.

    Parameters
    ----------
    x, y : number
        One pair of values (supplied per element by ``np.vectorize``).

    Returns
    -------
    float
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [27]:
v_avg_2_mod_v3(df['a'], df['b'])

array([15., nan, 35.])

In [28]:
# numba JIT-compiles numeric functions to machine code, which can make element-wise NumPy-style code faster
import numba

In [29]:
@numba.vectorize
def v_avg_2_mod_numba(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is scalar logic; ``numba.vectorize`` compiles it into a
    function that is applied element by element to array inputs.

    Parameters
    ----------
    x, y : number
        One pair of values (supplied per element by the compiled wrapper).

    Returns
    -------
    float
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    if x == 20:
        return np.nan
    return (x + y) / 2

In [30]:
v_avg_2_mod_numba(df['a'].values, df['b'].values)

array([15., nan, 35.])

In [31]:
# magics

In [35]:
%%timeit
avg_2_mod_vec(df['a'], df['b'])

124 µs ± 3.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [36]:
%%timeit
v_avg_2_mod_numba(df['a'], df['b'])

110 µs ± 9.16 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
