In [2]:
def my_function(x,y):
    pass

In [3]:
def my_sq(x):
    return x**2

In [4]:
my_sq(2)

4

In [5]:
my_sq(4)

16

In [8]:
# returns error if a false. used for unit testing

assert my_sq(4) == 16

In [9]:
def avg_2(x,y):
    return (x+y)/2

In [11]:
avg_2(10,12)

11.0

In [17]:
import pandas as pd

In [18]:
#manually create a dataframe
test = pd.DataFrame({
    'a' : [10,20,20],
    'b' : [20,30,40]  
})

In [20]:
test

Unnamed: 0,a,b
0,10,20
1,20,30
2,20,40


In [22]:
test['a']**2

0    100
1    400
2    400
Name: a, dtype: int64

In [25]:
# applies a function to a specific column
test['a'].apply(my_sq)

0    100
1    400
2    400
Name: a, dtype: int64

In [26]:
def my_exp(x, e):
    return x ** e

In [27]:
my_exp(2,10)

1024

In [28]:
# apply a function to a series within a dataframe
test['a'].apply(my_exp,e=4)

0     10000
1    160000
2    160000
Name: a, dtype: int64

In [31]:
def print_me(x):
    print(x)

In [33]:
# apply works by column --> applies to column a and b. Nothing is returned so there is a `none` for column a and b that is returned
test.apply(print_me)

0    10
1    20
2    20
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [36]:
# neeed to re-right as pandas
#def avg_3(x,y,z):
#    return (x+y+z) / 3

def avg_3_apply(col):
    x = col[0]
    y = col[1]
    z = col[2]
    return (x+y+z)/3

In [39]:
test.apply(avg_3_apply, axis='columns')

IndexError: index 2 is out of bounds for axis 0 with size 2

In [38]:
test

Unnamed: 0,a,b
0,10,20
1,20,30
2,20,40


In [40]:
test['a'].mean()

16.666666666666668

In [43]:
def avg_2_mod(x,y):
    if (x == 20):
        return np.NaN #np.NAN np.nan
    else:
        return (x+y) / 2

In [44]:
test

Unnamed: 0,a,b
0,10,20
1,20,30
2,20,40


In [45]:
#how to do row by column computations (e.g., a0 + b0 = c0)

import numpy as np

In [46]:
#takes a function and makes it possible to run as a vector analysis on a dataframe

avg_2_mod_vec = np.vectorize(avg_2_mod)

In [49]:
avg_2_mod_vec(test['a'], test['b'])

array([15., nan, nan])

In [52]:
#shorthand to vectorize a function instead of resaving

@np.vectorize
def avg_2_mod(x,y):
    if (x == 20):
        return np.NaN #np.NAN np.nan
    else:
        return (x+y) / 2

In [51]:
avg_2_mod(test['a'], test['b'])

array([15., nan, nan])

In [53]:
import numba

In [54]:
#numba makes calculations with numpy faster

@numba.vectorize

def avg_2_mod_numba(x,y):
    if (x == 20):
        return np.NaN #np.NAN np.nan
    else:
        return (x+y) / 2

In [56]:
#only expects numpy arrays --> doesn't understand pandas arrays. Just numeric computations.

avg_2_mod_numba(test['a'].values, test['b'].values)

array([15., nan, nan])

In [64]:
#time speed of different functions depending on what you're doing

In [60]:
%%timeit

avg_2(test['a'],test['b'])

40.9 µs ± 2.82 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [61]:
%%timeit

avg_2_mod(test['a'],test['b'])

11 µs ± 239 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [63]:
%%timeit

avg_2_mod_numba(test['a'].values,test['b'].values)

2.36 µs ± 124 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
