## Functions and Mapping

In [1]:
import numpy as np
import pandas as pd

In [5]:
# Seed the generator so that Restart Kernel -> Run All reproduces the same
# numbers (and therefore the same outputs) on every run.
SEED = 42
rng = np.random.default_rng(SEED)
values = rng.standard_normal((4, 3)) # an array 4 rows by 3 columns
cols = list('bce') # one label per column of `values`
cols

['b', 'c', 'e']

In [20]:
# Wrap the raw array in a labelled frame, one column per entry in `cols`.
df = pd.DataFrame(data=values, columns=cols)
# A NumPy function can be applied to the whole frame at once: the result is a
# new DataFrame holding the absolute value of every member.
df_positive = np.absolute(df)
df_positive

Unnamed: 0,b,c,e
0,0.452677,1.815673,0.384085
1,1.100217,1.681948,2.09807
2,0.55318,1.08816,0.510699
3,0.826797,0.755027,1.053205


In [22]:
# we need to see the original dataframe for reference: taking the absolute
# values produced a separate frame (df_positive), so df still holds the signed values
df

Unnamed: 0,b,c,e
0,0.452677,1.815673,0.384085
1,1.100217,1.681948,2.09807
2,0.55318,-1.08816,0.510699
3,0.826797,-0.755027,1.053205


In [21]:
# Vectorised NumPy/pandas operations are fast on large data sets because the
# per-element loops run in compiled C code instead of the Python interpreter.
# (This is not about "avoiding the GIL": the GIL only matters for multi-threaded code.)
# Note that apply() with a Python callable still invokes it from Python, once
# per column by default, so prefer a built-in vectorised form
# (e.g. df.max() - df.min()) when one exists.
def fn(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Parameters
    ----------
    x : any object exposing ``max()`` and ``min()``
        For example a pandas Series or a NumPy array.

    Returns
    -------
    The difference ``x.max() - x.min()``.
    """
    return x.max() - x.min()
# axis=0 (the default) calls fn once per column, giving one result per column label
df.apply(fn, axis=0)

b    0.647540
c    2.903833
e    1.713985
dtype: float64

In [23]:
def fnB(x):
    """Summarise ``x`` as a two-entry Series holding its minimum and maximum.

    Parameters
    ----------
    x : object exposing ``min()`` and ``max()``
        Passed positionally, e.g. one column of a DataFrame.

    Returns
    -------
    pd.Series
        Indexed by ``'min'`` and ``'max'``, in that order.
    """
    labels = ['min', 'max']
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=labels)
# apply() hands fnB one column of df at a time (each as a Series) as the
# positional argument 'x'; the two-entry Series it returns become the
# 'min' and 'max' rows of the resulting frame
results_df = df.apply(fnB, axis=0)
results_df

Unnamed: 0,b,c,e
min,0.452677,-1.08816,0.384085
max,1.100217,1.815673,2.09807


### Sorting Data

In [28]:
m = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
# A notebook cell only renders its final expression, so the index-sorted view
# is printed explicitly; otherwise its result would be silently discarded.
# (A Series is shown as plain text, so print() renders it like cell output.)
print(m.sort_index()) # use the index to sort the values
m.sort_values(ascending=False) # sort by the values themselves, largest first

c    3
b    2
a    1
d    0
dtype: int64

In [32]:
# the same sorting tools are available when using a DataFrame
c = list('dabc')                    # column labels (not in alphabetical order)
i = ['three', 'one', 'two']         # row labels (not in alphabetical order)
data = np.arange(12).reshape(3, 4)  # the integers 0..11 as 3 rows x 4 columns
df = pd.DataFrame(data=data, index=i, columns=c)
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7
two,8,9,10,11


In [36]:
# Only the last expression of a cell is rendered, so the value-sorted frame is
# shown explicitly with display() (available by default in IPython/Jupyter
# kernels); previously its result was computed and then silently discarded.
df_by_c_desc = df.sort_values(by='c', ascending=False) # rows ordered by column 'c', largest first
display(df_by_c_desc)
df.sort_index() # sort by index. be it alphabetic or numeric (or date)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3
two,8,9,10,11


In [41]:
# we can also sort ACROSS the df, i.e. reorder the columns rather than the rows
df.sort_index(axis='columns') # axis=1: order the columns by their labels

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4
two,9,10,11,8


In [42]:
# same column-label sort as before, but in reverse (descending) label order
df.sort_index(ascending=False, axis='columns')

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5
two,8,11,10,9


In [45]:
# order the columns by the values found in row 'two', largest first
# NB axis=0 is the default, in which case `by` names a column and the rows are reordered
df.sort_values(by='two', axis='columns', ascending=False)

Unnamed: 0,c,b,a,d
three,3,2,1,0
one,7,6,5,4
two,11,10,9,8


In [47]:
# none of the sorted results above were assigned back to df
df # remember - the original dataframe has not changed

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7
two,8,9,10,11


In [49]:
# Sort the rows by column 'b', then by column 'a', both in descending order.
# NOTE(review): inplace=True reorders df itself, so any earlier display of df is
# now stale; consider `df = df.sort_values(...)` if in-place mutation is not the point.
df.sort_values(by=['b', 'a'], ascending=False, inplace=True) # this will mutate the original df
df # the changes have been persisted in the original data

Unnamed: 0,d,a,b,c
two,8,9,10,11
one,4,5,6,7
three,0,1,2,3
