### More about Python: Functions, syntax and structures

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# functions are declared with `def`
# NB note the indentation - Python uses it to delimit the function body
def fnA(n):  # the name of the function is ours to choose
    """Return n multiplied by itself (n squared)."""
    return n * n
# the function body ends at the first line that is no longer indented
'hello'

'hello'

In [6]:
# call the function and keep what it returns
x = 3
s = fnA(n=x)
s  # 3 squared -> 9

9

In [13]:
# any function can be applied to every member of a collection (e.g. a DataFrame)
np.random.seed(0)  # fixed seed, so the 'random' values are reproducible
rows = ['ch1', 'ch2', 'ch3', 'ch4']
cols = ['b', 'c', 'e']
values = np.random.randn(len(rows), len(cols))  # one value per (row, column) pair
df = pd.DataFrame(data=values, index=rows, columns=cols)
np.abs(df)  # element-wise 'absolute' value (i.e. drop the minus signs)

Unnamed: 0,b,c,e
ch1,1.764052,0.400157,0.978738
ch2,2.240893,1.867558,0.977278
ch3,0.950088,0.151357,0.103219
ch4,0.410599,0.144044,1.454274


In [18]:
# challenge: find the DIFFERENCE between min and max for every row
def findDiff(x):
    """Return the largest value of x minus its smallest value."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest
# apply the function to our (absolute) dataframe, one row at a time
np.abs(df).apply(findDiff, axis='columns')  # NB pass the function itself: no brackets after its name


ch1    1.363895
ch2    1.263615
ch3    0.846870
ch4    1.310230
dtype: float64

In [20]:
# a more complex function
def clearFn(x):
    """Return the smallest and largest values of x as a labelled Series.

    x must provide .min() and .max() (e.g. a Series, or one column of a
    DataFrame when used with DataFrame.apply).  The result is a pandas
    Series holding those two values under the labels 'min' and 'max',
    in that order.
    """
    facets = [x.min(), x.max()]
    idx = ['min', 'max']  # labels line up one-to-one with the entries of facets
    return pd.Series(facets, index=idx)
# apply the function to our ORIGINAL df (no abs), one column at a time
df.apply(clearFn, axis='index')

Unnamed: 0,b,c,e
min,0.410599,-0.151357,-0.977278
max,2.240893,1.867558,1.454274


### Applying 'sorts' to data

In [26]:
# Python's built-in 'range' lazily produces a run of integers
values = range(0, 4)  # 0, 1, 2, 3 (stops before 4)
idx = ['d', 'a', 'c', 'b']
m = pd.Series(data=values, index=idx)
m.sort_index()  # sorted by label; not shown, because only a cell's last expression is displayed
m.sort_values(ascending=False)  # largest first (ascending=True is the default)

b    3
c    2
a    1
d    0
dtype: int64

In [45]:
# now do this with a df (which is a collection of series)
# NumPy's 'arange' is the array version of range
data = np.arange(12)  # 0 .. 11: starts at 0 by default, stops before 12
d = data.reshape((3, 4))  # 3 rows x 4 columns; reshape(3, 4) is equivalent
d
idx = ['three', 'one', 'two']
cols = ['d', 'a', 'b', 'c']
df = pd.DataFrame(data=d, index=idx, columns=cols)
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7
two,8,9,10,11


In [50]:
# just like sorting a series, we can sort a data frame by its row labels
df.sort_index(axis='index')
# to sort by values, we MUST specify which column
df.sort_values('c', ascending=False)

Unnamed: 0,d,a,b,c
two,8,9,10,11
one,4,5,6,7
three,0,1,2,3


In [53]:
# we can sort ACROSS the rows, i.e. reorder the columns
df.sort_index(axis='columns')  # order the columns by their labels
df.sort_values(by='two', axis='columns')  # order the columns by the values in row 'two'

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7
two,8,9,10,11
