## idiomatic python
Also known as 'pythonic' code: the idea that there may be many ways to solve a problem, but some are more elegant and make better use of the language's strengths. In the pandas world, such solutions are sometimes called 'pandorable'.

In [2]:
import pandas as pd
import numpy as np
import timeit

# Load the census dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../resources/week-3/datasets/census.csv')
df.head(n=5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


## Method chaining
Method chaining is one such idiom: it allows you to combine several steps into one expression.
It is not necessarily faster, but it is often easier to read.


In [3]:

# Starting the expression with a parenthesis tells Python that it spans several lines.
# None of the chained calls modifies `df`; each one returns a new object. The
# result therefore has to be bound to a name, otherwise it is built and thrown
# away (only the last expression of a cell is rendered).
df_counties = (df.where(df['SUMLEV'] == 50)
    .dropna()
    .set_index(['STATE', 'CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))
# `df` is left untouched by the chain above, so this still previews the original frame.
df[5:].head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411
6,50,3,6,1,11,Alabama,Bullock County,10914,10915,10887,...,-30.953709,-5.180127,-1.130263,14.35429,-16.167247,-29.001673,-2.825524,1.507017,17.24379,-13.193961
7,50,3,6,1,13,Alabama,Butler County,20947,20946,20944,...,-14.032727,-11.684234,-5.655413,1.085428,-6.529805,-13.936612,-11.586865,-5.557058,1.184103,-6.430868
8,50,3,6,1,15,Alabama,Calhoun County,118572,118586,118437,...,-6.15567,-4.611706,-5.524649,-4.463211,-3.376322,-5.791579,-4.092677,-5.062836,-3.912834,-2.806406
9,50,3,6,1,17,Alabama,Chambers County,34215,34170,34098,...,-2.731639,3.849092,2.872721,-2.287222,1.349468,-1.821092,4.701181,3.781439,-1.290228,2.346901


In [4]:
# NOTE: only the last expression of a cell is rendered, so this positional
# slice (first 3 rows, first 6 columns) is evaluated but never shown.
df.iloc[:3, :6]
# Draw four random rows; the fixed random_state makes the same rows come back
# on every run, so the recorded output stays reproducible.
df.sample(n=4, axis=0, random_state=0)



Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1874,50,1,2,36,27,New York,Dutchess County,297488,297448,297745,...,-2.315362,-7.267305,-5.650982,-5.662829,-6.640389,0.577163,-4.412772,-2.524262,-2.052776,-3.02972
1362,50,2,4,27,49,Minnesota,Goodhue County,46183,46183,46199,...,-1.492295,0.215801,-0.495706,-1.746254,-0.840427,-1.038118,0.625823,-0.064657,-1.250404,-0.34479
587,50,4,8,16,47,Idaho,Gooding County,15464,15464,15478,...,-12.640586,-14.565643,-11.451512,-8.262005,2.500082,-11.668233,-12.736773,-9.148047,-5.485971,5.263331
3187,50,4,8,56,35,Wyoming,Sublette County,10247,10247,10244,...,-23.741784,15.272374,-40.870074,-16.596273,-22.8709,-21.092907,16.828794,-39.211861,-14.409938,-20.664059


## Apply function
'Map' is a big part of the functional programming model in Python, and there is a similar function in pandas called 'applymap' that operates on each cell of a DataFrame (renamed to 'DataFrame.map' in pandas 2.1). But probably more useful with DataFrames is the function 'apply', which applies a function across each row (or column) of a DataFrame. Here's an example using min and max.


In [5]:
# Compute the smallest and largest yearly population estimate found in one row
# of the census data and hand them back as a new two-element Series.
def min_max(row):
    """Return the min and max of a row's 2010-2015 population estimates.

    row: a label-indexed object that contains the six POPESTIMATE20xx labels.
    Returns a pd.Series with the labels 'min' and 'max'.
    """
    estimate_cols = [
        'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
        'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
    ]
    estimates = row[estimate_cols]
    lowest = np.min(estimates)
    highest = np.max(estimates)
    summary = {'min': lowest, 'max': highest}
    return pd.Series(summary)


In [6]:
# Run min_max once per row (axis=1 is the integer spelling of axis='columns')
# and preview the first five results.
df.apply(min_max, axis=1).head(n=5)

Unnamed: 0,min,max
0,4785161,4858979
1,54660,55347
2,183193,203709
3,26489,27341
4,22512,22861


In [7]:
# Instead of returning just the min and max of the row, you can append these
# values to the existing row with a small change to the min_max function.
def appnd_min_max(row):
    """Return a copy of `row` extended with 'min' and 'max' entries.

    row: a pd.Series that contains the six POPESTIMATE20xx labels.
    Returns a new Series holding every original entry plus the smallest
    ('min') and largest ('max') of those population estimates.
    """
    data = row[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so the new entries are written to a copy instead.
    extended = row.copy()
    extended['min'] = np.min(data)
    extended['max'] = np.max(data)
    return extended

# Run appnd_min_max once per row (axis=1 is the integer spelling of
# axis='columns') and preview the first five extended rows.
df.apply(appnd_min_max, axis=1).head(n=5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,min,max
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594,4785161,4858979
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333,54660,55347
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499,183193,203709
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299,26489,27341
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861,22512,22861


## 'Apply' using lambda functions
Typically, apply is not used with large named functions as in the previous example. More often it is used with a lambda function.

In [8]:
# Labels of the columns that hold the yearly population estimates.
pop_est_row = [
    'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
    'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
]
# Evaluate the lambda once per row and preview the largest estimate of each of
# the first rows ('columns' is the named spelling of axis=1).
df.apply(lambda row: np.max(row[pop_est_row]), axis='columns').head()

0    4858979
1      55347
2     203709
3      27341
4      22861
dtype: int64

In [19]:
# Extra credit: append this value to the row ... come back to this.
# Earlier attempts, kept for reference:
# df.apply(lambda x: np.max(x[pop_est_row]), axis=1).head()
# df['max'] = df[pop_est_row].apply(lambda x: np.max(x[pop_est_row]), axis=1)

# One possibility: the new column's name can be added to the list of column labels.
pop_est_row = [
    'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
    'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
]
print(pop_est_row)  # the six original labels
pop_est_row += ['appended']  # extends the same list object in place
print(pop_est_row)  # now seven labels


['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015']
['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015', 'appended']
