# Pandas DataFrames

In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('seaborn')



## Set path to data
path = os.getcwd() + '\\data\\'

## <a id="toc">Table of Contents</a>
> 1. [Creating Pandas DataFrame Objects](#1)
    1. Create a DataFrame
    2. Accessing Rows
    3. Creating a DataFrame from numpy
> 2. [DataFrame Axes (1 and 0)](#2)
    1. Index (rows, 0)
    2. Columns (1)
> 3. [Creating a Clean DataFrame Function](#3)
> 4. [Math Methods in DataFrames](#4)
    1. Index Alignment
> 5. [Looping and Aggregation](#5)
    1. Looping
    2. Aggregations
    3. Multiple Aggregations using Agg
    4. Describe
    5. The .apply method vs .pipe method
> 6. [Creating and Updating Columns](#6)

### 1. <a id=1> Creating Pandas DataFrame Objects </a>
[Back to contents](#toc)

#### a. Create a DataFrame

In [38]:
df = pd.DataFrame({'growth':[.5, .9, 1.2],
                   'Name' : ['Paul', 'George', 'Kristi']})

In [39]:
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.9,George
2,1.2,Kristi


#### b. Accessing Rows

Access by row using location indeexing .iloc

In [40]:
df.iloc[2]

growth       1.2
Name      Kristi
Name: 2, dtype: object

Indexing by column

In [41]:
df['Name']

0      Paul
1    George
2    Kristi
Name: Name, dtype: object

#### c. Creating a DataFrame from numpy

In [42]:
np.random.seed(24)

## 10 rows and 3 columns
values = np.random.randn(10,3)
values

array([[ 1.32921217, -0.77003345, -0.31628036],
       [-0.99081039, -1.07081626, -1.43871328],
       [ 0.56441685,  0.29572189, -1.62640423],
       [ 0.2195652 ,  0.6788048 ,  1.88927273],
       [ 0.9615384 ,  0.1040112 , -0.48116532],
       [ 0.85022853,  1.45342467,  1.05773744],
       [ 0.16556161,  0.51501838, -1.33693569],
       [ 0.56286114,  1.39285483, -0.06332798],
       [ 0.12166836,  1.20760254, -0.00204021],
       [ 1.62779574,  0.35449279,  1.03752763]])

In [43]:
df = pd.DataFrame(values, columns = ['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,1.329212,-0.770033,-0.31628
1,-0.99081,-1.070816,-1.438713
2,0.564417,0.295722,-1.626404
3,0.219565,0.678805,1.889273
4,0.961538,0.104011,-0.481165
5,0.850229,1.453425,1.057737
6,0.165562,0.515018,-1.336936
7,0.562861,1.392855,-0.063328
8,0.121668,1.207603,-0.00204
9,1.627796,0.354493,1.037528


### 2. <a id=2> DataFrame Axes (1 and 0) </a>
[Back to contents](#toc)

In [44]:
df

Unnamed: 0,a,b,c
0,1.329212,-0.770033,-0.31628
1,-0.99081,-1.070816,-1.438713
2,0.564417,0.295722,-1.626404
3,0.219565,0.678805,1.889273
4,0.961538,0.104011,-0.481165
5,0.850229,1.453425,1.057737
6,0.165562,0.515018,-1.336936
7,0.562861,1.392855,-0.063328
8,0.121668,1.207603,-0.00204
9,1.627796,0.354493,1.037528


Unlike a series, which has one axis, there are two axes for a dataframe. They are commonly referred
to as axis 0 and 1, or the "index" (or 'rows') axis and the "columns" axis respectively:

In [45]:
df.axes

[RangeIndex(start=0, stop=10, step=1), Index(['a', 'b', 'c'], dtype='object')]

#### a. Index (rows, 0)
As many operations take an axis parameter, it is important to remember that 0 is the index and
1 is the columns:

In [46]:
df.axes[0]

RangeIndex(start=0, stop=10, step=1)

#### b. Columns (1)

In [47]:
df.axes[1]

Index(['a', 'b', 'c'], dtype='object')

For example, we can sum a dataframe along the index or along the columns using the labels 0
and 1:

##### Sum each column

In [48]:
df.sum(axis=0)

a    5.412038
b    4.161081
c   -1.280329
dtype: float64

In [49]:
df.sum(axis = 'index')

a    5.412038
b    4.161081
c   -1.280329
dtype: float64

##### Sum each row

In [50]:
df.sum(axis=1)

0    0.242898
1   -3.500340
2   -0.766265
3    2.787643
4    0.584384
5    3.361391
6   -0.656356
7    1.892388
8    1.327231
9    3.019816
dtype: float64

In [51]:
df.sum(axis = 'columns')

0    0.242898
1   -3.500340
2   -0.766265
3    2.787643
4    0.584384
5    3.361391
6   -0.656356
7    1.892388
8    1.327231
9    3.019816
dtype: float64

### 3. <a id=3> Creating a Clean DataFrame Function </a>
[Back to contents](#toc)

In [55]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
df.head()

Unnamed: 0,Seq.,President,Party,Bg,Im,Int,IQ,L,WR,AC,EAb,LA,CAb,OA,PL,RC,CAp,HE,EAp,DA,FPA,AM,EV,O
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8


Fix the data

In [134]:
def tweak_siena_pres(df_):
    
## Function to convert all int64s to uint8
    def int64_to_uint8(df_):
        cols = df_.select_dtypes('int64')
        return df_.astype({col:'uint8' for col in cols})

## Rename columns
    col_rename = {'Seq.':'Seq', 
                  'Bg':'Background',
                  'PL':'Party leadership',
                  'CAb':'Communication ability',
                  'RC': 'Relations with Congress', 
                  'CAp': 'Court appointments',
                  'HE': 'Handling of economy', 
                  'L': 'Luck',
                  'AC': 'Ability to compromise', 
                  'WR': 'Willing to take risks',
                  'EAp': 'Executive appointments', 
                  'OA': 'Overall ability',
                  'Im': 'Imagination', 
                  'DA': 'Domestic accomplishments',
                  'Int': 'Integrity', 
                  'EAb': 'Executive ability',
                  'FPA': 'Foreign policy accomplishments',
                  'LA': 'Leadership ability',
                  'IQ': 'Intelligence', 'AM ': 'Avoid crucial mistakes',
                  'EV': "Experts' view", 'O': 'Overall'}

    colNames = {}
    for key, value in col_rename.items():
        colClean = value.replace(' ','_')
        colNames[key] = colClean

## Create the dataframe
    return(df_
           .rename(columns = colNames)     
           .astype({'Party':'category'})
           .pipe(int64_to_uint8)
           .assign(Average_rank = lambda df_:(df_.select_dtypes('uint8')
                                                 .sum(axis=1)
                                                 .rank(method='dense')
                                                 .astype('uint8')),
                   Quartile = lambda df_:pd.qcut(df_.Average_rank, 4, labels='1st 2nd 3rd 4th'.split())
                  )
          )

In [137]:
df = tweak_siena_pres(df)
df.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


### 4. <a id=4> Math Methods in DataFrames </a>
[Back to contents](#toc)

In [150]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
pres = tweak_siena_pres(df)
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


#### a. Index Alignment

We can perform math operations of the dataframe. There are the math methods like .add and .div
and we also have dunder methods that allow us to use the operators like +, -, /, and *.
Note that the index will align when we perform math. To demonstrate alignment, I will add the
values from index values at rows 0-2 and column positions at index 0-3 and add then to the index
values from rows 1-5 and 0-4:

In [143]:
scores = pres.loc[:, 'Background':'Average_rank']
scores.head()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
1,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1
2,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13
3,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5
4,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7
5,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8


In [144]:
s1 = scores.iloc[:3, :4]
s1

Unnamed: 0,Background,Imagination,Integrity,Intelligence
1,7,7,1,10
2,3,13,4,4
3,2,2,14,1


In [146]:
s2 = scores.iloc[1:6, :5]
s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
2,3,13,4,4,24
3,2,2,14,1,8
4,4,6,7,3,16
5,9,14,11,18,6
6,1,9,6,5,29


In [149]:
s1 + s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
1,,,,,
2,6.0,26.0,8.0,8.0,
3,4.0,4.0,28.0,2.0,
4,,,,,
5,,,,,
6,,,,,


Only the overlapping rows (rows 2 and 3) and columns (Background through Intelligence) get
added together. The other values are missing!

### 5. <a id=5> Looping and Aggregation </a>
[Back to contents](#toc)

#### a. Looping

You can use a for loop with a dataframe, though you generally want to avoid for loops when doing
numerical manipulation. When I see a for loop with pandas code, it means this is a slow operation,
and you are not able to take advantage of the vectorization that speeds up many operations.
However, sometimes a for loop is appropriate (I use them when labeling plots).

If you need to loop over a dataframe, here are three methods for doing it. 
- The .iteritems method gives you a tuple with the column name and the column (a series). 
- The .iterrows method gives you a tuple with the index value and the row (converted into a series). 
- Finally, the .itertuples method gives you a row represented as a named tuple (with the index in position 0):

In [151]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
pres = tweak_siena_pres(df)
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


**iteritems()** - iteration over columns (col_name, series) tuple

In [159]:
for col_name, col in pres.iteritems():
    print(col_name,',',type(col))

Seq , <class 'pandas.core.series.Series'>
President , <class 'pandas.core.series.Series'>
Party , <class 'pandas.core.series.Series'>
Background , <class 'pandas.core.series.Series'>
Imagination , <class 'pandas.core.series.Series'>
Integrity , <class 'pandas.core.series.Series'>
Intelligence , <class 'pandas.core.series.Series'>
Luck , <class 'pandas.core.series.Series'>
Willing_to_take_risks , <class 'pandas.core.series.Series'>
Ability_to_compromise , <class 'pandas.core.series.Series'>
Executive_ability , <class 'pandas.core.series.Series'>
Leadership_ability , <class 'pandas.core.series.Series'>
Communication_ability , <class 'pandas.core.series.Series'>
Overall_ability , <class 'pandas.core.series.Series'>
Party_leadership , <class 'pandas.core.series.Series'>
Relations_with_Congress , <class 'pandas.core.series.Series'>
Court_appointments , <class 'pandas.core.series.Series'>
Handling_of_economy , <class 'pandas.core.series.Series'>
Executive_appointments , <class 'pandas.core.s

**iterrows()** - iteration over rows (index, row(as a series)) tuple

In [163]:
for idx, row in pres.iterrows():
    print(idx, type(row))

1 <class 'pandas.core.series.Series'>
2 <class 'pandas.core.series.Series'>
3 <class 'pandas.core.series.Series'>
4 <class 'pandas.core.series.Series'>
5 <class 'pandas.core.series.Series'>
6 <class 'pandas.core.series.Series'>
7 <class 'pandas.core.series.Series'>
8 <class 'pandas.core.series.Series'>
9 <class 'pandas.core.series.Series'>
10 <class 'pandas.core.series.Series'>
11 <class 'pandas.core.series.Series'>
12 <class 'pandas.core.series.Series'>
13 <class 'pandas.core.series.Series'>
14 <class 'pandas.core.series.Series'>
15 <class 'pandas.core.series.Series'>
16 <class 'pandas.core.series.Series'>
17 <class 'pandas.core.series.Series'>
18 <class 'pandas.core.series.Series'>
19 <class 'pandas.core.series.Series'>
20 <class 'pandas.core.series.Series'>
21 <class 'pandas.core.series.Series'>
22 <class 'pandas.core.series.Series'>
23 <class 'pandas.core.series.Series'>
24 <class 'pandas.core.series.Series'>
25 <class 'pandas.core.series.Series'>
26 <class 'pandas.core.series.Seri

**itertuples()** - iteration over rows as namedtuple (index as first item)

In [170]:
for tup in pres.itertuples():
    print(tup.Index, tup.Party)

1 Independent
2 Federalist
3 Democratic-Republican
4 Democratic-Republican
5 Democratic-Republican
6 Democratic-Republican
7 Democratic
8 Democratic
9 Whig
10 Independent
11 Democratic
12 Whig
13 Whig
14 Democratic
15 Democratic
16 Republican
17 Democratic
18 Republican
19 Republican
20 Republican
21 Republican
22 Democratic
23 Republican
24 Republican
25 Republican
26 Republican
27 Democratic
28 Republican
29 Republican
30 Republican
31 Democratic
32 Democratic
33 Republican
34 Democratic
35 Democratic
36 Republican
37 Republican
38 Democratic
39 Republican
40 Republican
41 Democratic
42 Republican
43 Democratic
44 Republican


#### b. Aggregations

In [171]:
scores = pres.loc[:, 'Background':'Average_rank']
scores.head()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
1,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1
2,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13
3,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5
4,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7
5,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8


Calculate the average of each row

In [175]:
scores.mean(axis = 'columns')

1      3.681818
2     14.454545
3      6.545455
4      9.636364
5     10.454545
6     17.181818
7     19.590909
8     25.681818
9     36.909091
10    34.409091
11    13.318182
12    29.500000
13    37.454545
14    39.409091
15    42.000000
16     4.045455
17    42.272727
18    24.227273
19    30.136364
20    27.272727
21    31.454545
22    22.181818
23    32.818182
24    19.727273
25     5.227273
26    21.318182
27    13.590909
28    38.772727
29    29.909091
30    31.954545
31     3.909091
32    11.818182
33     9.227273
34    12.727273
35    15.272727
36    26.909091
37    26.000000
38    26.818182
39    14.545455
40    20.818182
41    14.636364
42    30.363636
43    15.818182
44    39.772727
dtype: float64

In [176]:
scores.mean(axis = 1)

1      3.681818
2     14.454545
3      6.545455
4      9.636364
5     10.454545
6     17.181818
7     19.590909
8     25.681818
9     36.909091
10    34.409091
11    13.318182
12    29.500000
13    37.454545
14    39.409091
15    42.000000
16     4.045455
17    42.272727
18    24.227273
19    30.136364
20    27.272727
21    31.454545
22    22.181818
23    32.818182
24    19.727273
25     5.227273
26    21.318182
27    13.590909
28    38.772727
29    29.909091
30    31.954545
31     3.909091
32    11.818182
33     9.227273
34    12.727273
35    15.272727
36    26.909091
37    26.000000
38    26.818182
39    14.545455
40    20.818182
41    14.636364
42    30.363636
43    15.818182
44    39.772727
dtype: float64

In [177]:
scores.sum(axis = 'columns')

1      81
2     318
3     144
4     212
5     230
6     378
7     431
8     565
9     812
10    757
11    293
12    649
13    824
14    867
15    924
16     89
17    930
18    533
19    663
20    600
21    692
22    488
23    722
24    434
25    115
26    469
27    299
28    853
29    658
30    703
31     86
32    260
33    203
34    280
35    336
36    592
37    572
38    590
39    320
40    458
41    322
42    668
43    348
44    875
dtype: int64

In [178]:
scores.sum(axis = 0)

Background                        968
Imagination                       957
Integrity                         990
Intelligence                      990
Luck                              990
Willing_to_take_risks             953
Ability_to_compromise             968
Executive_ability                 978
Leadership_ability                990
Communication_ability             990
Overall_ability                   990
Party_leadership                  990
Relations_with_Congress           979
Court_appointments                990
Handling_of_economy               990
Executive_appointments            990
Domestic_accomplishments          990
Foreign_policy_accomplishments    990
AM                                990
Experts'_view                     990
Overall                           990
Average_rank                      990
dtype: int64

#### c. Multiple Aggregations using Agg

In [179]:
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


We can pass in a dictionary to perform multiple aggregations on a column.

In [183]:
pres.agg({'Luck':['count', 'size'],
         'Overall':['count', 'max']})

Unnamed: 0,Luck,Overall
count,44.0,44.0
size,44.0,
max,,44.0


You can use a keyword argument with a tuple to specify the index value of the resultant
aggregation:

In [186]:
pres.agg(Intelligence_count = ('Intelligence', 'count'),
        Intelligence_size = ('Intelligence', 'size'))

Unnamed: 0,Intelligence
Intelligence_count,44
Intelligence_size,44


#### d. Describe
The .describe method is a meta-aggregation that returns a dataframe with summary statistics
for each numeric columns:

In [187]:
pres.describe()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
count,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,22.0,21.75,22.5,22.5,22.5,21.659091,22.0,22.227273,22.5,22.5,22.5,22.5,22.25,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5
std,12.409674,12.519984,12.845233,12.845233,12.845233,11.892822,12.409674,12.500909,12.845233,12.845233,12.845233,12.845233,12.519984,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,11.75,11.0,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75
50%,22.0,21.5,22.5,22.5,22.5,22.5,22.0,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5
75%,32.25,32.25,33.25,33.25,33.25,31.25,32.25,32.25,33.25,33.25,33.25,33.25,33.0,33.25,33.25,33.25,33.25,33.25,33.25,33.25,33.25,33.25
max,43.0,43.0,44.0,44.0,44.0,41.0,43.0,43.0,44.0,44.0,44.0,44.0,43.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0


#### e. The .apply method vs .pipe method
Like the series, the dataframe has an .apply method. Like the series method, you should be wary of
using the dataframe method. More specifically, if you are dealing with numbers, you might want
to see if you can operate in a vectorized way.

Also, keep in mind that a dataframe is two-dimensional. So rather than applying a function to
a single value, when you call .apply on a dataframe, you work on a whole row or a whole column.
Because of that, I find that I rarely use this method.
Most of the .apply examples you find in the wild are silly examples that show how .apply works,
but also give a false impression that you should be everywhere, including using it for these silly
examples.

For example, if you wanted to calculate the spread of the presidential rankings for each row, I
would do this:

In [190]:
(pres
 .select_dtypes('number')
 .pipe(lambda df_:df_.max(axis = 'columns') - df_.min(axis='columns')))

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: uint8

In [194]:
def test(df_):
    return df_.max(axis = 'columns') -df_.min(axis='columns')

(pres
 .select_dtypes('number')
 .pipe(test))

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: uint8

The .apply version looks like this. They look pretty similar but the former does an optimized max and min calculation, while the
latter does a separate calculation for each row.

In [197]:
(pres
 .select_dtypes('number')
 .apply(lambda _row: _row.max() - _row.min(), axis = 'columns')
)

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: int8

### 6. <a id=6> Creating and Updating Columns</a>
[Back to contents](#toc)
    