## Task
Explore grouping and aggregation in pandas

## Notebook summary
* `groupby` 1 or more columns
* `groupby` with external arrays, or with dicts that map axis labels (index or column names) to group names
* `groupby` with functions applied to the index labels
* iterate over group levels, convert grouped data into dicts, select columns for aggregation
* `aggregate`/`agg` - aggregate groups with 1 or more functions for 1 or more columns of a df, with custom column names
* `transform`
* `apply`
* `filter`
* pivot tables, cross-tabulation

## References
* *Python for Data Analysis*, Wes McKinney, O'Reilly, 2012
* *Numerical Python*, Robert Johansson, APress, 2015
* *Python Data Science Handbook*, Jake VanderPlas, O'Reilly, 2016


In [7]:
# Notebook setup: imports, display configuration, and a record of the library
# versions this notebook was executed with.
import platform

import IPython
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Display the value of every bare expression in a cell (like the Python shell),
# not only the last one. The cells below rely on this to show several results each.
InteractiveShell.ast_node_interactivity = "all"

# Single-argument print(...) with %-formatting emits the same text under the
# Python 2 print statement and the Python 3 print function.
print('python.version =  %s' % platform.python_version())
print('ipython.version = %s' % (IPython.version_info,))  # 1-tuple so the version tuple is formatted as a whole
print('numpy.version =  %s' % np.__version__)
print('pandas.version =  %s' % pd.__version__)


python.version =  2.7.10
ipython.version = (5, 1, 0, '')
numpy.version =  1.11.2
pandas.version =  0.19.1


**`groupby`** creates a GroupBy object. No computation is done until an aggregation is applied.

In [8]:
# groupby: group rows by one column, by several columns, or by an external array

# NOTE: 'val' is drawn from NumPy's global RNG without a fixed seed, so its
# values (and the means below) change from run to run; the counts do not.
df = pd.DataFrame({
    'key1': ['a','b','c','a','b','c','a','b','c','a'],
    'key2': ['foo']*5 + ['bar']*5,
    'val': np.round(np.random.rand(10),2)
})
df.index.name = 'Index'
df

print('-----')

grp_key1 = df.groupby('key1') # group by single column
type(grp_key1)

print('count:')  # per-group count for every remaining column -> DataFrame
type(grp_key1.count())
grp_key1.count()

print('size:')  # number of rows in each group -> Series
type(grp_key1.size())
grp_key1.size()

print('first:')  # first row of each group
grp_key1.first()

print('head(2):')  # first two rows of each group, keeping the original row index
grp_key1.head(2)

print('mean:')
# Select the numeric column explicitly: 'key2' holds strings and has no mean.
# pandas 0.19 silently drops such columns, newer releases raise instead.
grp_key1[['val']].mean()

print('group by key1 & key2:')
# Despite its name this is the aggregated DataFrame (counts), not a GroupBy object.
grp_key1_key2 = df.groupby(['key1','key2']).count() # group by multiple columns
type(grp_key1_key2)
grp_key1_key2
grp_key1_key2.unstack()  # move the inner index level (key2) into the columns

print('---')
print('group by another array:')
# Any list with one label per row can act as a grouper, alone or mixed with column names.
fruits = ['Orange']*5 + ['Grape']*5
fruits
df.groupby(fruits).count()

type(df.groupby([fruits, 'key1']))
df.groupby([fruits, 'key1']).count()
df.groupby([fruits, 'key1']).count().unstack()
df.groupby([fruits, 'key1']).size()


Unnamed: 0_level_0,key1,key2,val
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,a,foo,0.79
1,b,foo,0.2
2,c,foo,0.61
3,a,foo,0.4
4,b,foo,0.96
5,c,bar,0.28
6,a,bar,0.1
7,b,bar,0.73
8,c,bar,0.74
9,a,bar,0.99


-----


pandas.core.groupby.DataFrameGroupBy

count:


pandas.core.frame.DataFrame

Unnamed: 0_level_0,key2,val
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,4
b,3,3
c,3,3


size:


pandas.core.series.Series

key1
a    4
b    3
c    3
dtype: int64

first:


Unnamed: 0_level_0,key2,val
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,foo,0.79
b,foo,0.2
c,foo,0.61


head(2):


Unnamed: 0_level_0,key1,key2,val
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,a,foo,0.79
1,b,foo,0.2
2,c,foo,0.61
3,a,foo,0.4
4,b,foo,0.96
5,c,bar,0.28


mean:


Unnamed: 0_level_0,val
key1,Unnamed: 1_level_1
a,0.57
b,0.63
c,0.543333


group by key1 & key2:


pandas.core.frame.DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,val
key1,key2,Unnamed: 2_level_1
a,bar,2
a,foo,2
b,bar,1
b,foo,2
c,bar,2
c,foo,1


Unnamed: 0_level_0,val,val
key2,bar,foo
key1,Unnamed: 1_level_2,Unnamed: 2_level_2
a,2,2
b,1,2
c,2,1


---
group by another array:


['Orange',
 'Orange',
 'Orange',
 'Orange',
 'Orange',
 'Grape',
 'Grape',
 'Grape',
 'Grape',
 'Grape']

Unnamed: 0,key1,key2,val
Grape,5,5,5
Orange,5,5,5


pandas.core.groupby.DataFrameGroupBy

Unnamed: 0_level_0,Unnamed: 1_level_0,key2,val
Unnamed: 0_level_1,key1,Unnamed: 2_level_1,Unnamed: 3_level_1
Grape,a,2,2
Grape,b,1,1
Grape,c,2,2
Orange,a,2,2
Orange,b,2,2
Orange,c,1,1


Unnamed: 0_level_0,key2,key2,key2,val,val,val
key1,a,b,c,a,b,c
Grape,2,1,2,2,1,2
Orange,2,2,1,2,2,1


        key1
Grape   a       2
        b       1
        c       2
Orange  a       2
        b       2
        c       1
dtype: int64

In [9]:
# more groupby: iterate over groups, materialise them, select columns

# Iterating a GroupBy yields (group label, sub-DataFrame) pairs.
grp_fruits = df.groupby(fruits)
for key, grp in grp_fruits:
    print(key)
    print(grp)

print('---')

# With two groupers the label is a tuple: (fruit, value of column 'key1').
grp_fruits_key1 = df.groupby([fruits, 'key1'])
for (fruit, key1_value), grp in grp_fruits_key1:
    # '%s %s' reproduces the space-separated two-item output on both
    # Python 2 and Python 3.
    print('%s %s' % (fruit, key1_value))
    print(grp)
    print('')

print('---')

# convert groups into dict with group label as key and associated df as value
fruit_groups = list(grp_fruits)  # list of (label, DataFrame) tuples
print(fruit_groups)
print(type(fruit_groups[0]))
print('---')
print(dict(fruit_groups))
dict(fruit_groups)['Orange']


print('---')

# select column(s) from GroupBy object
type(grp_fruits['key1']) # a single column name gives a SeriesGroupBy
grp_fruits['key1'].value_counts()
grp_fruits['key1'].count()
grp_fruits[['key1']].count() # a list of column names gives a DataFrame result


Grape
      key1 key2   val
Index                
5        c  bar  0.28
6        a  bar  0.10
7        b  bar  0.73
8        c  bar  0.74
9        a  bar  0.99
Orange
      key1 key2   val
Index                
0        a  foo  0.79
1        b  foo  0.20
2        c  foo  0.61
3        a  foo  0.40
4        b  foo  0.96
---
Grape a
      key1 key2   val
Index                
6        a  bar  0.10
9        a  bar  0.99

Grape b
      key1 key2   val
Index                
7        b  bar  0.73

Grape c
      key1 key2   val
Index                
5        c  bar  0.28
8        c  bar  0.74

Orange a
      key1 key2   val
Index                
0        a  foo  0.79
3        a  foo  0.40

Orange b
      key1 key2   val
Index                
1        b  foo  0.20
4        b  foo  0.96

Orange c
      key1 key2   val
Index                
2        c  foo  0.61

---
[('Grape',       key1 key2   val
Index                
5        c  bar  0.28
6        a  bar  0.10
7        b  bar  0.73
8        

Unnamed: 0_level_0,key1,key2,val
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,a,foo,0.79
1,b,foo,0.2
2,c,foo,0.61
3,a,foo,0.4
4,b,foo,0.96


---


pandas.core.groupby.SeriesGroupBy

        key1
Grape   a       2
        c       2
        b       1
Orange  a       2
        b       2
        c       1
Name: key1, dtype: int64

Grape     5
Orange    5
Name: key1, dtype: int64

Unnamed: 0,key1
Grape,5
Orange,5


In [10]:
# group by a dict that maps labels to groups, and by functions of the index

# dict with linked info: maps each column label of the frame below to its group
info_dict = {
    'A': 'Orange',
    'B': 'Grape',
    'C': 'Orange',
    'D': 'Grape',
    'E': 'Orange'
}
info_dict

# df with values. This rebinds `df`; the cells that follow use this frame.
# NOTE: the values come from NumPy's global RNG without a fixed seed, so they
# differ from run to run.
df = pd.DataFrame({
    'A': np.random.rand(5),
    'B': np.random.rand(5),
    'C': np.random.rand(5),
    'D': np.random.rand(5),
    'E': np.random.rand(5)
}, index=['foo','bar','baz','qux','quuz'])

df

# axis=1 groups the columns: A, C, E -> 'Orange' and B, D -> 'Grape'
cols_by_fruit = df.groupby(info_dict, axis=1)
cols_by_fruit.count()

print('---')

# group by function of index: the function is called on each index label,
# so `len` groups the rows by the length of their label
df.groupby(len).count() # group by function only
counts_by_len_level = df.groupby([len, ['L1', 'L2', 'L1', 'L2', 'L1']]).count() # function + list as groupers
counts_by_len_level.index.names = ['Len', 'Levels']
counts_by_len_level

# regroup the aggregated frame by one level of its MultiIndex
counts_by_len_level.groupby(level='Levels').sum()


{'A': 'Orange', 'B': 'Grape', 'C': 'Orange', 'D': 'Grape', 'E': 'Orange'}

Unnamed: 0,A,B,C,D,E
foo,0.990268,0.873345,0.909608,0.413925,0.968044
bar,0.275418,0.781254,0.208339,0.682691,0.374432
baz,0.267124,0.938055,0.926194,0.067957,0.240107
qux,0.516881,0.7905,0.755569,0.552929,0.533042
quuz,0.542944,0.891574,0.715213,0.935748,0.276824


Unnamed: 0,Grape,Orange
foo,2,3
bar,2,3
baz,2,3
qux,2,3
quuz,2,3


---


Unnamed: 0,A,B,C,D,E
3,4,4,4,4,4
4,1,1,1,1,1


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,E
Len,Levels,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,L1,2,2,2,2,2
3,L2,2,2,2,2,2
4,L1,1,1,1,1,1


Unnamed: 0_level_0,A,B,C,D,E
Levels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
L1,3,3,3,3,3
L2,2,2,2,2,2


In [11]:
# aggregate by various functions (optimized & non-optimized)

df.groupby(info_dict, axis=1).count()
df.groupby(info_dict, axis=1).mean()

# NOTE(review): judging by the recorded output, selecting ['A'] from the axis=1
# GroupBy applies the five column-derived labels (Orange, Grape, Orange, Grape,
# Orange) positionally to the five rows of column A. That only lines up because
# df is 5x5 - confirm this is the intended demonstration.
df.groupby(info_dict, axis=1)['A'].count()
df.groupby(info_dict, axis=1)['A'].quantile(0.5)
df.groupby(info_dict, axis=1)['A'].describe()
df.groupby(info_dict, axis=1).describe()

print('---')

# aggregate by multiple functions, with custom names for columns
df.groupby(info_dict, axis=1).agg('mean')
# df.groupby(info_dict, axis=1).agg(['count', 'mean']) - will not work since axis other than 0 is not supported

# Row-wise groupers: one label per row of df (foo, bar, baz, qux, quuz).
row_levels = ['L1','L2','L1','L2','L1']
row_sublevels = ['AAA','AAA','AAA','BBB','BBB']
# (name, function) tuples give each aggregated column a custom name.
named_aggs = [('MyCount','count'), ('MyMean','mean')]

df.groupby(row_levels).agg(named_aggs)
df.groupby([row_levels, row_sublevels]).agg(named_aggs)

# different aggregation functions for different columns
per_column_aggs = {'A':['count', 'sum'], 'B':'mean'}
df.groupby(row_levels).agg(per_column_aggs)
# as_index=False: the group labels are not used as the index of the result
df.groupby(row_levels, as_index=False).agg(per_column_aggs)


Unnamed: 0,Grape,Orange
foo,2,3
bar,2,3
baz,2,3
qux,2,3
quuz,2,3


Unnamed: 0,Grape,Orange
foo,0.643635,0.955973
bar,0.731972,0.286063
baz,0.503006,0.477808
qux,0.671714,0.60183
quuz,0.913661,0.511661


Grape     2
Orange    3
Name: A, dtype: int64

Grape     0.396149
Orange    0.542944
Name: A, dtype: float64

Grape   count    2.000000
        mean     0.396149
        std      0.170740
        min      0.275418
        25%      0.335784
        50%      0.396149
        75%      0.456515
        max      0.516881
Orange  count    3.000000
        mean     0.600112
        std      0.364946
        min      0.267124
        25%      0.405034
        50%      0.542944
        75%      0.766606
        max      0.990268
Name: A, dtype: float64

Unnamed: 0_level_0,Grape,Grape,Orange,Orange,Orange
Unnamed: 0_level_1,B,D,A,C,E
count,5.0,5.0,5.0,5.0,5.0
mean,0.854945,0.53065,0.518527,0.702984,0.47849
std,0.0674,0.322268,0.293872,0.291592,0.296211
min,0.781254,0.067957,0.267124,0.208339,0.240107
25%,0.7905,0.413925,0.275418,0.715213,0.276824
50%,0.873345,0.552929,0.516881,0.755569,0.374432
75%,0.891574,0.682691,0.542944,0.909608,0.533042
max,0.938055,0.935748,0.990268,0.926194,0.968044


---


Unnamed: 0,Grape,Orange
foo,0.643635,0.955973
bar,0.731972,0.286063
baz,0.503006,0.477808
qux,0.671714,0.60183
quuz,0.913661,0.511661


Unnamed: 0_level_0,A,A,B,B,C,C,D,D,E,E
Unnamed: 0_level_1,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean
L1,3,0.600112,3,0.900991,3,0.850338,3,0.472544,3,0.494992
L2,2,0.396149,2,0.785877,2,0.481954,2,0.61781,2,0.453737


Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B,C,C,D,D,E,E
Unnamed: 0_level_1,Unnamed: 1_level_1,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean,MyCount,MyMean
L1,AAA,2,0.628696,2,0.9057,2,0.917901,2,0.240941,2,0.604076
L1,BBB,1,0.542944,1,0.891574,1,0.715213,1,0.935748,1,0.276824
L2,AAA,1,0.275418,1,0.781254,1,0.208339,1,0.682691,1,0.374432
L2,BBB,1,0.516881,1,0.7905,1,0.755569,1,0.552929,1,0.533042


Unnamed: 0_level_0,A,A,B
Unnamed: 0_level_1,count,sum,mean
L1,3,1.800337,0.900991
L2,2,0.792299,0.785877


Unnamed: 0_level_0,A,A,B
Unnamed: 0_level_1,count,sum,mean
0,3,1.800337,0.900991
1,2,0.792299,0.785877


In [22]:
# transform: the result has the same shape as the input, with each group's
# value broadcast back to every member of the group

df.groupby(info_dict, axis=1).count()
# np.count_nonzero counts non-zero entries while count() counts non-null ones;
# the two agree here because the data contains neither zeros nor NaNs.
df.groupby(info_dict, axis=1).transform(np.count_nonzero)
df.groupby(info_dict, axis=1).transform(np.mean)

print('-----')

# transform with custom function to subtract group mean from each group
demeaned = df.groupby(info_dict, axis=1).transform(lambda x: x - x.mean())
demeaned
# sanity check: group means of the demeaned data are zero up to floating-point error
demeaned.groupby(info_dict, axis=1).mean()


Unnamed: 0,Grape,Orange
foo,2,3
bar,2,3
baz,2,3
qux,2,3
quuz,2,3


Unnamed: 0,A,B,C,D,E
foo,3,2,3,2,3
bar,3,2,3,2,3
baz,3,2,3,2,3
qux,3,2,3,2,3
quuz,3,2,3,2,3


Unnamed: 0,A,B,C,D,E
foo,0.955973,0.643635,0.955973,0.643635,0.955973
bar,0.286063,0.731972,0.286063,0.731972,0.286063
baz,0.477808,0.503006,0.477808,0.503006,0.477808
qux,0.60183,0.671714,0.60183,0.671714,0.60183
quuz,0.511661,0.913661,0.511661,0.913661,0.511661


-----


Unnamed: 0,A,B,C,D,E
foo,0.034295,0.22971,-0.046366,-0.22971,0.012071
bar,-0.010645,0.049282,-0.077724,-0.049282,0.088369
baz,-0.210684,0.435049,0.448385,-0.435049,-0.237701
qux,-0.08495,0.118785,0.153738,-0.118785,-0.068788
quuz,0.031284,-0.022087,0.203553,0.022087,-0.234837


Unnamed: 0,Grape,Orange
foo,0.0,7.401487e-17
bar,0.0,1.850372e-17
baz,5.5511150000000004e-17,-3.700743e-17
qux,-5.5511150000000004e-17,7.401487e-17
quuz,0.0,-3.700743e-17


In [13]:
# apply: call a function once per group, handing it that group's sub-frame

fruit_counts = df.groupby(info_dict, axis=1).count()
fruit_counts
# the function sees the whole sub-frame of a group, so this yields one number per group
nonzero_per_fruit = df.groupby(info_dict, axis=1).apply(np.count_nonzero)
nonzero_per_fruit
# df.groupby(info_dict, axis=1).apply(np.mean) - error about Series having no axis named 1

# describe() directly on the GroupBy and describe() through apply give the same table
fruit_describe = df.groupby(info_dict, axis=1).describe()
fruit_describe
fruit_describe_via_apply = df.groupby(info_dict, axis=1).apply(lambda block: block.describe())
fruit_describe_via_apply

# ToDo


Unnamed: 0,Grape,Orange
foo,2,3
bar,2,3
baz,2,3
qux,2,3
quuz,2,3


Grape     10
Orange    15
dtype: int64

Unnamed: 0_level_0,Grape,Grape,Orange,Orange,Orange
Unnamed: 0_level_1,B,D,A,C,E
count,5.0,5.0,5.0,5.0,5.0
mean,0.854945,0.53065,0.518527,0.702984,0.47849
std,0.0674,0.322268,0.293872,0.291592,0.296211
min,0.781254,0.067957,0.267124,0.208339,0.240107
25%,0.7905,0.413925,0.275418,0.715213,0.276824
50%,0.873345,0.552929,0.516881,0.755569,0.374432
75%,0.891574,0.682691,0.542944,0.909608,0.533042
max,0.938055,0.935748,0.990268,0.926194,0.968044


Unnamed: 0_level_0,Grape,Grape,Orange,Orange,Orange
Unnamed: 0_level_1,B,D,A,C,E
count,5.0,5.0,5.0,5.0,5.0
mean,0.854945,0.53065,0.518527,0.702984,0.47849
std,0.0674,0.322268,0.293872,0.291592,0.296211
min,0.781254,0.067957,0.267124,0.208339,0.240107
25%,0.7905,0.413925,0.275418,0.715213,0.276824
50%,0.873345,0.552929,0.516881,0.755569,0.374432
75%,0.891574,0.682691,0.542944,0.909608,0.533042
max,0.938055,0.935748,0.990268,0.926194,0.968044


In [19]:
# filter

# ToDo


In [20]:
# Pivot tables - multi-dimensional generalization of groupby
# pivot_table(values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)

# ToDo


In [21]:
# Cross tabulation

# ToDo
