In [1]:
import pandas as pd
import sys
%matplotlib inline

In [2]:
# record the interpreter and pandas versions used to produce the outputs below
print('Python version ' + sys.version, 'Pandas version ' + pd.__version__, sep='\n')

Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)]
Pandas version 0.20.1


# Group By  

Everything you need to know about grouping in Pandas

In [3]:
# sample dataset: one trade per day over ten consecutive days,
# alternating buy/sell orders spread across three symbols
raw = pd.DataFrame({
    'date': pd.date_range('1/1/2014', periods=10),
    'symbol': ['A'] * 2 + ['B'] * 2 + ['C'] * 6,
    'key': ['buy', 'sell'] * 5,
    'volume': [213, 4325, 456, 76, 45, 43, 432, 87, 67, 65],
})
raw

Unnamed: 0,date,key,symbol,volume
0,2014-01-01,buy,A,213
1,2014-01-02,sell,A,4325
2,2014-01-03,buy,B,456
3,2014-01-04,sell,B,76
4,2014-01-05,buy,C,45
5,2014-01-06,sell,C,43
6,2014-01-07,buy,C,432
7,2014-01-08,sell,C,87
8,2014-01-09,buy,C,67
9,2014-01-10,sell,C,65


For this example it only makes sense to group by the ***symbol*** or the ***key*** columns. All of the other columns have unique values, so grouping by them would simply put every row in its own group. Let us begin...

In [4]:
# work on an independent (deep) copy so the original `raw` frame stays untouched
df = raw.copy(deep=True)

# How to group by one column

In [5]:
# one group per distinct value found in the 'symbol' column
group1 = df.groupby(by='symbol')
group1

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000009D1BBE0>

# How to group by multiple columns

In [6]:
# one group per distinct (symbol, key) combination
group2 = df.groupby(by=['symbol', 'key'])
group2

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000093F4160>

# How to iterate over the group  

In [7]:
# label - the value of the grouping column shared by every row in the group
# rows  - the slice of the DataFrame that belongs to that label

for label, rows in group1:
    print(label, rows, '--------', sep='\n')

A
        date   key symbol  volume
0 2014-01-01   buy      A     213
1 2014-01-02  sell      A    4325
--------
B
        date   key symbol  volume
2 2014-01-03   buy      B     456
3 2014-01-04  sell      B      76
--------
C
        date   key symbol  volume
4 2014-01-05   buy      C      45
5 2014-01-06  sell      C      43
6 2014-01-07   buy      C     432
7 2014-01-08  sell      C      87
8 2014-01-09   buy      C      67
9 2014-01-10  sell      C      65
--------


In [8]:
# with two grouping columns the label is a (symbol, key) tuple
for pair, rows in group2:
    print(pair, rows, '--------', sep='\n')

('A', 'buy')
        date  key symbol  volume
0 2014-01-01  buy      A     213
--------
('A', 'sell')
        date   key symbol  volume
1 2014-01-02  sell      A    4325
--------
('B', 'buy')
        date  key symbol  volume
2 2014-01-03  buy      B     456
--------
('B', 'sell')
        date   key symbol  volume
3 2014-01-04  sell      B      76
--------
('C', 'buy')
        date  key symbol  volume
4 2014-01-05  buy      C      45
6 2014-01-07  buy      C     432
8 2014-01-09  buy      C      67
--------
('C', 'sell')
        date   key symbol  volume
5 2014-01-06  sell      C      43
7 2014-01-08  sell      C      87
9 2014-01-10  sell      C      65
--------


# How to apply built-in functions like sum and std

In [9]:
# select the numeric column explicitly: summing the 'date' and 'key' columns is
# not meaningful, and recent pandas versions no longer drop such columns silently
group1[['volume']].sum()

Unnamed: 0_level_0,volume
symbol,Unnamed: 1_level_1
A,4538
B,532
C,739


In [10]:
# select the numeric column explicitly: summing the 'date' column is not
# meaningful, and recent pandas versions no longer drop such columns silently
group2[['volume']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,volume
symbol,key,Unnamed: 2_level_1
A,buy,213
A,sell,4325
B,buy,456
B,sell,76
C,buy,544
C,sell,195


In [11]:
# a standard deviation only exists for numeric data, so restrict the
# calculation to 'volume' instead of relying on pandas to skip 'date' and 'key'
group1[['volume']].std()

Unnamed: 0_level_0,volume
symbol,Unnamed: 1_level_1
A,2907.623084
B,268.700577
C,152.15836


In [12]:
# restrict the calculation to the numeric 'volume' column; groups that contain
# a single row have no sample standard deviation and show up as NaN
group2[['volume']].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,volume
symbol,key,Unnamed: 2_level_1
A,buy,
A,sell,
B,buy,
B,sell,
C,buy,217.362217
C,sell,22.0


# How does "group by" work  

As you can see below, the ***getmax*** function just returns whatever you feed it. So what exactly is going on here?  

In [13]:
# create function
def getmax(group):
    """Return ``group`` exactly as it was received.

    Despite its name this version computes nothing; it is a pass-through
    used to show what ``apply`` hands to the function and gets back.
    """
    return group

# apply() calls getmax with each symbol group
group1.apply(getmax)

Unnamed: 0,date,key,volume
0,2014-01-01,buy,213
1,2014-01-02,sell,4325
2,2014-01-03,buy,456
3,2014-01-04,sell,76
4,2014-01-05,buy,45
5,2014-01-06,sell,43
6,2014-01-07,buy,432
7,2014-01-08,sell,87
8,2014-01-09,buy,67
9,2014-01-10,sell,65


We are basically sending each group to the function getmax, one at a time, and then gluing the pieces back together.

In [14]:
# one piece of the group: the rows whose symbol is 'A'
group1.get_group('A')
# for the two-column grouping the key is a tuple instead:
#group2.get_group(('A','buy'))

Unnamed: 0,date,key,volume
0,2014-01-01,buy,213
1,2014-01-02,sell,4325


In [15]:
# another piece: the rows whose symbol is 'B'
group1.get_group('B')

Unnamed: 0,date,key,volume
2,2014-01-03,buy,456
3,2014-01-04,sell,76


In [16]:
# send a piece to the function by hand, the way apply() does for every group
getmax(group1.get_group('B'))

Unnamed: 0,date,key,volume
2,2014-01-03,buy,456
3,2014-01-04,sell,76


As you can see the function simply returns what it was fed. So how does Pandas glue the pieces together? After it runs each group through the function, it uses the ***concat*** function to glue the pieces together.

In [17]:
# run two of the groups through getmax by hand ...
r1, r2 = (getmax(group1.get_group(symbol)) for symbol in ('A', 'B'))

# ... and stack the results on top of each other
pd.concat([r1, r2])

Unnamed: 0,date,key,volume
0,2014-01-01,buy,213
1,2014-01-02,sell,4325
2,2014-01-03,buy,456
3,2014-01-04,sell,76


# How to add a new column to a group

In [18]:
# get max number of each group
def getmax(group):
    """Return a copy of ``group`` with an extra 'max' column.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group; must contain a 'volume' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same rows plus a 'max' column in which
        every row carries the largest 'volume' found in the group.
    """
    # assign() builds a new frame, so the object passed in is never modified
    return group.assign(max=group['volume'].max())

# run getmax on each symbol group and collect the per-group results
group1.apply(getmax)

Unnamed: 0,date,key,volume,max
0,2014-01-01,buy,213,4325
1,2014-01-02,sell,4325,4325
2,2014-01-03,buy,456,456
3,2014-01-04,sell,76,456
4,2014-01-05,buy,45,432
5,2014-01-06,sell,43,432
6,2014-01-07,buy,432,432
7,2014-01-08,sell,87,432
8,2014-01-09,buy,67,432
9,2014-01-10,sell,65,432


# How to sum a column but keep the same shape of the df

In [19]:
# transform() returns one value per original row, so every row receives the
# total of its own group; the string alias 'sum' selects pandas' group-wise
# sum (passing the Python builtin `sum` is deprecated in newer pandas)
df['sumvolume'] = group1['volume'].transform('sum')
df

Unnamed: 0,date,key,symbol,volume,sumvolume
0,2014-01-01,buy,A,213,4538
1,2014-01-02,sell,A,4325,4538
2,2014-01-03,buy,B,456,532
3,2014-01-04,sell,B,76,532
4,2014-01-05,buy,C,45,739
5,2014-01-06,sell,C,43,739
6,2014-01-07,buy,C,432,739
7,2014-01-08,sell,C,87,739
8,2014-01-09,buy,C,67,739
9,2014-01-10,sell,C,65,739


# How to perform multiple aggregations at the same time

In [20]:
# several aggregations at once on the numeric column; string aliases are used
# instead of the Python builtins, and 'volume' is selected explicitly because
# not every aggregation is defined for the 'date' and 'key' columns
group1[['volume']].agg(['sum', 'max', 'min', 'std'])

Unnamed: 0_level_0,volume,volume,volume,volume
Unnamed: 0_level_1,sum,max,min,std
symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,4538,4325,213,2907.623084
B,532,456,76,268.700577
C,739,432,43,152.15836


# How to choose aggregation methods per column

In [21]:
# column name -> aggregation to run on that column
labels = dict([('date', 'first'), ('volume', 'sum')])

group1.agg(labels)

Unnamed: 0_level_0,date,volume
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2014-01-01,4538
B,2014-01-03,532
C,2014-01-05,739


# How to add custom labels to multiple aggregations

In [22]:
# (output column label, aggregation to run) pairs
labels = [
    ('getfirst', 'first'),
    ('getmax', 'max'),
    ('getmin', 'min'),
    ('getlast', 'last'),
    ('getsum', 'sum'),
]

# restrict to 'volume': 'sum' is not defined for the 'date' column, and newer
# pandas raises instead of silently leaving such columns out
group2[['volume']].agg(labels)

Unnamed: 0_level_0,Unnamed: 1_level_0,volume,volume,volume,volume,volume
Unnamed: 0_level_1,Unnamed: 1_level_1,getfirst,getmax,getmin,getlast,getsum
symbol,key,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,buy,213,213,213,213,213
A,sell,4325,4325,4325,4325,4325
B,buy,456,456,456,456,456
B,sell,76,76,76,76,76
C,buy,45,432,45,67,544
C,sell,43,87,43,65,195


**Author:** [HEDARO](http://www.hedaro.com)  