In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo dataset used by every example below: one row per clothing item.
clothes = pd.DataFrame(
    [
        ('pants', 'red', 20, 125),
        ('shirt', 'blue', 35, 440),
        ('shirt', 'green', 50, 680),
        ('pants', 'blue', 40, 200),
        ('shirt', 'green', 100, 395),
        ('pants', 'red', 75, 485),
    ],
    columns=['type', 'color', 'price_usd', 'mass_g'],
)

clothes

Unnamed: 0,type,color,price_usd,mass_g
0,pants,red,20,125
1,shirt,blue,35,440
2,shirt,green,50,680
3,pants,blue,40,200
4,shirt,green,100,395
5,pants,red,75,485


In [4]:
# groupby() on its own does not return a dataframe: printing the result and
# its type shows that it is a DataFrameGroupBy object.
grouped = clothes.groupby('type')
print(grouped, type(grouped), sep='\n')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe036db4d10>
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


In [5]:
# Average the numeric columns within each clothing type.
# numeric_only=True is needed because `clothes` also has the string column
# 'color': on pandas >= 2.0 mean() no longer drops such columns silently and
# raises TypeError instead.
grouped = clothes.groupby('type')
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,price_usd,mass_g
type,Unnamed: 1_level_1,Unnamed: 2_level_1
pants,45.0,270.0
shirt,61.666667,505.0


In [6]:
# Groups can also be formed from several columns at once: each distinct
# (type, color) pair becomes one group, and min() is taken within each group.
clothes.groupby(['type', 'color']).min()

Unnamed: 0_level_0,Unnamed: 1_level_0,price_usd,mass_g
type,color,Unnamed: 2_level_1,Unnamed: 3_level_1
pants,blue,40,200
pants,red,20,125
shirt,blue,35,440
shirt,green,50,395


In [7]:
# To get just the number of rows in each group, use the size() method.
# The result is a Series indexed by the group keys.
clothes.groupby(['type', 'color']).size()

type   color
pants  blue     1
       red      2
shirt  blue     1
       green    2
dtype: int64

In [9]:
# Same count, grouping on a single column this time.
clothes.groupby('color').size()

color
blue     2
green    2
red      2
dtype: int64

Commonly used aggregation methods that can be applied to groups:

    count(): The number of non-null values in each group

    sum(): The sum of values in each group

    mean(): The mean of values in each group

    median(): The median of values in each group

    min(): The minimum value in each group

    max(): The maximum value in each group

    std(): The standard deviation of values in each group

    var(): The variance of values in each group

In [12]:
# Apply both sum() and mean() to the price_usd and mass_g columns of the
# clothes dataframe. The result has one row per function.
clothes[['price_usd', 'mass_g']].agg(['sum', 'mean'])

Unnamed: 0,price_usd,mass_g
sum,320.0,2325.0
mean,53.333333,387.5


In [14]:
# Different functions can be applied to different columns by passing a dict
# that maps column name -> function name(s). Function/column combinations
# that were not requested are left empty (NaN) in the result.
clothes.agg({'price_usd': 'sum',
            'mass_g': ['mean', 'median']
            })

Unnamed: 0,price_usd,mass_g
sum,320.0,
mean,,387.5
median,,417.5


In [15]:
# Apply sum() and mean() across axis 1: instead of applying the functions
# down each column, they are applied over each row, so the result has one
# row per input row and one column per function.
clothes[['price_usd', 'mass_g']].agg(['sum', 'mean'], axis=1)

Unnamed: 0,sum,mean
0,145.0,72.5
1,475.0,237.5
2,730.0,365.0
3,240.0,120.0
4,495.0,247.5
5,560.0,280.0


In [16]:
# Show the two columns used in the aggregation examples above.
clothes[['price_usd', 'mass_g']]

Unnamed: 0,price_usd,mass_g
0,20,125
1,35,440
2,50,680
3,40,200
4,100,395
5,75,485


The groupby() and agg() methods are often used together. In such cases, first call groupby() on a dataframe, then call agg() on the resulting groupby object. For reference, here is the clothes dataframe once again.

In [17]:
# Display the full clothes dataframe again for reference.
clothes

Unnamed: 0,type,color,price_usd,mass_g
0,pants,red,20,125
1,shirt,blue,35,440
2,shirt,green,50,680
3,pants,blue,40,200
4,shirt,green,100,395
5,pants,red,75,485


In [18]:
# NOTE(review): this explanation is a bare string expression in a code cell,
# so the cell simply echoes it back as output; it would read better as a
# markdown cell.
'''In the following example, the items in clothes are grouped by color, then each of those groups has the mean() and max() functions applied to them at the price_usd and mass_g columns.'''

'In the following example, the items in clothes are grouped by color, then each of those groups has the mean() and max() functions applied to them at the price_usd and mass_g columns.'

In [20]:
# For every color group, compute the mean and max of both numeric columns.
clothes.groupby('color').agg(
    {column: ['mean', 'max'] for column in ('price_usd', 'mass_g')}
)

Unnamed: 0_level_0,price_usd,price_usd,mass_g,mass_g
Unnamed: 0_level_1,mean,max,mean,max
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
blue,37.5,40,320.0,440
green,75.0,100,537.5,680
red,47.5,75,305.0,485


In [26]:
# Passing a list of function names to agg() applies every function to each
# selected column, so the result matches the dict-based call above, which
# named the same two columns explicitly.
# The numeric columns are selected explicitly because 'type' holds strings
# and cannot be averaged: pandas >= 2.0 raises TypeError for it instead of
# silently dropping the column.
clothes.groupby('color')[['price_usd', 'mass_g']].agg(['mean', 'max'])

Unnamed: 0_level_0,price_usd,price_usd,mass_g,mass_g
Unnamed: 0_level_1,mean,max,mean,max
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
blue,37.5,40,320.0,440
green,75.0,100,537.5,680
red,47.5,75,305.0,485


In [27]:
# Group by two keys and apply two functions. Note that `grouped` is rebound
# here: earlier cells used it for a DataFrameGroupBy object, from this point
# on it is the aggregated DataFrame, which has a MultiIndex on both its rows
# and its columns.
grouped = clothes.groupby(['color', 'type']).agg(['mean', 'min'])
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,price_usd,price_usd,mass_g,mass_g
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,mean,min
color,type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
blue,pants,40.0,40,200.0,200
blue,shirt,35.0,35,440.0,440
green,shirt,75.0,50,537.5,395
red,pants,47.5,20,305.0,125


In [29]:
# Row labels: a two-level MultiIndex of (color, type) pairs.
grouped.index

MultiIndex([( 'blue', 'pants'),
            ( 'blue', 'shirt'),
            ('green', 'shirt'),
            (  'red', 'pants')],
           names=['color', 'type'])

In [30]:
# Column labels: a two-level MultiIndex of (column, function) pairs.
grouped.columns

MultiIndex([('price_usd', 'mean'),
            ('price_usd',  'min'),
            (   'mass_g', 'mean'),
            (   'mass_g',  'min')],
           )

In [31]:
# To select a first-level (top) column; this returns every function that was
# computed for price_usd:
grouped.loc[:, 'price_usd']

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min
color,type,Unnamed: 2_level_1,Unnamed: 3_level_1
blue,pants,40.0,40
blue,shirt,35.0,35
green,shirt,75.0,50
red,pants,47.5,20


In [32]:
# To select a second-level (bottom) column, pass the full
# (column, function) tuple; the result is a Series:
grouped.loc[:, ('price_usd', 'min')]

color  type 
blue   pants    40
       shirt    35
green  shirt    50
red    pants    20
Name: (price_usd, min), dtype: int64

In [33]:
# To select a first-level (left-most) row label, i.e. every row whose color
# is 'blue':
grouped.loc['blue', :]

Unnamed: 0_level_0,price_usd,price_usd,mass_g,mass_g
Unnamed: 0_level_1,mean,min,mean,min
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
pants,40.0,40,200.0,200
shirt,35.0,35,440.0,440


In [34]:
# To select a single row, pass its full (color, type) label. The row comes
# back as a Series, so the integer min values are displayed as floats.

grouped.loc[('green', 'shirt'), :]

price_usd  mean     75.0
           min      50.0
mass_g     mean    537.5
           min     395.0
Name: (green, shirt), dtype: float64

In [35]:
# You can even select an individual value by giving both the full row label
# and the full column label:
grouped.loc[('blue', 'shirt'), ('mass_g', 'mean')]

440.0

In [36]:
# To keep the group keys as ordinary columns instead of a row MultiIndex,
# pass as_index=False to groupby():

clothes.groupby(['color', 'type'], as_index=False).mean()

Unnamed: 0,color,type,price_usd,mass_g
0,blue,pants,40.0,200.0
1,blue,shirt,35.0,440.0
2,green,shirt,75.0,537.5
3,red,pants,47.5,305.0
