# Comprehensive Guide to Grouping and Aggregating with Pandas
Chris Moffitt. "Comprehensive Guide to Grouping and Aggregating with Pandas". _Practical Business Python_, 9 Nov. 2020, https://pbpython.com/groupby-agg.html.

In [50]:
import pandas as pd
import seaborn as sns

In [51]:
# Load the 'titanic' example dataset through seaborn; the examples below use it as `df`.
df = sns.load_dataset('titanic')

## Pandas aggregation options

### List

In [5]:
# A list of function names applies each aggregation to the selected column.
df['fare'].agg(['sum', 'mean'])

sum     28693.949300
mean       32.204208
Name: fare, dtype: float64

### Dictionary

In [8]:
# A dict maps each column to the aggregations to run on it; column/function
# pairs that were not requested show up empty (NaN) in the result.
df.agg({'fare': ['sum', 'mean'],
         'sex': ['count']})


Unnamed: 0,fare,sex
sum,28693.9493,
mean,32.204208,
count,,891.0


### Tuple

In [11]:
# Named aggregation: each keyword becomes the output row label and its value
# is a (column, function) tuple.
df.agg(fare_sum=('fare', 'sum'),
       fare_mean=('fare', 'mean'),
       sex_count=('sex', 'count'))


Unnamed: 0,fare,sex
fare_sum,28693.9493,
fare_mean,32.204208,
sex_count,,891.0


## Groupby
### Basic math

In [15]:
def mad(x):
    """Return the mean absolute deviation of ``x`` around its mean.

    Stand-in for the built-in ``'mad'`` aggregation string, which was
    deprecated in pandas 1.5 and removed in pandas 2.0. The function's name
    is used as the column label, so the result keeps its ``mad`` column.
    """
    return (x - x.mean()).abs().mean()


# Built-in aggregations are referenced by name; `mad` is the callable above.
agg_func_math = {
    'fare':
    ['sum', 'mean', 'median', 'min', 'max', 'std', 'var', mad, 'prod']
}
df.groupby(['embark_town']).agg(agg_func_math).round(2)


Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,sum,mean,median,min,max,std,var,mad,prod
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Cherbourg,10072.3,59.95,29.7,4.01,512.33,83.91,7041.39,53.02,6.193716e+250
Queenstown,1022.25,13.28,7.75,6.75,90.0,14.19,201.3,7.87,6.4586709999999994e+78
Southampton,17439.4,27.08,13.0,0.0,263.0,35.89,1287.95,21.3,0.0


Use describe to run multiple built-in aggregations at once:

In [18]:
# 'describe' expands into count, mean, std, min, the quartiles and max for each group.
agg_func_describe = {'fare': ['describe']}
df.groupby(['embark_town']).agg(agg_func_describe).round(2)


Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Cherbourg,168.0,59.95,83.91,4.01,13.7,29.7,78.5,512.33
Queenstown,77.0,13.28,14.19,6.75,7.75,7.75,15.5,90.0
Southampton,644.0,27.08,35.89,0.0,8.05,13.0,27.9,263.0


### Counting

In [21]:
# 'count' skips missing values while 'size' counts every row in the group,
# so the two differ wherever embark_town has NaNs (deck B in the output).
agg_func_count = {'embark_town': ['count', 'nunique', 'size']}
df.groupby(['deck']).agg(agg_func_count)

Unnamed: 0_level_0,embark_town,embark_town,embark_town
Unnamed: 0_level_1,count,nunique,size
deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,15,2,15
B,45,2,47
C,59,3,59
D,33,2,33
E,32,3,32
F,13,3,13
G,4,1,4


### First and last
Select the highest and lowest fare for each embark town (sort by fare first so that `first` and `last` pick the max and min values).

In [22]:
# After a descending sort on fare, 'first' is each town's highest fare and
# 'last' its lowest.
agg_func_selection = {'fare': ['first', 'last']}
(
    df.sort_values(by=['fare'], ascending=False)
    .groupby(['embark_town'])
    .agg(agg_func_selection)
)


Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,first,last
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,512.3292,4.0125
Queenstown,90.0,6.75
Southampton,263.0,0.0


Instead, use idxmax and idxmin to get the index labels of the rows that hold each group's max and min values:

In [23]:
# idxmax / idxmin report the index label of the row holding each group's
# highest / lowest fare, not the fare itself.
agg_func_max_min = {'fare': ['idxmax', 'idxmin']}
df.groupby(['embark_town']).agg(agg_func_max_min)


Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,idxmax,idxmin
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,258,378
Queenstown,245,143
Southampton,27,179


In [24]:
# 258 and 378 are the idxmax / idxmin labels reported for Cherbourg; pull their full rows.
df.loc[[258, 378]]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True


In [26]:
# Feed the per-class idxmax labels straight into .loc to get each class's highest-fare row.
df.loc[df.groupby('class')['fare'].idxmax()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
72,0,2,male,21.0,0,0,73.5,S,Second,man,True,,Southampton,no,True
159,0,3,male,,8,2,69.55,S,Third,man,True,,Southampton,no,False


### Other libraries

In [27]:
from scipy.stats import skew, mode
# Callables from other libraries work as aggregations too. scipy's mode and
# pd.Series.mode are both labelled 'mode' in the result, so the two columns
# can only be told apart by position.
agg_func_stats = {'fare': [skew, mode, pd.Series.mode]}
df.groupby(['embark_town']).agg(agg_func_stats)


Unnamed: 0_level_0,fare,fare,fare
Unnamed: 0_level_1,skew,mode,mode
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cherbourg,3.305112,"([7.2292], [15])",7.2292
Queenstown,4.265111,"([7.75], [30])",7.75
Southampton,3.640276,"([8.05], [43])",8.05


### Working with text

In [28]:
# `mode` is the scipy.stats function imported in an earlier cell; `set`
# collects each group's distinct deck values, NaN included.
agg_func_text = {'deck': [ 'nunique', mode, set]}
df.groupby(['class']).agg(agg_func_text)

Unnamed: 0_level_0,deck,deck,deck
Unnamed: 0_level_1,nunique,mode,set
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,5,"([C], [59])","{nan, E, C, A, D, B}"
Second,3,"([F], [8])","{nan, E, F, D}"
Third,3,"([F], [5])","{nan, E, G, F}"


### Custom Functions
Calculate the 25th percentile of the data using four approaches.

First, partial function:

In [29]:
from functools import partial
# Use partial to pre-bind q=0.25 on Series.quantile
q_25 = partial(pd.Series.quantile, q=0.25)
# partial objects do not get a __name__ automatically; set one so the result
# column has a readable label
q_25.__name__ = '25%'

In [30]:
# Define a function
def percentile_25(x):
    """Return the 25th percentile (first quartile) of ``x``."""
    first_quartile = x.quantile(0.25)
    return first_quartile

In [31]:
# Define a lambda function
lambda_25 = lambda x: x.quantile(.25)
# Override the default '<lambda>' name so the result column gets this label instead
lambda_25.__name__ = 'lambda_25%'

In [32]:
# Use a lambda function inline; it has no custom name, so its column is
# auto-labelled '<lambda_0>' in the result
agg_func = {
    'fare': [q_25, percentile_25, lambda_25, lambda x: x.quantile(.25)]
}

df.groupby(['embark_town']).agg(agg_func).round(2)

Unnamed: 0_level_0,fare,fare,fare,fare
Unnamed: 0_level_1,25%,percentile_25,lambda_25%,<lambda_0>
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Cherbourg,13.7,13.7,13.7,13.7
Queenstown,7.75,7.75,7.75,7.75
Southampton,8.05,8.05,8.05,8.05


### Custom function examples
Count number of null values:

In [34]:
def count_nulls(s):
    """Return how many entries of ``s`` are missing."""
    total_rows = s.size
    non_null_rows = s.count()
    return total_rows - non_null_rows

Include NaN values in unique counts:

In [35]:
def unique_nan(s):
    """Count the distinct values in ``s``, with NaN counted as a value of its own."""
    distinct_with_nan = s.nunique(dropna=False)
    return distinct_with_nan

Summary of all values together:

In [36]:
# Built-in names and the custom functions defined above can be mixed in one
# list; each custom function's name becomes its column label.
agg_func_custom_count = {
    'embark_town': ['count', 'nunique', 'size', unique_nan, count_nulls, set]
}
df.groupby(['deck']).agg(agg_func_custom_count)

Unnamed: 0_level_0,embark_town,embark_town,embark_town,embark_town,embark_town,embark_town
Unnamed: 0_level_1,count,nunique,size,unique_nan,count_nulls,set
deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,15,2,15,2,0,"{Southampton, Cherbourg}"
B,45,2,47,3,2,"{nan, Southampton, Cherbourg}"
C,59,3,59,3,0,"{Queenstown, Southampton, Cherbourg}"
D,33,2,33,2,0,"{Southampton, Cherbourg}"
E,32,3,32,3,0,"{Queenstown, Southampton, Cherbourg}"
F,13,3,13,3,0,"{Queenstown, Southampton, Cherbourg}"
G,4,1,4,1,0,{Southampton}


To calculate the 90th percentile, use quantile:

In [54]:
def percentile_90(x):
    """Return the 90th percentile of ``x``."""
    ninetieth = x.quantile(0.9)
    return ninetieth

For a trimmed mean that excludes the lowest and highest 10% of values, use the scipy stats function `trim_mean`:

In [52]:
from scipy.stats import trim_mean
def trim_mean_10(x):
    """Return the mean of ``x`` after trimming 10% of the observations from each tail."""
    proportion_to_cut = 0.1
    return trim_mean(x, proportion_to_cut)

For largest value, regardless of sort order:

In [53]:
def largest(x):
    """Return the largest value in ``x`` as a scalar, regardless of sort order.

    ``x.nlargest(1)`` hands back a one-row Series (an empty one when the
    group holds only NaN), whereas an aggregation function should reduce each
    group to a single value. ``max`` gives the same number for non-empty
    groups and NaN for an all-NaN group.
    """
    return x.max()

Incorporate [sparklines](https://pbpython.com/styling-pandas.html):

In [57]:
from sparklines import sparklines
import numpy as np
def sparkline_str(x):
    """Render the distribution of ``x`` as a one-line sparkline string.

    ``x`` is bucketed with ``np.histogram`` (default bin count) and the
    per-bin counts are passed to ``sparklines``; its output lines are joined
    into a single string.
    """
    counts, _edges = np.histogram(x)
    return ''.join(sparklines(counts))

All put together:

In [58]:
# Run all four custom functions on fare for every (class, embark_town) pair;
# each function's name becomes its column label.
agg_func_largest = {
    'fare': [percentile_90, trim_mean_10, largest, sparkline_str]
}
df.groupby(['class', 'embark_town']).agg(agg_func_largest)

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,fare,fare
Unnamed: 0_level_1,Unnamed: 1_level_1,percentile_90,trim_mean_10,largest,sparkline_str
class,embark_town,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
First,Cherbourg,227.525,85.408335,512.3292,█▇▂▁▃▁▁▁▁▂
First,Queenstown,90.0,90.0,90.0,▁▁▁▁▁█▁▁▁▁
First,Southampton,152.315,60.50016,263.0,▃█▄▃▂▂▁▁▂▂
Second,Cherbourg,41.5792,25.1675,41.5792,█▄▁▁▄▂▄▁▄▅
Second,Queenstown,12.35,12.35,12.35,▁▁▁▁▁█▁▁▁▁
Second,Southampton,31.75,18.202273,73.5,▂█▂▅▁▂▁▁▁▁
Third,Cherbourg,19.0229,10.677941,22.3583,▁█▃▂▁▄▃▁▂▂
Third,Queenstown,24.06,9.670476,29.125,█▁▁▂▁▁▁▂▁▂
Third,Southampton,31.275,11.501469,69.55,▁█▂▂▂▁▁▁▁▁


Get total fares for top 10 and bottom 10:

In [59]:
def top_10_sum(x):
    """Add up the ten largest values in ``x``."""
    ten_largest = x.nlargest(10)
    return ten_largest.sum()

def bottom_10_sum(x):
    """Add up the ten smallest values in ``x``."""
    ten_smallest = x.nsmallest(10)
    return ten_smallest.sum()


# Total of the ten highest and ten lowest fares within each class.
agg_func_top_bottom_sum = {
    'fare': [top_10_sum, bottom_10_sum]
}
df.groupby('class').agg(agg_func_top_bottom_sum)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,top_10_sum,bottom_10_sum
class,Unnamed: 1_level_2,Unnamed: 2_level_2
First,3361.2584,108.3709
Second,622.2376,42.0
Third,656.3374,36.1291


### Custom functions with multiple columns
Use groupby combined with apply:

In [60]:
def summary(x):
    """Summarise the ``fare`` column of the frame ``x``.

    Returns a Series with the fare total (``fare_sum``), average
    (``fare_mean``) and max-minus-min spread (``fare_range``), each rounded
    to a whole number.
    """
    fare = x['fare']
    stats = pd.Series({
        'fare_sum': fare.sum(),
        'fare_mean': fare.mean(),
        'fare_range': fare.max() - fare.min(),
    })
    return stats.round(0)

# apply hands summary each class's sub-DataFrame rather than a single column.
df.groupby(['class']).apply(summary)

Unnamed: 0_level_0,fare_sum,fare_mean,fare_range
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,18177.0,84.0,512.0
Second,3802.0,21.0,74.0
Third,6715.0,14.0,70.0


## Working with group objects
Figure out what percentage of total fares sold can be attributed to each embark_town and class combination (using assign and a lambda function to add a pct_total column):

In [61]:
# Sum fares per (embark_town, class), then express each sum as a share of the
# grand total. Selecting x['fare'] gives assign a Series, so the step does not
# depend on a one-column DataFrame being coerced into a column and keeps
# working if more aggregated columns are added.
df.groupby(['embark_town', 'class']).agg({
    'fare': 'sum'
}).assign(pct_total=lambda x: x['fare'] / x['fare'].sum())

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,pct_total
embark_town,class,Unnamed: 2_level_1,Unnamed: 3_level_1
Cherbourg,First,8901.075,0.311947
Cherbourg,Second,431.0917,0.015108
Cherbourg,Third,740.1295,0.025939
Queenstown,First,180.0,0.006308
Queenstown,Second,37.05,0.001298
Queenstown,Third,805.2043,0.028219
Southampton,First,8936.3375,0.313183
Southampton,Second,3333.7,0.116833
Southampton,Third,5169.3613,0.181165


Simpler to use [pd.crosstab](https://pbpython.com/pandas-crosstab.html):

In [62]:
# Sum fare for every town/class cell; normalize=True divides each cell by the
# grand total, giving the same shares as pct_total in a wide layout.
pd.crosstab(df['embark_town'],
            df['class'],
            values=df['fare'],
            aggfunc='sum',
            normalize=True)

class,First,Second,Third
embark_town,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cherbourg,0.311947,0.015108,0.025939
Queenstown,0.006308,0.001298,0.028219
Southampton,0.313183,0.116833,0.181165


Combine agg functions with pivot table:

In [63]:
# Reuse the agg_func_top_bottom_sum dict from the earlier cell as the pivot
# table's aggfunc: towns as rows, classes as columns.
pd.pivot_table(data=df,
            index=['embark_town'],
            columns=['class'],
            aggfunc=agg_func_top_bottom_sum)

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,bottom_10_sum,bottom_10_sum,bottom_10_sum,top_10_sum,top_10_sum,top_10_sum
class,First,Second,Third,First,Second,Third
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Cherbourg,282.9957,172.2041,68.25,3239.3542,334.6084,196.7457
Queenstown,180.0,37.05,73.5916,180.0,37.05,264.575
Southampton,108.3709,42.0,39.6291,2237.5251,614.5,656.3374


Show the cumulative total of fares within each town: aggregate by town and class first, then group by town and take the cumulative sum:

In [64]:
# First total the fares per (embark_town, class) pair.
fare_group = df.groupby(['embark_town', 'class']).agg({'fare': 'sum'})
# level=0 regroups on embark_town, so the running total restarts for each town.
fare_group.groupby(level=0).cumsum()

Unnamed: 0_level_0,Unnamed: 1_level_0,fare
embark_town,class,Unnamed: 2_level_1
Cherbourg,First,8901.075
Cherbourg,Second,9332.1667
Cherbourg,Third,10072.2962
Queenstown,First,180.0
Queenstown,Second,217.05
Queenstown,Third,1022.2543
Southampton,First,8936.3375
Southampton,Second,12270.0375
Southampton,Third,17439.3988


Summarize daily sales and convert to cumulative daily and quarterly view (use [pd.Grouper](https://pbpython.com/pandas-grouper-agg.html)).

Here, include total daily sales as well as cumulative quarter amount:

In [67]:
# Download the sample sales workbook (requires network access).
sales = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/2018_Sales_Total_v2.xlsx?raw=True')

# Total 'ext price' per day, then turn the date index back into a column.
daily_sales = sales.groupby([pd.Grouper(key='date', freq='D')
                            ]).agg(daily_sales=('ext price',
                                                'sum')).reset_index()
# Running total that restarts each quarter. Selecting the column and calling
# cumsum() returns a Series aligned to daily_sales' index, so the assignment
# does not depend on a one-column DataFrame being coerced into a column.
daily_sales['quarter_sales'] = daily_sales.groupby(
    pd.Grouper(key='date', freq='Q'))['daily_sales'].cumsum()


Group daily results, then group by quarter and use cumulative sum:

In [68]:
# One chain: daily totals -> regroup on the date index by quarter ->
# cumulative sum within each quarter -> rename the column.
(
    sales.groupby([pd.Grouper(key='date', freq='D')])
    .agg(daily_sales=('ext price', 'sum'))
    .groupby(pd.Grouper(freq='Q'))
    .agg({'daily_sales': 'cumsum'})
    .rename(columns={'daily_sales': 'quarterly_sales'})
)

Unnamed: 0_level_0,quarterly_sales
date,Unnamed: 1_level_1
2018-01-01,6766.16
2018-01-02,8318.07
2018-01-03,12597.03
2018-01-04,18641.13
2018-01-05,20613.07
...,...
2018-12-27,480817.47
2018-12-28,484389.92
2018-12-29,489227.01
2018-12-30,494106.67


## Flattening Hierarchical Column Indices

In [69]:
# A list of aggregations on a column yields two-level column labels: (column, function).
df.groupby(['embark_town', 'class']).agg({'fare': ['sum', 'mean']}).round(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
embark_town,class,Unnamed: 2_level_2,Unnamed: 3_level_2
Cherbourg,First,8901.0,105.0
Cherbourg,Second,431.0,25.0
Cherbourg,Third,740.0,11.0
Queenstown,First,180.0,90.0
Queenstown,Second,37.0,12.0
Queenstown,Third,805.0,11.0
Southampton,First,8936.0,70.0
Southampton,Second,3334.0,20.0
Southampton,Third,5169.0,15.0


In [70]:
# as_index=False keeps the grouping keys as ordinary columns.
multi_df = (
    df.groupby(['embark_town', 'class'], as_index=False)
    .agg({'fare': ['sum', 'mean']})
)

# Collapse each column tuple into a single 'level0_level1' string; rstrip
# drops the trailing underscore left when the second level is empty.
multi_df.columns = [
    '_'.join(levels).rstrip('_') for levels in multi_df.columns.values
]

## Subtotals
Add a subtotal using the [sidetable](https://github.com/chris1610/sidetable) package.

In [71]:
import sidetable
# stb.subtotal() adds the '... - subtotal' rows shown in the output to the grouped sums.
df.groupby(['class', 'embark_town', 'sex']).agg({'fare': 'sum'}).stb.subtotal()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fare
class,embark_town,sex,Unnamed: 3_level_1
First,Cherbourg,female,4972.5333
First,Cherbourg,male,3928.5417
First,Cherbourg,First | Cherbourg - subtotal,8901.075
First,Queenstown,female,90.0
First,Queenstown,male,90.0
First,Queenstown,First | Queenstown - subtotal,180.0
First,Southampton,female,4753.2917
First,Southampton,male,4183.0458
First,Southampton,First | Southampton - subtotal,8936.3375
First,First - subtotal,,18017.4125
