In [4]:
import numpy as np
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a string holding a Python expression. The
    expression is evaluated with ``eval`` whenever a representation is
    requested, so it must resolve against this module's globals; its source
    text is shown as a label next to the rendered value.

    NOTE: ``eval`` executes arbitrary code -- only pass trusted, hard-coded
    expression strings.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Keep the raw expression strings; evaluation is deferred to the repr hooks.
        self.args = args

    def _repr_html_(self):
        """Return one floated HTML block per expression, joined by newlines."""
        import html  # local import so the class stays self-contained in its cell

        blocks = []
        for expr in self.args:
            value = eval(expr)
            render = getattr(value, '_repr_html_', None)
            body = render() if callable(render) else None
            if body is None:
                # No rich HTML repr available (e.g. a pandas Series): fall back
                # to the escaped plain-text repr instead of raising.
                body = '<pre>{0}</pre>'.format(html.escape(repr(value)))
            # Escape the label so expressions containing <, > or & stay visible.
            label = html.escape(expr, quote=False)
            blocks.append(self.template.format(label, body))
        return '\n'.join(blocks)

    def __repr__(self):
        """Return the plain-text fallback: each expression followed by its repr."""
        parts = [expr + '\n' + repr(eval(expr)) for expr in self.args]
        return '\n\n'.join(parts)

<img src="img/panda_aggregations.png" style="width:375px;float:left">
<img src="img/numpy_aggregations.png" style="width:500px;float:left">

In [5]:
# Small demo frame: six rows whose keys cycle A/B/C and whose data runs 1..6.
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data': range(1, 7),
    },
    columns=['key', 'data'],
)
df

Unnamed: 0,key,data
0,A,1
1,B,2
2,C,3
3,A,4
4,B,5
5,C,6


In [469]:
# Column-wise mean, spelled three equivalent ways (default axis, axis=0, axis="rows").
# numeric_only=True skips the string column 'key': pandas >= 2.0 raises a
# TypeError on non-numeric columns instead of silently dropping them.
val = df.mean(numeric_only=True)
val = df.mean(axis=0, numeric_only=True)
val = df.mean(axis="rows", numeric_only=True)
print(val)
print("type={0}".format(type(val)))
print("======")
# Reducing a single column returns a scalar rather than a Series.
print(df['data'].mean())
print("type={0}".format(type(df['data'].mean())))
print("======")
df_test = df.copy()
df_test['data2'] = df_test['data'] + 1
# Row-wise mean over the numeric columns (axis=1 and axis="columns" are equivalent).
val = df_test.mean(axis=1, numeric_only=True)
val = df_test.mean(axis="columns", numeric_only=True)
print(val)
print("type={0}".format(type(val)))

data    3.5
dtype: float64
type=<class 'pandas.core.series.Series'>
3.5
type=<class 'numpy.float64'>
0    1.5
1    2.5
2    3.5
3    4.5
4    5.5
5    6.5
dtype: float64
type=<class 'pandas.core.series.Series'>


In [120]:
# Per-key totals of every remaining column (this first result is not displayed).
df.groupby(by='key').sum()
# Same totals for 'data' only; to_frame() turns the Series back into a DataFrame.
df.groupby(by='key')['data'].sum().to_frame()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,5
B,7
C,9


<img src="img/groupby_sum.png" style="width:500px;float:left">

In [121]:
# Iterating over a GroupBy yields (group label, sub-frame) pairs.
for key, group in df.groupby(by='key'):
    print("{0} - shape={1}".format(key, group.shape))

A - shape=(2, 2)
B - shape=(2, 2)
C - shape=(2, 2)


In [470]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame.
df.describe()
# Same call after dropping rows with missing values; only this last
# expression is rendered as the cell output.
df.dropna().describe()

Unnamed: 0,data
count,6.0
mean,3.5
std,1.870829
min,1.0
25%,2.25
50%,3.5
75%,4.75
max,6.0


In [330]:
# Summary statistics computed separately for each key.
df.groupby('key').describe()
# Same statistics restricted to the 'data' column; only this last expression
# is rendered as the cell output.
df.groupby('key')['data'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2.0,2.5,2.12132,1.0,1.75,2.5,3.25,4.0
B,2.0,3.5,2.12132,2.0,2.75,3.5,4.25,5.0
C,2.0,4.5,2.12132,3.0,3.75,4.5,5.25,6.0


### AGGREGATION

In [8]:
# Several reductions at once. The reductions are named with pandas' string
# aliases: passing callables such as np.median or the builtin max to a groupby
# aggregation triggers a FutureWarning in recent pandas releases, which
# recommend the equivalent string instead.
df.groupby('key').aggregate(['min', 'median', 'max'])
# .agg is the short alias of .aggregate.
df.groupby('key').agg(['min', 'median', 'max'])
# Selecting the column first aggregates only 'data'.
df.groupby('key')['data'].agg(['min', 'median', 'max'])
# Named aggregation: each keyword becomes an output column computed from the
# given input column with the given reduction.
df.groupby('key').agg(
    min=pd.NamedAgg(column="data", aggfunc="min"),
    median=pd.NamedAgg(column="data", aggfunc="median"),
    max=pd.NamedAgg(column="data", aggfunc="max"),
)

Unnamed: 0_level_0,min,median,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,2.5,4
B,2,3.5,5
C,3,4.5,6


In [125]:
# Apply the operation to an existing column: the dict maps a column name to
# the aggregation to run on it.
df.groupby('key').agg({'data': 'min'})

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1
B,2
C,3


In [145]:
# Aggregate by name along the rows. The string column 'key' is excluded first:
# recent pandas versions raise a TypeError when asked for the mean of
# non-numeric data instead of silently dropping the column.
df.select_dtypes(include="number").agg("mean", axis="rows")

data    3.5
dtype: float64

### FILTERING

In [465]:
def func(x):
    """Group predicate: True when the mean of the group's 'data' column exceeds 4."""
    group_mean = x['data'].mean()
    return group_mean > 4
# Left: per-key means. Right: the rows of the groups for which func returns True.
display('df.groupby("key").mean()', 'df.groupby("key").filter(func)')

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,2.5
B,3.5
C,4.5

Unnamed: 0,key,data
2,C,3
5,C,6


### TRANSFORMATION
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transform.html

In [313]:
df_test = df.copy()

# Broadcast each key's total of 'data' back onto its rows, by string alias ...
df_test['sum'] = df_test.groupby(by='key')['data'].transform('sum')

# ... and by an equivalent callable.
df_test['sum'] = df_test.groupby(by='key')['data'].transform(lambda grp: grp.sum())

def func(x):
    """Debug helper: print the received object and its type, then return its sum."""
    for shown in (x, type(x)):
        print(shown)
    return x.sum()
# Third variant: a named function, which also prints each group it receives.
df_test['sum'] = df_test.groupby(by='key')['data'].transform(func)

# Share of each row within its key's total, computed two equivalent ways.
df_test["percent"] = df_test["data"].div(df_test.groupby(by='key')["data"].transform('sum'))
df_test["percent"] = df_test["data"].div(df_test["sum"])

display('df_test', 'df.groupby("key")["data"].sum().to_frame()')
# 1 + 4 = 5 | 2 + 5 = 7 | 3 + 6 = 9 | ...

0    1
3    4
Name: A, dtype: int64
<class 'pandas.core.series.Series'>
1    2
4    5
Name: B, dtype: int64
<class 'pandas.core.series.Series'>
2    3
5    6
Name: C, dtype: int64
<class 'pandas.core.series.Series'>


Unnamed: 0,key,data,sum,percent
0,A,1,5,0.2
1,B,2,7,0.285714
2,C,3,9,0.333333
3,A,4,5,0.8
4,B,5,7,0.714286
5,C,6,9,0.666667

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,5
B,7
C,9


### THE APPLY METHOD
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

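Apply a function along an axis of the DataFrame. The cell below uses the group-wise variant, `groupby(...).apply`, which calls the function once per group.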

In [337]:
# Work on a copy so the shared df is left untouched.
df_test = df.copy()
def func(x):
    """Return a copy of the group with 'sum' and 'percent' columns added.

    x is the sub-frame of one group. 'sum' holds the group's total of 'data'
    on every row and 'percent' holds each row's share of that total. The group
    is printed (with its type) to show what the function receives.

    The input is not modified: pandas documents that functions which mutate the
    object passed in by apply can produce unexpected behaviour.
    """
    print(x)
    print(type(x))
    total = x['data'].sum()
    return x.assign(sum=total, percent=x['data'] / total)
# group_keys=False keeps the original row index on the result. Since
# pandas 2.0 the default (group_keys=True) would prepend the group label as an
# extra index level even when func returns a frame shaped like its input.
# NOTE(review): newer pandas releases deprecate handing the grouping column to
# the applied function -- confirm against the installed version.
df_test.groupby('key', group_keys=False).apply(func)

  key  data
0   A     1
3   A     4
<class 'pandas.core.frame.DataFrame'>
  key  data
1   B     2
4   B     5
<class 'pandas.core.frame.DataFrame'>
  key  data
2   C     3
5   C     6
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,key,data,sum,percent
0,A,1,5,0.2
1,B,2,7,0.285714
2,C,3,9,0.333333
3,A,4,5,0.8
4,B,5,7,0.714286
5,C,6,9,0.666667


### SPECIFYING THE SPLIT KEY

In [359]:
df_test = df.copy()
# Explicit group labels, one per row: rows 0, 2 and 5 get label 0, rows 1 and 3
# get label 1, row 4 gets label 2.
key = [0, 1, 0, 1, 2, 0]
# The second expression string is evaluated by display, so `key` must remain a global.
display('df_test', 'df_test.groupby(key)["data"].sum().to_frame()')

Unnamed: 0,key,data
0,A,1
1,B,2
2,C,3
3,A,4
4,B,5
5,C,6

Unnamed: 0,data
0,10
1,6
2,5


In [358]:
# A dictionary or series mapping index to group
df_test = df.copy()
# Move 'key' into the index so the mapping below is looked up by index label.
df_test = df_test.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
# The second expression string is evaluated by display, so `mapping` must remain a global.
display('df_test', 'df_test.groupby(mapping)["data"].sum().to_frame()')

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1
B,2
C,3
A,4
B,5
C,6

Unnamed: 0,data
consonant,16
vowel,5


In [394]:
# Any Python function
df_test = df.copy()
# Move 'key' into the index; the grouping function is then applied to the index labels.
df_test = df_test.set_index('key')
def func(x):
    """Classify a label: 'A' is a vowel, 'B' or 'C' a consonant, anything else 'not found'."""
    if x == 'A':
        return 'vowel'
    if x in ['B', 'C']:
        return 'consonant'
    return 'not found'
# Left: the re-indexed frame. Right: 'data' summed per label returned by func.
display('df_test', 'df_test.groupby(func)["data"].sum().to_frame()')

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1
B,2
C,3
A,4
B,5
C,6

Unnamed: 0,data
consonant,16
vowel,5


In [406]:
# Multi-index
df_test = df.copy()
# Move 'key' into the index so both grouping keys below operate on index labels.
df_test = df_test.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
def func(x):
    """Return the lower-cased form of a label."""
    lowered = x.lower()
    return lowered
# Grouping by a list of keys (function result, mapping result) yields a two-level index.
display('df_test', 'df_test.groupby([func, mapping])["data"].sum().to_frame()')

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1
B,2
C,3
A,4
B,5
C,6

Unnamed: 0,Unnamed: 1,data
a,vowel,5
b,consonant,7
c,consonant,9


### GROUPING

In [425]:
df_test = df.copy()
df_test['year'] = [2001, 2002, 2003, 2010, 2011, 2012]
# Floor each year to the start of its decade and label it, e.g. 2003 -> "2000s".
decade = (10 * (df_test['year'] // 10)).astype(str) + 's'
# Group by key and decade, then pivot the decade level into columns.
display('df_test', 'df_test.groupby(["key", decade])["data"].sum().unstack().fillna(0)')

Unnamed: 0,key,data,year
0,A,1,2001
1,B,2,2002
2,C,3,2003
3,A,4,2010
4,B,5,2011
5,C,6,2012

year,2000s,2010s
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1,4
B,2,5
C,3,6
