In [74]:
# Display the value of every top-level expression in a cell, not only the
# last one (IPython's default for this setting is 'last_expr').
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [75]:
import pandas as pd
import numpy as np

In [76]:
# Toy frame for the groupby walkthrough: two label columns ('key1', 'key2')
# and two numeric columns of standard-normal noise ('data1', 'data2').
# The noise is drawn from a dedicated generator with a fixed seed so that
# Restart Kernel -> Run All rebuilds exactly the same frame; unseeded
# np.random.randn produced different numbers on every run.
df_rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': df_rng.randn(5),
        'data2': df_rng.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.133937,-0.79782
1,a,two,-0.755495,-0.535944
2,b,one,2.355747,-0.579496
3,b,two,-0.280613,0.834519
4,a,one,-1.630666,0.878391


In [77]:
# grouped_i is a SeriesGroupBy object: column 'data1' split by the labels in
# column 'key1'. Displaying it shows only the object's repr, since no
# aggregate has been requested yet.
grouped_i = df['data1'].groupby(df['key1'])
grouped_i

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000018D4A575CF8>

In [78]:
# Compute the mean of column 'data1' within each group defined by the labels
# in 'key1'; the result is a Series indexed by the distinct 'key1' values.
grouped_mean_i = grouped_i.mean()
grouped_mean_i

key1
a   -0.840033
b    1.037567
Name: data1, dtype: float64

In [79]:
# Passing a list of key arrays groups by every combination of their labels,
# here each (key1, key2) pair, instead of by a single key.
grouped_ii = df['data1'].groupby(
    [
        df['key1'],
        df['key2'],
    ]
)
grouped_ii

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000018D4A581710>

In [80]:
# Mean of 'data1' for every observed (key1, key2) pair; the result is a
# Series with a two-level index (key1, key2).
grouped_mean_ii = grouped_ii.mean()
grouped_mean_ii

key1  key2
a     one    -0.882301
      two    -0.755495
b     one     2.355747
      two    -0.280613
Name: data1, dtype: float64

In [81]:
# Pivot the innermost index level ('key2') into columns, leaving 'key1' as
# the row index.
grouped_mean_ii.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.882301,-0.755495
b,2.355747,-0.280613


In [82]:
# Group keys do not have to come from the DataFrame: any arrays work, as long
# as each supplies one label per row of the data being grouped (five here).
states = np.asarray(['ohio', 'california', 'california', 'ohio', 'ohio'])
years = np.asarray([2005, 2005, 2006, 2005, 2006])

In [83]:
# Mean of 'data1' per (state, year) pair, using the external arrays as keys.
# Note that grouped_iii holds the aggregated Series, not a GroupBy object.
grouped_iii = (
    df['data1']
    .groupby([states, years])
    .mean()
)
grouped_iii

california  2005   -0.755495
            2006    2.355747
ohio        2005   -0.207275
            2006   -1.630666
Name: data1, dtype: float64

In [84]:
# Often the grouping information lives in the same DataFrame as the data being
# aggregated. In that case pass the column name(s) (whether those are strings,
# numbers, or other Python objects) as the group keys.
grouped_iv = df.groupby('key1')
# Aggregate only the numeric columns, selected explicitly. 'key2' holds
# strings: older pandas silently dropped such columns from mean(), whereas
# pandas >= 2.0 raises TypeError, so a bare grouped_iv.mean() is not portable.
grouped_iv[['data1', 'data2']].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.840033,-0.151791
b,1.037567,0.127512


In [85]:
# Group by both label columns at once. With 'key1' and 'key2' consumed as
# keys, only the numeric columns 'data1' and 'data2' are left to average.
grouped_v = df.groupby(['key1', 'key2'])
grouped_v.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.882301,0.040285
a,two,-0.755495,-0.535944
b,one,2.355747,-0.579496
b,two,-0.280613,0.834519


In [86]:
# size() is a generally useful GroupBy method: it returns a Series holding the
# number of rows in each group, here one entry per (key1, key2) pair.
grouped_size_v = grouped_v.size()
grouped_size_v

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64