In [62]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [63]:
import pandas as pd
import numpy as np

In [64]:
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5)
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.691283,1.844076
1,a,two,-0.865315,-0.989124
2,b,one,-0.722888,1.228297
3,b,two,2.241242,-0.985697
4,a,one,-1.053173,0.712


In [65]:
# 'grouped' is GroupBy object
grouped_i = df['data1'].groupby(by = df['key1'])
grouped_i

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000018D4A2AC208>

In [66]:
# If we want to compute the mean of colimn 'data1' using labels from 'key1'
grouped_mean_i = grouped_i.mean()
grouped_mean_i

key1
a   -0.869924
b    0.759177
Name: data1, dtype: float64

In [67]:
# If we had passed multiple arrays as list, we'd get something different
grouped_ii = df['data1'].groupby(by = [df['key1'], df['key2']])
grouped_ii

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000018D4A57BB00>

In [68]:
grouped_mean_ii = grouped_ii.mean()
grouped_mean_ii

key1  key2
a     one    -0.872228
      two    -0.865315
b     one    -0.722888
      two     2.241242
Name: data1, dtype: float64

In [69]:
grouped_mean_ii.unstack(level = -1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.872228,-0.865315
b,-0.722888,2.241242


In [70]:
# Group keys could be any arrays of right length
states = np.array(['ohio', 'california', 'california', 'ohio', 'ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

In [71]:
grouped_iii = df['data1'].groupby([states, years]).mean()
grouped_iii

california  2005   -0.865315
            2006   -0.722888
ohio        2005    0.774980
            2006   -1.053173
Name: data1, dtype: float64

In [72]:
# Grouping information is found in the same DataFrame as the data you want to work on
# We may pass column names (wheather those are string, number, or other python object)
# as group keys
grouped_iv = df.groupby(by = 'key1').mean()
grouped_iv.mean()

data1   -0.055373
data2    0.321809
dtype: float64

In [73]:
grouped_v = df.groupby(by = ['key1', 'key2']).mean()
grouped_v.mean()

data1   -0.054797
data2    0.132879
dtype: float64

In [None]:
# A generally useful GroupBy method is size, which returns a series containing group sizes
df.groupby(by = ['key1', 'key2']).size()