How to use groupby and access items in the result (MultiIndex included)

In [1]:
import pandas as pd
pd.options.display.width = 1000

# Build a small sample DataFrame: one row per student, with class, name, age and score columns
df = pd.DataFrame({
    'class': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen'],
    'age': [20, 21, 27, 23, 24, 25, 25, 27],
    'score': [80, 81, 82, 83, 84, 85, 86, 87]
})

print(df.sample(3))

# Sum per class. numeric_only=True restricts the sum to the numeric columns
# ('age' and 'score'). Without it, pandas >= 2.0 also "sums" the string column
# 'name' by concatenating the names, which is not what we want here.
g_sum = df.groupby('class').sum(numeric_only=True)
print("type of g_sum:", type(g_sum))
print(g_sum.head())
# access the sum of score of class A: row label 'A', column 'score'
print("sum of score of class A:", g_sum.loc['A', 'score'])

# Group by class and age: the result is indexed by a ('class', 'age') MultiIndex
g_sum = df.groupby(['class', 'age']).sum(numeric_only=True)
print(g_sum.head())
# access the sum of score of class A and age 20
print("sum of score of class A and age 20:", g_sum.loc[('A', 20), 'score'])  # using tuple to access multi-index



  class     name  age  score
3     A    David   23     83
7     B    Helen   27     87
2     A  Charlie   27     82
type of g_sum: <class 'pandas.core.frame.DataFrame'>
       age  score
class            
A       91    326
B      101    342
sum of score of class A: 326
           score
class age       
A     20      80
      21      81
      23      83
      27      82
B     24      84
sum of score of class A and age 20: 80


Get the items of a group

In [6]:
print(df.sample(3))

# groupby() returns a DataFrameGroupBy object rather than a DataFrame
grouped = df.groupby('class')
print("type of grouped:", type(grouped))

# .groups is a dict-like mapping: group label -> index labels of the rows in that group
g = grouped.groups
print("type of g:", type(g))
print(g)

# Extract one group by its label: the result is a DataFrame
# holding every row whose class is 'A'
print("group A:")
group_a = grouped.get_group('A')
print("type of group_a:", type(group_a))
print(group_a)

  class   name  age  score
5     B  Frank   25     85
6     B  Grace   25     86
1     A    Bob   21     81
type of grouped: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
type of g: <class 'dict'>
{'A': Int64Index([0, 1, 2, 3], dtype='int64'), 'B': Int64Index([4, 5, 6, 7], dtype='int64')}
group A:
type of group_a: <class 'pandas.core.frame.DataFrame'>
  class     name  age  score
0     A    Alice   20     80
1     A      Bob   21     81
2     A  Charlie   27     82
3     A    David   23     83


The agg() function accepts an aggregation function, or its name, as a parameter (for example the string "sum" or a NumPy function such as np.sum). It also accepts multiple functions, passed as a list.

In [3]:
import numpy as np
print(df.sample(3))

g = df.groupby("class")
# Sum the score of each class.
# Method 1: use agg() with the aggregation given by name. String names are
# preferred over NumPy callables (np.sum, np.mean, np.std): recent pandas
# versions emit a FutureWarning for the callables and will eventually call
# them directly instead of mapping them to the groupby implementation.
print(g["score"].agg("sum"))
print("---------------------------------")
# Method 2: use the built-in groupby reduction. Select the column first so
# only 'score' is summed instead of summing every column and discarding the rest.
print(g["score"].sum())

# Multiple aggregations: map each column to one function name or a list of names
print(g.agg({
    "age": ["mean", "std"],
    "score": "sum"
}))

  class   name  age  score
0     A  Alice   20     80
3     A  David   23     83
6     B  Grace   25     86
class
A    326
B    342
Name: score, dtype: int64
---------------------------------
class
A    326
B    342
Name: score, dtype: int64
         age           score
        mean       std   sum
class                       
A      22.75  3.095696   326
B      25.25  1.258306   342


nth element. To select the nth row of each group from a grouped DataFrame or Series, use nth(). Note that n is zero-based, so nth(3) returns the fourth row.  
It’s a very useful function that returns a single row per group if you pass an integer for n.  
It’s quite useful if you combine it with sort_values(). For example, sorting by score in descending order and then calling nth(2) gives the third-highest score in each class.

In [4]:
g = df.groupby("class")
# nth() is zero-based: n=3 selects the fourth row of each group, not the third
print("Return the fourth element (n=3, zero-based) from each group")
nth_3 = g.nth(3)
print("type of nth_3:", type(nth_3)) # dataframe
# Reuse the result computed above instead of calling nth() a second time
print(nth_3)


Return the third element from each group
type of nth_3: <class 'pandas.core.frame.DataFrame'>
        name  age  score
class                   
A      David   23     83
B      Helen   27     87


Iterate over the groups


In [5]:
# Iterating a groupby object yields (group label, sub-DataFrame) pairs
divider = "------------------------------------"
for label, members in g:
    print("current group is {}".format(label))
    print("type of gr:", type(members))
    print("the corresponding data under group {}".format(label))
    print(divider)
    print(members)
    print(divider)

current group is A
type of gr: <class 'pandas.core.frame.DataFrame'>
the corresponding data under group A
------------------------------------
  class     name  age  score
0     A    Alice   20     80
1     A      Bob   21     81
2     A  Charlie   27     82
3     A    David   23     83
------------------------------------
current group is B
type of gr: <class 'pandas.core.frame.DataFrame'>
the corresponding data under group B
------------------------------------
  class   name  age  score
4     B    Eva   24     84
5     B  Frank   25     85
6     B  Grace   25     86
7     B  Helen   27     87
------------------------------------
