In [1]:
# link used for below task
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

# advanced
# https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html#cookbook-grouping

In [2]:
from IPython.display import display
import pandas as pd
import numpy as np

In [33]:
# sample dataframe

df = pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

In [34]:
df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [5]:
grouped1 = df.groupby('class')

In [6]:
list(grouped1) # grouped objects 

[('bird',
         class           order  max_speed
  falcon  bird   Falconiformes      389.0
  parrot  bird  Psittaciformes       24.0),
 ('mammal',
            class      order  max_speed
  lion     mammal  Carnivora       80.2
  monkey   mammal   Primates        NaN
  leopard  mammal  Carnivora       58.0)]

In [7]:
[(k, g.shape) for k, g in grouped1 ] # shows internal details of a grouped object

[('bird', (2, 3)), ('mammal', (3, 3))]

In [8]:
[(type(k), type(g)) for k, g in grouped1 ] # grouped name, dataframe

[(str, pandas.core.frame.DataFrame), (str, pandas.core.frame.DataFrame)]

In [9]:
grouped2 = df.groupby("order", axis="columns")

In [10]:
list(grouped2)

[]

In [11]:
grouped3 = df.groupby(["class", "order"])

In [12]:
df

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [13]:
[(k, g.shape) for k, g in grouped3 ]

[(('bird', 'Falconiformes'), (1, 3)),
 (('bird', 'Psittaciformes'), (1, 3)),
 (('mammal', 'Carnivora'), (2, 3)),
 (('mammal', 'Primates'), (1, 3))]

In [14]:
list(grouped3)[2][1]

Unnamed: 0,class,order,max_speed
lion,mammal,Carnivora,80.2
leopard,mammal,Carnivora,58.0


In [26]:
lst = [1, 2, 3, 1, 2, 3]

s = pd.Series([1, 2, 3, 10, 20, 30], lst)

In [27]:
s

1     1
2     2
3     3
1    10
2    20
3    30
dtype: int64

In [28]:
grouped = s.groupby(level=0)

In [29]:
list(grouped)

[(1,
  1     1
  1    10
  dtype: int64),
 (2,
  2     2
  2    20
  dtype: int64),
 (3,
  3     3
  3    30
  dtype: int64)]

In [30]:
grouped.first()

1    1
2    2
3    3
dtype: int64

In [31]:
grouped.last()

1    10
2    20
3    30
dtype: int64

In [32]:
grouped.groups

{1: [1, 1], 2: [2, 2], 3: [3, 3]}

In [37]:
df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)})

In [38]:
grouped = df.groupby('A')

In [39]:
df

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3
4,2,4
5,2,5
6,3,6
7,3,7


In [40]:
for k, g in grouped:
    display(k)
    display(g)

1

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3


2

Unnamed: 0,A,B
4,2,4
5,2,5


3

Unnamed: 0,A,B
6,3,6
7,3,7


In [42]:
grouped.get_group(1)

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3


In [43]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]


index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])

df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [44]:
grouped = df.groupby(['A'])

In [45]:
grouped.agg(np.sum)

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,6
2,9
3,13


## Named Aggregations

In [46]:
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)

In [47]:
animals

Unnamed: 0,kind,height,weight
0,cat,9.1,7.9
1,dog,6.0,7.5
2,cat,9.5,9.9
3,dog,34.0,198.0


In [48]:
animals.groupby('kind').size()

kind
cat    2
dog    2
dtype: int64

In [52]:
animals.groupby('kind').agg(
    min_height = pd.NamedAgg(column='height', aggfunc='min'),
    comb = pd.NamedAgg(column='weight', aggfunc=lambda x: list(x))
).reset_index()

Unnamed: 0,kind,min_height,comb
0,cat,9.1,"[7.9, 9.9]"
1,dog,6.0,"[7.5, 198.0]"


In [56]:
# or
animals.groupby('kind').agg(
    **{
        "comb" : pd.NamedAgg(column='weight', aggfunc=lambda x: list(x))
    }
)


Unnamed: 0_level_0,comb
kind,Unnamed: 1_level_1
cat,"[7.9, 9.9]"
dog,"[7.5, 198.0]"


## Function Application

In [58]:
# Tablewise Function Application: pipe()

# Row or Column-wise Function Application: apply()

# Aggregation API: agg() and transform()

# Applying Elementwise Functions: applymap()

In [71]:
df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})

In [72]:
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [73]:
def extract_city_name(df):
    """
    Chicago, IL -> Chicago for city_name column
    """
    df["city_name"] = df["city_and_code"].str.split(",").str.get(0)
    return df


def add_country_name(df, country_name=None):
    """
    Chicago -> Chicago-US for city_name column
    """
    col = "city_name"
    df["city_and_country"] = df[col] + country_name
    return df

In [75]:
extract_city_name(df_p)

Unnamed: 0,city_and_code,city_name
0,"Chicago, IL",Chicago


In [76]:
add_country_name(df_p, ' India')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago India


In [77]:
add_country_name(extract_city_name(df_p), " US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago US


In [81]:
df_p.pipe(extract_city_name).pipe(add_country_name, "AUS")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoAUS


## Row or column-wise function application

In [83]:
df = pd.DataFrame(
    {
        "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]),
        "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]),
        "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]),
    }
)

In [84]:
df

Unnamed: 0,one,two,three
a,-0.885275,-0.254004,
b,-0.11562,-1.360653,0.73742
c,0.403267,-1.042946,-0.422659
d,,-0.472302,0.261289


In [86]:
def app_mean(x):
    display(x)
    return np.mean

In [88]:
df.apply(lambda x: app_mean(x)) # column wise operation

a   -0.885275
b   -0.115620
c    0.403267
d         NaN
Name: one, dtype: float64

a   -0.254004
b   -1.360653
c   -1.042946
d   -0.472302
Name: two, dtype: float64

a         NaN
b    0.737420
c   -0.422659
d    0.261289
Name: three, dtype: float64

one      <function mean at 0x018D6A48>
two      <function mean at 0x018D6A48>
three    <function mean at 0x018D6A48>
dtype: object

In [89]:
df.apply(lambda x: app_mean(x), axis=1)

one     -0.885275
two     -0.254004
three         NaN
Name: a, dtype: float64

one     -0.115620
two     -1.360653
three    0.737420
Name: b, dtype: float64

one      0.403267
two     -1.042946
three   -0.422659
Name: c, dtype: float64

one           NaN
two     -0.472302
three    0.261289
Name: d, dtype: float64

a    <function mean at 0x018D6A48>
b    <function mean at 0x018D6A48>
c    <function mean at 0x018D6A48>
d    <function mean at 0x018D6A48>
dtype: object