In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

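# 1. Discretizing into equal-width bins

Passing an integer as `bins` splits the range of the data into that many equal-width intervals. Each interval is half-open: `(` marks an open (excluded) edge and `]` marks a closed (included) edge. The lowest edge is extended slightly below the minimum (1.903 rather than 2 here) so that the smallest value still falls inside the first bin.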

In [3]:
# Build the sample frame and attach the binned column in one chained expression.
# bins=3 asks pd.cut for three equal-width intervals spanning the observed ages.
df = pd.DataFrame({"age": [2, 67, 40, 32, 4, 15, 82, 99, 26, 30]}).assign(
    age_group=lambda frame: pd.cut(frame["age"], bins=3)
)
df

Unnamed: 0,age,age_group
0,2,"(1.903, 34.333]"
1,67,"(66.667, 99.0]"
2,40,"(34.333, 66.667]"
3,32,"(1.903, 34.333]"
4,4,"(1.903, 34.333]"
5,15,"(1.903, 34.333]"
6,82,"(66.667, 99.0]"
7,99,"(66.667, 99.0]"
8,26,"(1.903, 34.333]"
9,30,"(1.903, 34.333]"


# 2. Adding custom bins

In [4]:
# Explicit bin edges: each consecutive pair of edges defines one right-closed interval.
df["age_group"] = pd.cut(x=df["age"], bins=[0, 12, 19, 61, 100])
df.sort_values(by="age_group")

Unnamed: 0,age,age_group
0,2,"(0, 12]"
4,4,"(0, 12]"
5,15,"(12, 19]"
2,40,"(19, 61]"
3,32,"(19, 61]"
8,26,"(19, 61]"
9,30,"(19, 61]"
1,67,"(61, 100]"
6,82,"(61, 100]"
7,99,"(61, 100]"


In [5]:
# Count the rows that fall into each bin, listed in bin order.
df["age_group"].value_counts().sort_index()

(0, 12]      2
(12, 19]     1
(19, 61]     4
(61, 100]    3
Name: age_group, dtype: int64

# 3. Adding labels to bins

In [6]:
# One label per interval, so there is one fewer label than there are bin edges.
labels = ["<12", "Teen", "Adult", "Older"]
bins = [0, 12, 19, 61, 100]
df["age_group"] = pd.cut(x=df["age"], bins=bins, labels=labels)
df.sort_values(by="age_group")

Unnamed: 0,age,age_group
0,2,<12
4,4,<12
5,15,Teen
2,40,Adult
3,32,Adult
8,26,Adult
9,30,Adult
1,67,Older
6,82,Older
7,99,Older


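# 4. Configuring leftmost edge with `right=False`
The `right` argument indicates whether each bin includes its rightmost edge. With the default `right=True`, intervals are closed on the right, e.g. `(0, 12]`. Setting `right=False` closes them on the left instead, e.g. `[0, 12)`, so each bin includes its leftmost edge and excludes its rightmost one: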

In [7]:
# right=False closes each interval on the left and opens it on the right.
pd.cut(
    x=df["age"],
    bins=[0, 12, 19, 61, 100],
    right=False,
)

0      [0, 12)
1    [61, 100)
2     [19, 61)
3     [19, 61)
4      [0, 12)
5     [12, 19)
6    [61, 100)
7    [61, 100)
8     [19, 61)
9     [19, 61)
Name: age, dtype: category
Categories (4, interval[int64, left]): [[0, 12) < [12, 19) < [19, 61) < [61, 100)]

# 5. Including the lowest value with `include_lowest=True`
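The `include_lowest` argument controls whether the first interval should be left-inclusive. By default it is not, so a value equal to the lowest bin edge (age 2 below) falls outside every bin and is left empty (NaN); passing `include_lowest=True` brings it into the first bin.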

In [8]:
# Without include_lowest, the first edge (2) is excluded, so age 2 gets no bin.
df["age_group"] = pd.cut(x=df["age"], bins=[2, 12, 19, 61, 100])
df

Unnamed: 0,age,age_group
0,2,
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(2.0, 12.0]"
5,15,"(12.0, 19.0]"
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


In [9]:
# include_lowest=True makes the first interval include its left edge, so age 2 is binned.
df["age_group"] = pd.cut(
    x=df["age"],
    bins=[2, 12, 19, 61, 100],
    include_lowest=True,
)
df

Unnamed: 0,age,age_group
0,2,"(1.999, 12.0]"
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(1.999, 12.0]"
5,15,"(12.0, 19.0]"
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


# 6. Using an IntervalIndex to cut

In [10]:
# Build the intervals from parallel left/right edge lists (right-closed by default).
# Note that no interval covers (12, 19].
bins = pd.IntervalIndex.from_arrays(left=[0, 19, 61], right=[12, 61, 100])
bins

IntervalIndex([(0, 12], (19, 61], (61, 100]], dtype='interval[int64, right]')

In [11]:
# Cut with the predefined IntervalIndex; ages that fall in no interval are left empty (NaN).
df["age_group"] = pd.cut(x=df["age"], bins=bins)
df

Unnamed: 0,age,age_group
0,2,"(0.0, 12.0]"
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(0.0, 12.0]"
5,15,
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


# 7. Returning the bin edges with `retbins=True`

In [12]:
# retbins=True makes pd.cut return the computed bin edges alongside the binned data.
# The edges are stored under their own name so the IntervalIndex held in `bins`
# is not overwritten; otherwise re-running the earlier `pd.cut(df["age"], bins)`
# cell after this one would silently use these four equal-width bins instead.
result, bin_edges = pd.cut(df["age"], bins=4, retbins=True)
result, bin_edges

(0    (1.903, 26.25]
 1     (50.5, 74.75]
 2     (26.25, 50.5]
 3     (26.25, 50.5]
 4    (1.903, 26.25]
 5    (1.903, 26.25]
 6     (74.75, 99.0]
 7     (74.75, 99.0]
 8    (1.903, 26.25]
 9     (26.25, 50.5]
 Name: age, dtype: category
 Categories (4, interval[float64, right]): [(1.903, 26.25] < (26.25, 50.5] < (50.5, 74.75] < (74.75, 99.0]],
 array([ 1.903, 26.25 , 50.5  , 74.75 , 99.   ]))

# 8. Creating unordered categories

In [13]:
# ordered=False yields a plain categorical: the labels carry no `<` ordering.
pd.cut(
    x=df["age"],
    labels=["<12", "Teen", "Adult", "Older"],
    bins=[0, 12, 19, 61, 100],
    ordered=False,
)

0      <12
1    Older
2    Adult
3    Adult
4      <12
5     Teen
6    Older
7    Older
8    Adult
9    Adult
Name: age, dtype: category
Categories (4, object): ['<12', 'Teen', 'Adult', 'Older']

In [14]:
# For comparison: ordered=True is the default, so the labels are ordered by bin.
pd.cut(
    x=df["age"],
    labels=["<12", "Teen", "Adult", "Older"],
    bins=[0, 12, 19, 61, 100],
    ordered=True,
)

0      <12
1    Older
2    Adult
3    Adult
4      <12
5     Teen
6    Older
7    Older
8    Adult
9    Adult
Name: age, dtype: category
Categories (4, object): ['<12' < 'Teen' < 'Adult' < 'Older']