In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Sample ages and the edges of the age brackets they will be sorted into.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# pd.cut() assigns every value to the bin it falls into. right=True is the
# default and makes each interval left-open / right-closed, e.g. (18, 25].
cats = pd.cut(x=ages, bins=bins, right=True)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [3]:
# Integer code of the bin each age was placed in (its position in cats.categories).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [4]:
# The bins themselves: the intervals built from the bin edges.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [5]:
# Count how many values landed in each bin of the pd.cut() result, most frequent first.
# The top-level pd.value_counts() is deprecated (pandas 2.1+); wrapping the
# Categorical in a Series and calling .value_counts() yields the same counts.
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [6]:
# Same ages and bin edges as before, restated so this cell is self-contained.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# right=False flips the intervals to left-closed / right-open, e.g. [18, 25),
# so an age equal to an edge (25) now falls into the upper bin.
cats = pd.cut(x=ages, bins=bins, right=False)
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [7]:
# Human-readable names for the four bins, in the same order as the bin edges.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
# Passing labels= replaces the interval notation with these custom bin names.
pd.cut(x=ages, bins=bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [8]:
# 20 uniform samples from [0, 1); no seed is set, so the values differ on every run.
data = np.random.rand(20)
# Given only a bin count (no explicit edges), pd.cut() splits the data's range
# into that many equal-width bins; precision=2 limits the digits shown in the edges.
pd.cut(x=data, bins=4, precision=2)

[(0.24, 0.47], (0.0051, 0.24], (0.24, 0.47], (0.47, 0.7], (0.24, 0.47], ..., (0.47, 0.7], (0.7, 0.94], (0.7, 0.94], (0.7, 0.94], (0.7, 0.94]]
Length: 20
Categories (4, interval[float64]): [(0.0051, 0.24] < (0.24, 0.47] < (0.47, 0.7] < (0.7, 0.94]]

In [9]:
# 1000 samples from the standard normal distribution (unseeded, so they vary per run).
data = np.random.randn(1000)
# qcut() is like cut(), but instead of equal-width bins it picks the edges from
# the sample quantiles so that each bin holds (as nearly as possible) the same
# number of elements; q=4 yields quartiles.
cats = pd.qcut(x=data, q=4)
cats

[(-3.06, -0.662], (-0.00751, 0.694], (-0.662, -0.00751], (-3.06, -0.662], (-0.00751, 0.694], ..., (0.694, 3.009], (-0.00751, 0.694], (-0.662, -0.00751], (-3.06, -0.662], (-0.662, -0.00751]]
Length: 1000
Categories (4, interval[float64]): [(-3.06, -0.662] < (-0.662, -0.00751] < (-0.00751, 0.694] < (0.694, 3.009]]

In [10]:
# Count the elements in each quantile bin, most frequent first.
# The top-level pd.value_counts() is deprecated (pandas 2.1+); wrapping the
# Categorical in a Series and calling .value_counts() yields the same counts.
pd.Series(cats).value_counts()

(-3.06, -0.662]       250
(-0.662, -0.00751]    250
(-0.00751, 0.694]     250
(0.694, 3.009]        250
dtype: int64

In [11]:
# A fresh batch of 1000 standard-normal samples (unseeded, so they vary per run).
data = np.random.randn(1000)
# qcut() also accepts explicit quantile edges (fractions in [0, 1]) instead of
# a bin count; these cut the data at the 10%, 50% and 90% quantiles.
cats = pd.qcut(x=data, q=[0, 0.1, 0.5, 0.9, 1.])
cats

[(-1.306, 0.0888], (-1.306, 0.0888], (-1.306, 0.0888], (-1.306, 0.0888], (0.0888, 1.339], ..., (0.0888, 1.339], (-1.306, 0.0888], (-1.306, 0.0888], (-1.306, 0.0888], (0.0888, 1.339]]
Length: 1000
Categories (4, interval[float64]): [(-3.366, -1.306] < (-1.306, 0.0888] < (0.0888, 1.339] < (1.339, 3.164]]

In [12]:
# Count the elements in each custom-quantile bin, most frequent first.
# The top-level pd.value_counts() is deprecated (pandas 2.1+); wrapping the
# Categorical in a Series and calling .value_counts() yields the same counts.
pd.Series(cats).value_counts()

(-1.306, 0.0888]    400
(0.0888, 1.339]     400
(-3.366, -1.306]    100
(1.339, 3.164]      100
dtype: int64