In [1]:
'''
-----------------------------------------------------------------------
qcut
-----------------------------------------------------------------------
'''
import pandas as pd

In [2]:
sr = pd.Series([1,12,5,1,9,3,4,10,8])

In [3]:
'''
-------------------------------------------------------------------------------------
The part displayed with ( means that the boundary value is not included as an area, 
and ] means that the boundary value is included as an area.
------------------------------------------------------------------------------------
'''
pd.qcut(sr, 3)

0    (0.999, 3.667]
1     (8.333, 12.0]
2    (3.667, 8.333]
3    (0.999, 3.667]
4     (8.333, 12.0]
5    (0.999, 3.667]
6    (3.667, 8.333]
7     (8.333, 12.0]
8    (3.667, 8.333]
dtype: category
Categories (3, interval[float64]): [(0.999, 3.667] < (3.667, 8.333] < (8.333, 12.0]]

In [4]:
'''
------------------------------------------------------------------------------------------------
If you use value_counts function, you can display the number of cases for each divided section.
-----------------------------------------------------------------------------------------------
'''
pd.qcut(sr, 3).value_counts()

(8.333, 12.0]     3
(3.667, 8.333]    3
(0.999, 3.667]    3
dtype: int64

In [5]:
'''
------------------------------------------------------------------------------------------------
Since the values are classified into three categories, try displaying them with the labels 'low', 'middle', and 'high' respectively.
------------------------------------------------------------------------------------------------
'''
pd.qcut(sr, 3, labels=['low','middle','high'])

0       low
1      high
2    middle
3       low
4      high
5       low
6    middle
7      high
8    middle
dtype: category
Categories (3, object): [low < middle < high]

In [6]:
'''
----------------------------------------------------------------------------------------------
f you want to hide the label, set labels = False to display only the number of the division from the bottom.
--------------------------------------------------------------------------------------------
'''
pd.qcut(sr, 3, labels=False) # Display only numbers.

0    0
1    2
2    1
3    0
4    2
5    0
6    1
7    2
8    1
dtype: int64

In [7]:
'''
--------------------------------------------------------------------------------------------------
If retbins = True, boundary values can be acquired separately.
-------------------------------------------------------------------------------------------------
'''
sr_cut, bins = pd.qcut(sr, 3, retbins=True)

In [8]:
sr_cut

0    (0.999, 3.667]
1     (8.333, 12.0]
2    (3.667, 8.333]
3    (0.999, 3.667]
4     (8.333, 12.0]
5    (0.999, 3.667]
6    (3.667, 8.333]
7     (8.333, 12.0]
8    (3.667, 8.333]
dtype: category
Categories (3, interval[float64]): [(0.999, 3.667] < (3.667, 8.333] < (8.333, 12.0]]

In [9]:
bins

array([ 1.        ,  3.66666667,  8.33333333, 12.        ])

In [10]:
'''
-------------------------------------------------------------------------------------------------
You can specify how many decimal places to consider with the precision argument.
-------------------------------------------------------------------------------------------------
'''
pd.qcut(sr, 3, precision=1) # 小数点以下1桁のみ

0     (0.9, 3.7]
1    (8.3, 12.0]
2     (3.7, 8.3]
3     (0.9, 3.7]
4    (8.3, 12.0]
5     (0.9, 3.7]
6     (3.7, 8.3]
7    (8.3, 12.0]
8     (3.7, 8.3]
dtype: category
Categories (3, interval[float64]): [(0.9, 3.7] < (3.7, 8.3] < (8.3, 12.0]]

In [11]:
pd.qcut(sr,3,precision=4) # 小数点以下4桁

0    (0.9999, 3.6667]
1      (8.3333, 12.0]
2    (3.6667, 8.3333]
3    (0.9999, 3.6667]
4      (8.3333, 12.0]
5    (0.9999, 3.6667]
6    (3.6667, 8.3333]
7      (8.3333, 12.0]
8    (3.6667, 8.3333]
dtype: category
Categories (3, interval[float64]): [(0.9999, 3.6667] < (3.6667, 8.3333] < (8.3333, 12.0]]

In [12]:
'''
--------------------------------------------------------------------------------------------------------
cut
--------------------------------------------------------------------------------------------------------
'''
age_list = pd.Series([0,20,32,21,15,40,12,35,32,39,24,58,57,11,52,54,19])

In [13]:
age_list

0      0
1     20
2     32
3     21
4     15
5     40
6     12
7     35
8     32
9     39
10    24
11    58
12    57
13    11
14    52
15    54
16    19
dtype: int64

In [14]:
pd.cut(age_list, bins=[-1,19,39,59])

0     (-1, 19]
1     (19, 39]
2     (19, 39]
3     (19, 39]
4     (-1, 19]
5     (39, 59]
6     (-1, 19]
7     (19, 39]
8     (19, 39]
9     (19, 39]
10    (19, 39]
11    (39, 59]
12    (39, 59]
13    (-1, 19]
14    (39, 59]
15    (39, 59]
16    (-1, 19]
dtype: category
Categories (3, interval[int64]): [(-1, 19] < (19, 39] < (39, 59]]

In [15]:
'''
--------------------------------------------------------------------------------
As with the qcut function, you can also specify a label here. Let's label it as 'young', 'young-adult', 'adult'.
---------------------------------------------------------------------------------
'''
pd.cut(age_list,bins=[-1,19,39,59],labels=['young','young-adult','adult'])

0           young
1     young-adult
2     young-adult
3     young-adult
4           young
5           adult
6           young
7     young-adult
8     young-adult
9     young-adult
10    young-adult
11          adult
12          adult
13          young
14          adult
15          adult
16          young
dtype: category
Categories (3, object): [young < young-adult < adult]

In [16]:
'''
---------------------------------------------------------------------------------
If you set labels = False, you can see what section is in the bottom.
---------------------------------------------------------------------------------
'''
pd.cut(age_list,bins=[-1,19,39,59],labels=False)

0     0
1     1
2     1
3     1
4     0
5     2
6     0
7     1
8     1
9     1
10    1
11    2
12    2
13    0
14    2
15    2
16    0
dtype: int64

In [17]:
'''
------------------------------------------------------------------------------------------
Change the direction of the open section of the region.

The area specified when dividing the data is a half-open section, 
the right section is closed (including the boundary as an area), 
and the left section is open (not including the boundary as an area). .

If you want to reverse this, specify right = False (right = True by default).
----------------------------------------------------------------------------------------
'''
pd.cut(age_list,bins=[-1,19,39,59],right=False) # right=Falseにする

0     [-1, 19)
1     [19, 39)
2     [19, 39)
3     [19, 39)
4     [-1, 19)
5     [39, 59)
6     [-1, 19)
7     [19, 39)
8     [19, 39)
9     [39, 59)
10    [19, 39)
11    [39, 59)
12    [39, 59)
13    [-1, 19)
14    [39, 59)
15    [39, 59)
16    [19, 39)
dtype: category
Categories (3, interval[int64]): [[-1, 19) < [19, 39) < [39, 59)]

In [18]:
pd.cut(age_list,bins=[-1,19,39,59],right=False).value_counts() # 人数の内訳を比較

[19, 39)    7
[39, 59)    6
[-1, 19)    4
dtype: int64

In [19]:
pd.cut(age_list,bins=[-1,19,39,59],right=True).value_counts() # 人数の内訳を比較

(19, 39]    7
(39, 59]    5
(-1, 19]    5
dtype: int64

In [21]:
'''
----------------------------------------------------------------------------------------------
In this way, you can see that the result is different from the previous one because the way of setting the section was changed. 
In this case, it is necessary to change the way of specifying the interval with bins. 
Next, specify the value so that the minimum value for each interval becomes the boundary value.
----------------------------------------------------------------------------------------------
'''
bins = [0,20,40,60]
pd.cut(age_list,bins=bins,right=False).value_counts()

[20, 40)    7
[40, 60)    5
[0, 20)     5
dtype: int64

In [22]:
'''
------------------------------------------------------------------------------------------------
Grouping along the divided area
--------------------------------------------------------------------------------------------------
'''
import numpy as np

In [26]:
score_list = np.random.randint(0,100,size=17) # 整数の乱数を17個生成

In [27]:
score_list

array([72, 33, 31, 87, 65, 72,  6, 44, 24, 70,  2, 62, 40, 88, 26, 57, 20])

In [28]:
df = pd.DataFrame({'age':age_list,'score':score_list})

In [29]:
df

Unnamed: 0,age,score
0,0,72
1,20,33
2,32,31
3,21,87
4,15,65
5,40,72
6,12,6
7,35,44
8,32,24
9,39,70


In [30]:
cut = pd.qcut(df['age'], 4)

In [31]:
df.groupby(cut).mean() # 分割した領域ごとの平均を求める

Unnamed: 0_level_0,age,score
age,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-0.001, 19.0]",11.4,50.2
"(19.0, 32.0]",25.8,35.4
"(32.0, 40.0]",38.0,62.0
"(40.0, 58.0]",55.25,46.25


In [36]:
'''
------------------------------------------------------------
You can do the same with the cut function.
------------------------------------------------------------
'''
cut = pd.cut(df['age'],bins=[-1,19,31,39,59], labels=['low', 'middle-low', 'middle-high', 'high'])

In [37]:
df.groupby(cut).mean()

Unnamed: 0_level_0,age,score
age,Unnamed: 1_level_1,Unnamed: 2_level_1
low,11.4,50.2
middle-low,21.666667,40.666667
middle-high,34.5,42.25
high,52.2,51.4
