In [1]:
# Discretization is also called binning or bucketing.
# Discretization is a means of slicing up continuous data into a set of "bins",
# where each bin represents a range of the continuous sample and the items are then placed into the appropriate bin,
# hence the term "binning".

# Discretization basically helps reduce data from a large domain of numeric values to a subset of categorical values.
# So it is a process of putting values into buckets so that there are a limited number of possible states.

# This is necessary because many data mining systems or algorithms (e.g. the Microsoft Naive Bayes algorithm) cannot handle continuous attributes.
# Furthermore, even if a data mining task can handle a continuous attribute, its performance can be significantly
# improved by replacing a continuous attribute with its discretized values. Learning methods show remarkable improvements with discrete data.
# Also, some decision tree-based algorithms produce shorter, more compact, and more accurate results when using discrete values.
# However, any discretization process generally leads to a loss of information,
# making the minimization of such information loss the main goal of a discretizer.
# Related articles
# https://www.slideshare.net/hadooring/data-discretization

#### References
###### https://www.safaribooksonline.com/library/view/learning-pandas/9781783985128/ch09s05.html
###### https://www.slideshare.net/hadooring/data-discretization
###### https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut

In [3]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so the samples drawn below are reproducible.
# np.random.seed() returns None, so its result is deliberately not bound to a name.
np.random.seed(30)


In [6]:
# Draw 30 samples from the standard normal distribution; this is the continuous data binned below.
factors=np.random.randn(30)
factors

array([-1.10014381, -1.31564409,  0.81050091, -1.04477837, -0.68922937,
       -0.85275686,  1.11347211, -1.92116972, -0.70140242,  0.84908785,
        1.05069979, -0.53687446, -0.02919272,  1.39405598, -0.39786744,
        1.01912738,  1.8988813 , -0.35484964, -0.38701647, -1.92730539,
        0.55075659,  0.95435581,  0.7827765 , -1.7932379 ,  1.2418555 ,
        1.70774688,  0.3231534 , -0.99183439, -1.16328237, -0.0213351 ])

## Pandas.cut

In [12]:
# Split the observed range of `factors` into five equal-width intervals
# and assign every value to the interval that contains it.
bins = pd.cut(factors, bins=5)
bins

[(-1.162, -0.397], (-1.931, -1.162], (0.368, 1.134], (-1.162, -0.397], (-1.162, -0.397], ..., (1.134, 1.899], (-0.397, 0.368], (-1.162, -0.397], (-1.931, -1.162], (-0.397, 0.368]]
Length: 30
Categories (5, object): [(-1.931, -1.162] < (-1.162, -0.397] < (-0.397, 0.368] < (0.368, 1.134] < (1.134, 1.899]]

In [None]:
#The resulting bins object is a type of pandas variable known as Categorical. 
#A categorical variable that is a result of pd.cut() consists of a set of labels and 
#an index that describes how the data has been split

In [14]:
# .categories is a property, not a method, so it is read without parentheses
# (calling it would raise a TypeError because the returned Index is not callable).
# It returns the index describing the intervals that pandas decided upon.
bins.categories

Index(['(-1.931, -1.162]', '(-1.162, -0.397]', '(-0.397, 0.368]',
       '(0.368, 1.134]', '(1.134, 1.899]'],
      dtype='object')

In [15]:
#Each item in the index represents the range of values that the data has been mapped into. 
#As previously stated, the width of the bins is determined by evenly dividing the data into five equal intervals,
#with the caveat that pandas automatically increases the overall range by 0.1 percent to ensure
# that all points are included.

In [16]:
#The .codes property is an array that specifies which of the bins (intervals) each item has been assigned to;
#each code is the position of the item's interval within .categories.
bins.codes

array([1, 0, 3, 1, 1, 1, 3, 0, 1, 3, 3, 1, 2, 4, 1, 3, 4, 2, 2, 0, 3, 3, 3,
       0, 4, 4, 2, 1, 0, 2], dtype=int8)

In [11]:
# pd.cut spaces the bin edges evenly over the range of the values, not over their frequencies,
# so the number of items that lands in each bin can differ.
pd.cut(factors, bins=5).value_counts()

(-1.931, -1.162]    5
(-1.162, -0.397]    8
(-0.397, 0.368]     5
(0.368, 1.134]      8
(1.134, 1.899]      4
dtype: int64

In [17]:
#Instead of passing an integer number of bins to cut data into, you can pass an array of values that represent the bin edges.
#A common example of this scenario is mapping ages into age range buckets.
#The following generates 50 ages between 6 and 44 (inclusive):

In [48]:
# Re-seed so that the same ages are drawn on every run.
np.random.seed(20)
# Draw 50 integer ages from 6 up to 44; the upper bound (45) is exclusive.
ages = np.random.randint(low=6, high=45, size=50)
ages

array([41, 32, 21, 37, 34, 32, 15, 26, 17, 28, 13, 40, 38, 27, 32, 32, 25,
       22, 44, 44, 22, 43, 13, 43, 40, 12, 32, 19, 17, 31,  9, 16, 17, 19,
       25, 36, 38, 16, 12, 17, 24,  9, 19, 23, 22, 24, 21, 32, 13, 17])

In [49]:
#specify specific ranges for the bins by passing them in an array 
#where the extent of each bin is specified by each set of adjacent integers

In [50]:
# Bin the ages using explicit edges: each pair of adjacent edges defines one bin,
# i.e. (6, 12], (12, 18], (18, 35] and (35, 50].
# Then report how the ages are distributed over the bins
# using the .describe() method of the pd.cut() result.
ranges = [6, 12, 18, 35, 50]
agebins = pd.cut(ages, bins=ranges)
agebins.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(6, 12]",4,0.08
"(12, 18]",11,0.22
"(18, 35]",24,0.48
"(35, 50]",11,0.22


In [55]:
# To give each bin a name other than the standard interval notation, pass the labels parameter.
# When plotting the bins, pandas will pass these bin names on to be shown on the chart.
ranges = [6, 12, 18, 35, 50]
labels = ['Youth', 'Young Adult', 'Adult', 'Middle Aged']
# pd.cut bins are right-closed by default, so the first bin would be (6, 12] and an age of
# exactly 6 (a value the age generator can produce) would be left unlabelled (NaN).
# include_lowest=True closes the first bin on the left so that 6 is labelled 'Youth'.
agebins = pd.cut(ages, ranges, labels=labels, include_lowest=True)
agebins.describe()


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Youth,4,0.08
Young Adult,11,0.22
Adult,24,0.48
Middle Aged,11,0.22


## Pandas.qcut

In [26]:
#Data can also be sliced according to specified quantiles using pd.qcut().
#This is the process of placing values into bins such that each bin has the same number of items. 
# To do this, the ranges of the quantiles must be determined during the process, so that the distribution is even.

In [32]:
# Quantile-based binning: choose the five bin edges so that
# each bin receives an equal share of the items.
bins = pd.qcut(factors, q=5)
bins

[[-1.927, -1.0559], [-1.927, -1.0559], (0.414, 1.0254], (-1.0559, -0.453], (-1.0559, -0.453], ..., (1.0254, 1.899], (-0.453, 0.414], (-1.0559, -0.453], [-1.927, -1.0559], (-0.453, 0.414]]
Length: 30
Categories (5, object): [[-1.927, -1.0559] < (-1.0559, -0.453] < (-0.453, 0.414] < (0.414, 1.0254] < (1.0254, 1.899]]

In [33]:
# Summarize the quantile bins: the count and frequency of items in each bin.
bins.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"[-1.927, -1.0559]",6,0.2
"(-1.0559, -0.453]",6,0.2
"(-0.453, 0.414]",6,0.2
"(0.414, 1.0254]",6,0.2
"(1.0254, 1.899]",6,0.2


In [28]:
# qcut chooses the bin edges so that each bin holds (as nearly as possible) the same number of records.
# There are 30 records and 5 bins, so each bin should hold 6 (the breakpoints themselves
# will differ depending on the random draw).
# The counts are displayed directly instead of being assigned to `bins`, so `bins` keeps
# referring to the Categorical and cells that use it can still be re-run.
pd.qcut(factors, q=5).value_counts()

[-1.927, -1.0559]    6
(-1.0559, -0.453]    6
(-0.453, 0.414]      6
(0.414, 1.0254]      6
(1.0254, 1.899]      6
dtype: int64

In [35]:
# Fix the seed so the ages drawn here are reproducible.
np.random.seed(20)
# 50 integer ages in the range 6..44 (the upper bound 45 is exclusive).
ages = np.random.randint(low=6, high=45, size=50)
ages

array([41, 32, 21, 37, 34, 32, 15, 26, 17, 28, 13, 40, 38, 27, 32, 32, 25,
       22, 44, 44, 22, 43, 13, 43, 40, 12, 32, 19, 17, 31,  9, 16, 17, 19,
       25, 36, 38, 16, 12, 17, 24,  9, 19, 23, 22, 24, 21, 32, 13, 17])

In [60]:
# Cut the ages into five quantile bins and summarize them.
# pd.qcut derives the bin edges from the data itself, so it takes no explicit
# range edges or labels here.
agebins = pd.qcut(ages, q=5)
agebins.describe()


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"[9, 16.8]",10,0.2
"(16.8, 21.6]",10,0.2
"(21.6, 27.4]",10,0.2
"(27.4, 36.2]",10,0.2
"(36.2, 44]",10,0.2


In [41]:
# Here the slicing has happened in such a way that each bin holds the same number of items.
agebins.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"[9, 16.8]",10,0.2
"(16.8, 21.6]",10,0.2
"(21.6, 27.4]",10,0.2
"(27.4, 36.2]",10,0.2
"(36.2, 44]",10,0.2
