# Supervised discretization using Decision Tree discretizer
## Dataset: penbased/HandwirttenDigits (large dataset)
By: Sam & Malina
Update: 02/07/22

### About Dataset

pendigits.tra: Training	7494
pendigits.tes: Testing	3498
	
The way we used the dataset was to use first half of training for  actual training, one-fourth for validation and one-fourth for writer-dependent testing. The test set was used for writer-independent testing and is the actual quality measure.

Number of Attributes: 16 input + 1 class attribute (10 classes from 0-9)
The input vector size is 2xT, two times the number of points resampled. We considered spatial resampling to T=8,12,16 points in our experiments and found that T=8 gave the best trade-off between accuracy and complexity.

No missing value, balanced class

# 1. Prepare dataset

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge
#for decision tree
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser
plt.rcParams["figure.figsize"] = [15,5]
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Read data
data0_trn = pd.read_csv('pendigits_tra.csv', header=None) #training raw data
data0_tst = pd.read_csv('pendigits_tes.csv', header=None) #testing raw data

In [3]:
data0_trn.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,47,100,27,81,57,37,26,0,0,23,56,53,100,90,40,98,8
1,0,89,27,100,42,75,29,45,15,15,37,0,69,2,100,6,2
2,0,57,31,68,72,90,100,100,76,75,50,51,28,25,16,0,1
3,0,100,7,92,5,68,19,45,86,34,100,45,74,23,67,0,4
4,0,67,49,83,100,100,81,80,60,60,40,40,33,20,47,0,1


In [4]:
col0 = pd.read_csv('pendigits_col.csv', header=None, squeeze=True) #import column name

In [5]:
data0_trn.columns = col0 # change column name
data0_tst.columns = col0

In [6]:
data0_trn.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,class
0,47,100,27,81,57,37,26,0,0,23,56,53,100,90,40,98,8
1,0,89,27,100,42,75,29,45,15,15,37,0,69,2,100,6,2
2,0,57,31,68,72,90,100,100,76,75,50,51,28,25,16,0,1
3,0,100,7,92,5,68,19,45,86,34,100,45,74,23,67,0,4
4,0,67,49,83,100,100,81,80,60,60,40,40,33,20,47,0,1


In [7]:
data0_trn['class'] = pd.Categorical(data0_trn['class']) # convert class to categorical data
data0_tst['class'] = pd.Categorical(data0_tst['class'])

In [8]:
# Check number of unique values
data0_tst.select_dtypes(include=np.number).nunique()

0
A1     101
A2      80
A3     101
A4      88
A5     101
A6     101
A7     101
A8     101
A9     101
A10    100
A11    101
A12    101
A13    101
A14    101
A15    101
A16    101
dtype: int64

In [9]:
# Check number of unique values
data0_trn.select_dtypes(include=np.number).nunique()

0
A1     101
A2      96
A3     101
A4      96
A5     101
A6     101
A7     101
A8     101
A9     101
A10    101
A11    101
A12    101
A13    101
A14    101
A15    101
A16    101
dtype: int64

In [10]:
data0 = pd.concat([data0_trn, data0_tst])
data0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10992 entries, 0 to 3497
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      10992 non-null  int64   
 1   A2      10992 non-null  int64   
 2   A3      10992 non-null  int64   
 3   A4      10992 non-null  int64   
 4   A5      10992 non-null  int64   
 5   A6      10992 non-null  int64   
 6   A7      10992 non-null  int64   
 7   A8      10992 non-null  int64   
 8   A9      10992 non-null  int64   
 9   A10     10992 non-null  int64   
 10  A11     10992 non-null  int64   
 11  A12     10992 non-null  int64   
 12  A13     10992 non-null  int64   
 13  A14     10992 non-null  int64   
 14  A15     10992 non-null  int64   
 15  A16     10992 non-null  int64   
 16  class   10992 non-null  category
dtypes: category(1), int64(16)
memory usage: 1.4 MB


In [11]:
data0.iloc[7494]==data0_tst.iloc[0] # Check: first item of tesing dataset in the merged dataset

0
A1       True
A2       True
A3       True
A4       True
A5       True
A6       True
A7       True
A8       True
A9       True
A10      True
A11      True
A12      True
A13      True
A14      True
A15      True
A16      True
class    True
Name: 0, dtype: bool

In [12]:
# Export this dataset for discretization
data0.to_csv('clean_pendigits.csv',index=False)

# 2.  Preprocess for discretization

In [13]:
#load data
pen = data0
pen

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,class
0,47,100,27,81,57,37,26,0,0,23,56,53,100,90,40,98,8
1,0,89,27,100,42,75,29,45,15,15,37,0,69,2,100,6,2
2,0,57,31,68,72,90,100,100,76,75,50,51,28,25,16,0,1
3,0,100,7,92,5,68,19,45,86,34,100,45,74,23,67,0,4
4,0,67,49,83,100,100,81,80,60,60,40,40,33,20,47,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,36,100,24,70,0,38,49,33,95,47,87,55,96,21,100,0,4
3494,16,75,41,100,52,64,32,27,0,0,21,9,62,2,100,14,2
3495,56,100,27,79,0,39,12,0,66,15,100,51,93,93,38,93,0
3496,19,100,0,61,3,23,48,0,97,27,100,66,62,97,10,81,0


In [14]:
#cast categorical class values into integer
pen = pen.astype({'class':'int64'}) 
pen.dtypes

0
A1       int64
A2       int64
A3       int64
A4       int64
A5       int64
A6       int64
A7       int64
A8       int64
A9       int64
A10      int64
A11      int64
A12      int64
A13      int64
A14      int64
A15      int64
A16      int64
class    int64
dtype: object

In [15]:
#exstract class column
y_cat = pd.DataFrame(pen['class'])
y_cat.reset_index(drop=True, inplace=True)
y_cat

Unnamed: 0,class
0,8
1,2
2,1
3,4
4,1
...,...
10987,4
10988,2
10989,0
10990,0


In [16]:
#obtain columns
num_list = pen.columns.drop('class')
#num_list = pen.columns
num_list

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'A16'],
      dtype='object', name=0)

# 3. Decision Tree discretization

In [17]:
#load data
data = pen
# let's separate into training and testing set
# Separate into train and test sets
X_train, X_test, y_train, y_test =  train_test_split(
            data,
            data['class'], test_size=0.3, random_state=0)

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (7694, 17)
X_test : (3298, 17)


## 3.1 DT with small max_depth

In [18]:
#make DT discreizer
# 'max_depth': [2] => 2^2 = 4 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'A16'],
                                   regression=False,
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.013284  0.033189  0.127541  0.155546  0.003597  0.004622  0.009185   
6018  0.013284  0.026034  0.077562  0.155546  0.003597  0.000000  0.091423   
2636  0.055626  0.026034  0.127541  0.155546  0.174248  0.043328  0.091423   
5585  0.060336  0.033189  0.127541  0.048487  0.174248  0.004622  0.091423   
2188  0.060336  0.026034  0.029518  0.155546  0.021358  0.004622  0.091423   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.060336  0.033189  0.113821  0.155546  0.140887  0.213406  0.009185   
326   0.060336  0.033189  0.077562  0.155546  0.174248  0.043328  0.091423   
5466  0.311728  0.033189  0.127541  0.048487  0.140887  0.213406  0.091423   
463   0.055626  0.033189  0.127541  0.048487  0.174248  0.213406  0.454969   
1722  0.060336  0.033189  0.127541  0.048487  0.174248  0.043328  0.091423   

0           A8        A9       A10       A11       A12       A1

In [19]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: A1
4
Entries per interval for A1
Counter({0.06033604887983707: 5567, 0.3117283950617284: 2364, 0.013284132841328414: 1918, 0.05562579013906448: 1143})
 
No of bins: A2
3
Entries per interval for A2
Counter({0.03318903318903319: 5893, 0.02603440260344026: 3115, 0.4404332129963899: 1984})
 
No of bins: A3
4
Entries per interval for A3
Counter({0.12754123513617185: 7458, 0.029517638588912886: 1954, 0.07756232686980609: 1043, 0.11382113821138211: 537})
 
No of bins: A4
4
Entries per interval for A4
Counter({0.04848682069638789: 4387, 0.15554645927138763: 3489, 0.13941176470588235: 2431, 0.0794979079497908: 685})
 
No of bins: A5
4
Entries per interval for A5
Counter({0.17424798239178282: 3875, 0.14088669950738916: 2950, 0.02135815991237678: 2599, 0.0035971223021582736: 1568})
 
No of bins: A6
4
Entries per interval for A6
Counter({0.2134059008879977: 5007, 0.004622496147919877: 2814, 0.043327556325823226: 1630, 0.0: 1541})
 
No of bins: A7
4
Entries per interval for A7
Counter(

In [20]:
intervals = pd.DataFrame(columns = ['intervals'])
for i in disc: 
    k= disc[i].nunique()
    intervals.loc[len(intervals)] = k
intervals.to_csv('intervals_pen_DT_small.csv',index=False)

In [21]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = pen.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_small_discretized_pen.csv',index=False)

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.013284  0.033189  0.127541  0.155546  0.003597  0.004622  0.009185   
6018  0.013284  0.026034  0.077562  0.155546  0.003597  0.000000  0.091423   
2636  0.055626  0.026034  0.127541  0.155546  0.174248  0.043328  0.091423   
5585  0.060336  0.033189  0.127541  0.048487  0.174248  0.004622  0.091423   
2188  0.060336  0.026034  0.029518  0.155546  0.021358  0.004622  0.091423   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.060336  0.033189  0.113821  0.155546  0.140887  0.213406  0.009185   
326   0.060336  0.033189  0.077562  0.155546  0.174248  0.043328  0.091423   
5466  0.311728  0.033189  0.127541  0.048487  0.140887  0.213406  0.091423   
463   0.055626  0.033189  0.127541  0.048487  0.174248  0.213406  0.454969   
1722  0.060336  0.033189  0.127541  0.048487  0.174248  0.043328  0.091423   

0           A8        A9       A10       A11       A12       A1

## 3.2 DT with medium max_depth

In [22]:
#make DT discreizer
# 'max_depth': [3] => 2^3 = 8 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'A16'],
                                   regression=False,
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.016058  0.007625  0.169659  0.127479  0.003597  0.000000  0.008299   
6018  0.016058  0.026034  0.078825  0.127479  0.003597  0.000000  0.066529   
2636  0.055626  0.026034  0.096115  0.166955  0.200745  0.071713  0.105055   
5585  0.074951  0.053402  0.096115  0.048487  0.200745  0.007335  0.066529   
2188  0.032520  0.026034  0.029563  0.166955  0.004132  0.007335  0.105055   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.074951  0.053402  0.114068  0.166955  0.122613  0.393939  0.008299   
326   0.032520  0.007625  0.078825  0.166955  0.115566  0.021472  0.105055   
5466  0.311728  0.007625  0.096115  0.048487  0.122613  0.160430  0.105055   
463   0.055626  0.053402  0.096115  0.048487  0.200745  0.160430  0.258065   
1722  0.032520  0.007625  0.096115  0.048487  0.200745  0.021472  0.105055   

0           A8        A9       A10       A11       A12       A1

In [23]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: A1
6
Entries per interval for A1
Counter({0.0749514563106796: 3634, 0.3117283950617284: 2364, 0.032520325203252036: 1933, 0.05562579013906448: 1143, 0.016058394160583942: 975, 0.010447761194029851: 943})
 
No of bins: A2
5
Entries per interval for A2
Counter({0.053402239448751075: 3334, 0.02603440260344026: 3115, 0.007625272331154684: 2559, 0.2980132450331126: 1070, 0.6111111111111112: 914})
 
No of bins: A3
8
Entries per interval for A3
Counter({0.09611520428667113: 4246, 0.1696588868940754: 3212, 0.02956298200514139: 1105, 0.07882534775888717: 930, 0.029459901800327332: 849, 0.11406844106463879: 373, 0.11320754716981132: 164, 0.06666666666666667: 113})
 
No of bins: A4
7
Entries per interval for A4
Counter({0.04848682069638789: 4387, 0.1669545192861255: 2481, 0.15959741193386054: 1993, 0.1274787535410765: 1008, 0.078125: 451, 0.04854368932038835: 438, 0.08227848101265822: 234})
 
No of bins: A5
7
Entries per interval for A5
Counter({0.2007454739084132: 2723, 0.02757078986

In [24]:
intervals = pd.DataFrame(columns = ['intervals'])
for i in disc: 
    k= disc[i].nunique()
    intervals.loc[len(intervals)] = k
intervals.to_csv('intervals_pen_DT_medium.csv',index=False)

In [25]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = pen.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_medium_discretized_pen.csv',index=False)

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.016058  0.007625  0.169659  0.127479  0.003597  0.000000  0.008299   
6018  0.016058  0.026034  0.078825  0.127479  0.003597  0.000000  0.066529   
2636  0.055626  0.026034  0.096115  0.166955  0.200745  0.071713  0.105055   
5585  0.074951  0.053402  0.096115  0.048487  0.200745  0.007335  0.066529   
2188  0.032520  0.026034  0.029563  0.166955  0.004132  0.007335  0.105055   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.074951  0.053402  0.114068  0.166955  0.122613  0.393939  0.008299   
326   0.032520  0.007625  0.078825  0.166955  0.115566  0.021472  0.105055   
5466  0.311728  0.007625  0.096115  0.048487  0.122613  0.160430  0.105055   
463   0.055626  0.053402  0.096115  0.048487  0.200745  0.160430  0.258065   
1722  0.032520  0.007625  0.096115  0.048487  0.200745  0.021472  0.105055   

0           A8        A9       A10       A11       A12       A1

## 3.3 DT with large max_depth

In [26]:
#make DT discreizer
# 'max_depth': [4] => 2^4 = 16 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'A16'],
                                   regression=False,
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.021944  0.009752  0.155417  0.139037  0.003597  0.000000  0.010941   
6018  0.021944  0.026034  0.075975  0.123314  0.003597  0.000000  0.073243   
2636  0.055626  0.026034  0.100917  0.186027  0.217454  0.070833  0.086556   
5585  0.065431  0.099129  0.093766  0.048487  0.217454  0.014458  0.073243   
2188  0.030547  0.026034  0.034483  0.125683  0.006024  0.014458  0.086556   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.065431  0.099129  0.120482  0.186027  0.079602  0.393939  0.010941   
326   0.034200  0.004237  0.075975  0.125683  0.101322  0.022099  0.086556   
5466  0.311728  0.009752  0.093766  0.048487  0.079602  0.164677  0.177285   
463   0.055626  0.023504  0.100917  0.048487  0.152083  0.164677  0.258065   
1722  0.030547  0.009752  0.100917  0.048487  0.217454  0.022099  0.086556   

0           A8        A9       A10       A11       A12       A1

In [27]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: A1
10
Entries per interval for A1
Counter({0.06543138390272148: 2439, 0.3117283950617284: 2364, 0.09433962264150944: 1195, 0.05562579013906448: 1143, 0.03419972640218878: 1042, 0.03054662379421222: 891, 0.012437810945273632: 570, 0.01092896174863388: 515, 0.0219435736677116: 460, 0.007462686567164179: 373})
 
No of bins: A2
9
Entries per interval for A2
Counter({0.02603440260344026: 3115, 0.023504273504273504: 2022, 0.00975177304964539: 1560, 0.09912854030501089: 1312, 0.00423728813559322: 999, 0.6233766233766234: 887, 0.24385245901639344: 706, 0.3970037453183521: 364, 0.07142857142857142: 27})
 
No of bins: A3
14
Entries per interval for A3
Counter({0.09376558603491272: 2858, 0.15541740674955595: 1631, 0.18421052631578946: 1581, 0.10091743119266056: 1388, 0.034482758620689655: 953, 0.07597535934291581: 687, 0.034: 687, 0.12048192771084337: 350, 0.0875: 243, 0.0: 175, 0.009009009009009009: 162, 0.10989010989010989: 143, 0.06666666666666667: 113, 0.13333333333333333: 21})
 


In [28]:
intervals = pd.DataFrame(columns = ['intervals'])
for i in disc: 
    k= disc[i].nunique()
    intervals.loc[len(intervals)] = k
intervals.to_csv('intervals_pen_DT_large.csv',index=False)

In [29]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = pen.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_large_discretized_pen.csv',index=False)

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.021944  0.009752  0.155417  0.139037  0.003597  0.000000  0.010941   
6018  0.021944  0.026034  0.075975  0.123314  0.003597  0.000000  0.073243   
2636  0.055626  0.026034  0.100917  0.186027  0.217454  0.070833  0.086556   
5585  0.065431  0.099129  0.093766  0.048487  0.217454  0.014458  0.073243   
2188  0.030547  0.026034  0.034483  0.125683  0.006024  0.014458  0.086556   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.065431  0.099129  0.120482  0.186027  0.079602  0.393939  0.010941   
326   0.034200  0.004237  0.075975  0.125683  0.101322  0.022099  0.086556   
5466  0.311728  0.009752  0.093766  0.048487  0.079602  0.164677  0.177285   
463   0.055626  0.023504  0.100917  0.048487  0.152083  0.164677  0.258065   
1722  0.030547  0.009752  0.100917  0.048487  0.217454  0.022099  0.086556   

0           A8        A9       A10       A11       A12       A1

## 3.4 DT with extra large max_depth

In [30]:
#make DT discreizer
# 'max_depth': [5] => 2^5 = 32 intervals max. 
import time
start = time.time() # For measuring time execution
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11',
       'A12', 'A13', 'A14', 'A15', 'A16'],
                                   regression=False,
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

treeDisc.fit(X_train, y_train)

# transform the data
train_t= treeDisc.transform(X_train)
test_t= treeDisc.transform(X_test)

#add on to categorical dataset again
disc = pd.concat([train_t, test_t], axis=0)
print(disc)
#categorical = categorical.drop('label', axis=1)

# put side by side the original variable and the transformed variable
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
end = time.time()
print(end - start) # Total time execution for this sample

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.017921  0.010934  0.115234  0.159574  0.003597  0.000000  0.008108   
6018  0.050000  0.026034  0.042735  0.113801  0.003597  0.000000  0.061412   
2636  0.055626  0.026034  0.106944  0.189781  0.236589  0.101695  0.100452   
5585  0.066964  0.067213  0.077911  0.048487  0.196721  0.008772  0.061412   
2188  0.032258  0.026034  0.034483  0.103896  0.010417  0.008772  0.077596   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.066964  0.162338  0.125000  0.189781  0.093567  0.393939  0.008108   
326   0.037258  0.002924  0.086486  0.103896  0.091371  0.025105  0.100452   
5466  0.311728  0.010934  0.077911  0.048487  0.000000  0.142447  0.139037   
463   0.055626  0.026042  0.106944  0.048487  0.158537  0.142447  0.258065   
1722  0.032258  0.010934  0.106944  0.048487  0.236589  0.025105  0.077596   

0           A8        A9       A10       A11       A12       A1

In [31]:
#Show number of bins for each variable
#no of bins
for i in disc:
    print('No of bins: ' + i)
    print(disc[i].nunique())
    #show start of intervals of each bin
    print('Entries per interval for ' + i)
    print(Counter(disc[i]))
    print(' ')

No of bins: A1
16
Entries per interval for A1
Counter({0.3117283950617284: 2364, 0.06696428571428571: 1881, 0.05562579013906448: 1143, 0.10256410256410256: 1051, 0.037257824143070044: 957, 0.03225806451612903: 630, 0.06005221932114883: 558, 0.013477088948787063: 528, 0.010309278350515464: 404, 0.017921146953405017: 398, 0.008298755186721992: 341, 0.026595744680851064: 261, 0.0: 159, 0.037383177570093455: 144, 0.013333333333333334: 111, 0.05: 62})
 
No of bins: A2
16
Entries per interval for A2
Counter({0.02603440260344026: 3115, 0.010934393638170975: 1388, 0.026041666666666668: 1070, 0.020440251572327043: 952, 0.06721311475409836: 875, 0.2283653846153846: 592, 0.7076923076923077: 573, 0.00546448087431694: 505, 0.0029239766081871343: 494, 0.16233766233766234: 437, 0.4778761061946903: 314, 0.4421768707482993: 198, 0.0: 172, 0.3416666666666667: 166, 0.3333333333333333: 114, 0.07142857142857142: 27})
 
No of bins: A3
22
Entries per interval for A3
Counter({0.0779109589041096: 1679, 0.11589

In [32]:
intervals = pd.DataFrame(columns = ['intervals'])
for i in disc: 
    k= disc[i].nunique()
    intervals.loc[len(intervals)] = k
intervals.to_csv('intervals_pen_DT_extralarge.csv',index=False)

In [33]:
#ordinal encoding
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder
# define data
data1 = asarray(disc)
print(disc)
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
result = pd.DataFrame(encoder.fit_transform(disc))
#print(result)
disc_ord = pd.DataFrame(result).astype(int)
tmp_col = pen.columns
disc_ord.columns = tmp_col # change column name
#print(disc_ord)
#disc_ord = pd.concat([categorical, disc_ord], axis=1)
print(disc_ord)
disc_ord.isna().sum()
# Export this dataset for discretization
disc_ord.to_csv('DT_verylarge_discretized_pen.csv',index=False)

0           A1        A2        A3        A4        A5        A6        A7  \
992   0.017921  0.010934  0.115234  0.159574  0.003597  0.000000  0.008108   
6018  0.050000  0.026034  0.042735  0.113801  0.003597  0.000000  0.061412   
2636  0.055626  0.026034  0.106944  0.189781  0.236589  0.101695  0.100452   
5585  0.066964  0.067213  0.077911  0.048487  0.196721  0.008772  0.061412   
2188  0.032258  0.026034  0.034483  0.103896  0.010417  0.008772  0.077596   
...        ...       ...       ...       ...       ...       ...       ...   
5643  0.066964  0.162338  0.125000  0.189781  0.093567  0.393939  0.008108   
326   0.037258  0.002924  0.086486  0.103896  0.091371  0.025105  0.100452   
5466  0.311728  0.010934  0.077911  0.048487  0.000000  0.142447  0.139037   
463   0.055626  0.026042  0.106944  0.048487  0.158537  0.142447  0.258065   
1722  0.032258  0.010934  0.106944  0.048487  0.236589  0.025105  0.077596   

0           A8        A9       A10       A11       A12       A1