# Unsupervised discretization
Dataset: satimage

Updated at: 26 June 22

By: Sam

### About Dataset
NUMBER OF ATTRIBUTES: 36 (= 4 spectral bands x 9 pixels in the neighbourhood); the pixels are read out in sequence, left-to-right and top-to-bottom.

    - A1-A4: 4 top-left
    - A5-A8: 4 top middle
    - A9-A12: 4 top-right
    => the central pixel is given by attributes 17, 18, 19 and 20

NUMBER OF EXAMPLES:

	- training set     4435
	- test set         2000
    
ATTRIBUTES: The attributes are numerical, in the range 0 to 255.
CLASS: 
	There are 6 decision classes: 1,2,3,4,5 and 7.

!!! NB. There are no examples with class 6 in this dataset; they have all been removed because of doubts about the
	validity of this class.
    
!!! NB. DO NOT USE CROSS-VALIDATION WITH THIS DATASET !!!
- Just train and test only once with the above training and test sets.
- The data is given in random order and certain lines of data have been removed so you cannot reconstruct the original image from this dataset.

In [1]:
# Load library
import pandas as pd
import numpy as np
import time
import timeit

In [2]:
from sklearn.preprocessing import KBinsDiscretizer as kbins # also use for unsupervised

In [3]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd

In [4]:
# Read the cleaned satimage CSV into the working DataFrame
data = pd.read_csv(filepath_or_buffer='clean_satimage.csv')

In [5]:
# Print a summary of the loaded frame: index range, columns, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [6]:
# Store the outcome column ('class') with a categorical dtype so that it is
# not picked up as a numeric attribute when selecting columns to discretize
data['class'] = data['class'].astype('category')

In [7]:
# Names of all numeric columns, as a plain Python list; these are the
# attributes that the discretizers below operate on
num_col = data.select_dtypes(include=np.number).columns.tolist()

In [8]:
# Display the list of numeric column names selected for discretization
num_col

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'A6',
 'A7',
 'A8',
 'A9',
 'A10',
 'A11',
 'A12',
 'A13',
 'A14',
 'A15',
 'A16',
 'A17',
 'A18',
 'A19',
 'A20',
 'A21',
 'A22',
 'A23',
 'A24',
 'A25',
 'A26',
 'A27',
 'A28',
 'A29',
 'A30',
 'A31',
 'A32',
 'A33',
 'A34',
 'A35',
 'A36']

## Equal Width Discretization

In [9]:
# Define function: inputs are the dataset and the number of bins (k)

def ewd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-width bins.

    The columns to transform are taken from the notebook-level list
    ``num_col``; the discretiser is fitted on ``data`` and then applied to
    the same ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize (in this notebook, the frame loaded from CSV).
    k : int
        Desired number of equal width intervals / bins.

    Returns
    -------
    The dataset after discretization, as returned by the discretiser's
    ``transform``.

    Notes
    -----
    Parameters of ``EqualWidthDiscretiser`` (imported here as ``ewd``):

    bins : int, default=10
        Desired number of equal width intervals / bins.

    variables : list
        The list of numerical variables to transform. If None, the
        discretiser will automatically select all numerical type variables.

    return_object : bool, default=False
        Whether the numbers in the discrete variable should be returned as
        numeric or as object. The decision should be made by the user based on
        whether they would like to proceed the engineering of the variable as
        if it was numerical or categorical.

    return_boundaries: bool, default=False
        whether the output should be the interval boundaries. If True, it returns
        the interval boundaries. If False, it returns integers.
    """
    ## set up the discretisation transformer
    # (named differently from the function so the function name is not
    # shadowed inside its own body)
    discretiser = ewd(bins=k, variables=num_col, return_boundaries=False)
    ## fit the transformer
    discretiser.fit(data)
    ## after fitting, discretiser.binner_dict_ stores the interval limits
    ## identified for each variable
    ## transform the data and return the dataset after discretization
    return discretiser.transform(data)

### EWD - Scenario 1: k = 4

In [10]:
# Perform equal-width discretization with k = 4 bins and time the call
k = 4
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ewd1 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds
print("Discretization time, EWD, k = ", k,":",ewd_t) # Total time execution

Discretization time, EWD, k =  4 : 0.09081077575683594


In [11]:
# OUTPUT: preview the first rows of the EWD-discretized dataset (k = 4)
data_ewd1.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A28,A29,A30,A31,A32,A33,A34,A35,A36,class
0,3,3,3,2,2,2,2,1,2,2,...,2,3,3,3,2,2,3,2,1,3
1,2,2,2,1,2,2,2,1,2,2,...,2,2,3,2,1,2,2,2,1,3
2,2,2,2,1,2,2,2,1,2,2,...,1,2,2,2,1,2,2,2,1,3
3,2,2,2,1,2,2,2,1,2,2,...,1,2,2,2,1,2,2,2,1,3
4,2,2,2,1,2,2,2,1,2,2,...,1,2,2,2,1,2,3,2,1,3


In [12]:
## OUTPUT: number of instances per interval in data_ewd1, attribute by attribute
# Equal-width intervals span equal value ranges, so the number of
# observations falling into each bin is generally not the same.
for attr in num_col:
    bin_sizes = data_ewd1.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

A1
A1
0    1036
1    2908
2    1641
3     850
Name: A1, dtype: int64
A2
A2
0     818
1    1979
2    2899
3     739
Name: A2, dtype: int64
A3
A3
0     535
1    2218
2    2859
3     823
Name: A3, dtype: int64
A4
A4
0    1040
1    3930
2    1121
3     344
Name: A4, dtype: int64
A5
A5
0    1058
1    2937
2    1617
3     823
Name: A5, dtype: int64
A6
A6
0     838
1    2002
2    2885
3     710
Name: A6, dtype: int64
A7
A7
0     370
1    2542
2    2944
3     579
Name: A7, dtype: int64
A8
A8
0     802
1    4175
2    1162
3     296
Name: A8, dtype: int64
A9
A9
0    1251
1    2891
2    1811
3     482
Name: A9, dtype: int64
A10
A10
0     738
1    1685
2    2743
3    1269
Name: A10, dtype: int64
A11
A11
0     383
1    2564
2    2931
3     557
Name: A11, dtype: int64
A12
A12
0     802
1    4195
2    1138
3     300
Name: A12, dtype: int64
A13
A13
0    1054
1    2922
2    1627
3     832
Name: A13, dtype: int64
A14
A14
0     818
1    2015
2    2867
3     735
Name: A14, dtype: int64
A15
A15
0     368
1

### EWD - Scenario 2: k = 7

In [13]:
# Perform equal-width discretization with k = 7 bins and time the call
k = 7
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ewd2 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  7 : 0.10009288787841797


In [14]:
# OUTPUT: preview the first rows of the EWD-discretized dataset (k = 7)
data_ewd2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A28,A29,A30,A31,A32,A33,A34,A35,A36,class
0,5,5,5,3,4,4,4,2,4,5,...,4,5,6,5,3,4,5,4,3,3
1,4,4,4,2,4,4,3,2,4,5,...,3,4,5,4,3,4,4,3,2,3
2,4,4,3,2,4,4,3,2,4,4,...,3,4,4,3,2,4,4,3,2,3
3,4,4,3,2,4,4,3,2,4,4,...,2,4,4,3,2,4,5,3,2,3
4,4,4,3,2,4,4,3,2,4,5,...,2,4,5,3,2,4,5,4,3,3


In [15]:
## OUTPUT: number of instances per interval in data_ewd2, attribute by attribute
# Equal-width intervals span equal value ranges, so the number of
# observations falling into each bin is generally not the same.
for attr in num_col:
    bin_sizes = data_ewd2.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

A1
A1
0     484
1     843
2    1293
3    1887
4     937
5     854
6     137
Name: A1, dtype: int64
A2
A2
0     526
1     425
2     939
3    1667
4    1557
5    1294
6      27
Name: A2, dtype: int64
A3
A3
0      41
1     725
2    1395
3    1272
4    1905
5     913
6     184
Name: A3, dtype: int64
A4
A4
0      66
1    1369
2    2167
3    2231
4     202
5     296
6     104
Name: A4, dtype: int64
A5
A5
0     489
1     861
2    1324
3    1877
4     920
5     838
6     126
Name: A5, dtype: int64
A6
A6
0     536
1     433
2     964
3    1658
4    1546
5    1281
6      17
Name: A6, dtype: int64
A7
A7
0      28
1     749
2    1410
3    1557
4    1675
5     942
6      74
Name: A7, dtype: int64
A8
A8
0      35
1    1218
2    2357
3    2221
4     223
5     285
6      96
Name: A8, dtype: int64
A9
A9
0     564
1     803
2    1854
3    1374
4     916
5     810
6     114
Name: A9, dtype: int64
A10
A10
0     536
1     385
2     799
3    1564
4    1419
5    1616
6     116
Name: A10, dtype: int64
A11
A11

### EWD - Scenario 3: k = 10

In [16]:
# Perform equal-width discretization with k = 10 bins and time the call
k = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ewd3 = ewd_disc(data, k)
end = time.perf_counter()
ewd_t = end - start # elapsed seconds
print("Discretization time, EWD, k = ", k,":", ewd_t) # Total time execution

Discretization time, EWD, k =  10 : 0.12206888198852539


In [17]:
# OUTPUT: preview the first rows of the EWD-discretized dataset (k = 10)
data_ewd3.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A28,A29,A30,A31,A32,A33,A34,A35,A36,class
0,8,7,7,5,6,6,5,3,6,7,...,5,7,9,8,5,6,7,6,4,3
1,6,6,6,3,6,6,5,4,6,7,...,5,6,7,6,4,6,6,5,3,3
2,6,6,5,4,6,6,5,3,6,6,...,4,6,6,5,3,6,6,5,3,3
3,6,6,5,3,6,6,5,3,6,6,...,3,6,6,5,3,6,7,5,3,3
4,6,6,5,3,6,6,5,3,6,7,...,3,6,7,5,3,6,7,6,4,3


In [18]:
## OUTPUT: number of instances per interval in data_ewd3, attribute by attribute
# Equal-width intervals span equal value ranges, so the number of
# observations falling into each bin is generally not the same.
for attr in num_col:
    bin_sizes = data_ewd3.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

A1
A1
0     247
1     539
2     541
3    1120
4    1497
5     692
6     751
7     648
8     376
9      24
Name: A1, dtype: int64
A2
A2
0     485
1     149
2     381
3     647
4    1135
5    1049
6    1268
7    1179
8     136
9       6
Name: A2, dtype: int64
A3
A3
0      13
1     194
2     805
3     811
4     930
5    1156
6    1053
7    1063
8     331
9      79
Name: A3, dtype: int64
A4
A4
0      30
1     382
2    1202
3    1620
4    1736
5     930
6     125
7     186
8     169
9      55
Name: A4, dtype: int64
A5
A5
0     259
1     545
2     546
3    1146
4    1499
5     681
6     736
7     644
8     356
9      23
Name: A5, dtype: int64
A6
A6
0     490
1     159
2     389
3     659
4    1143
5    1038
6    1259
7    1164
8     128
9       6
Name: A6, dtype: int64
A7
A7
0       8
1     149
2     767
3     997
4     991
5    1130
6    1307
7     882
8     179
9      25
Name: A7, dtype: int64
A8
A8
0       9
1     169
2    1287
3    1600
4    1912
5     920
6     135
7     198
8     180
9

## Equal Frequency Discretization - EFD
- Reference: https://nbviewer.org/github/feature-engine/feature-engine-examples/blob/main/discretisation/EqualFrequencyDiscretiser.ipynb
- Parameters:
- q : int, default=10
    Desired number of equal frequency intervals / bins. In other words the
    number of quantiles in which the variables should be divided.

- variables : list
    The list of numerical variables that will be discretised. If None, the
    EqualFrequencyDiscretiser() will select all numerical variables.

- return_object : bool, default=False
    Whether the numbers in the discrete variable should be returned as
    numeric or as object. The decision is made by the user based on
    whether they would like to proceed the engineering of the variable as
    if it was numerical or categorical.

- return_boundaries: bool, default=False
    whether the output should be the interval boundaries. If True, it returns
    the interval boundaries. If False, it returns integers.

In [19]:
def efd_disc(data, k):
    """Discretize the numeric attributes of ``data`` into ``k`` equal-frequency bins.

    The columns to transform are taken from the notebook-level list
    ``num_col``; the discretiser (``EqualFrequencyDiscretiser``, imported
    here as ``efd``) is fitted on ``data`` and then applied to the same
    ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize (in this notebook, the frame loaded from CSV).
    k : int
        Desired number of equal frequency intervals, i.e. the number of
        quantiles into which each variable is divided (passed as ``q``).

    Returns
    -------
    The dataset after discretization, as returned by the discretiser's
    ``transform``.
    """
    ## set up the discretisation transformer
    # (named differently from the function so the function name is not
    # shadowed inside its own body)
    discretiser = efd(q=k, variables=num_col)
    ## fit the transformer
    discretiser.fit(data)
    ## after fitting, discretiser.binner_dict_ stores the interval limits
    ## identified for each variable
    ## transform the data
    return discretiser.transform(data)

### Define function efd_disc, inputs include dataset, number of intervals (k)

### EFD - Scenario 1: k = 4

In [20]:
# Perform equal-frequency discretization with k = 4 bins and time the call
k = 4
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_efd1 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds
print("Discretization time, EFD, k = ", k,":", efd_t) # Total time execution

Discretization time, EFD, k =  4 : 0.11689996719360352


In [21]:
## OUTPUT: number of instances per interval in data_efd1, attribute by attribute
for attr in num_col:
    bin_sizes = data_efd1.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

A1
A1
0    1699
1    1725
2    1502
3    1509
Name: A1, dtype: int64
A2
A2
0    1662
1    1706
2    1663
3    1404
Name: A2, dtype: int64
A3
A3
0    1624
1    1714
2    1624
3    1473
Name: A3, dtype: int64
A4
A4
0    1614
1    1620
2    1736
3    1465
Name: A4, dtype: int64
A5
A5
0    1736
1    1751
2    1476
3    1472
Name: A5, dtype: int64
A6
A6
0    1697
1    1540
2    1815
3    1383
Name: A6, dtype: int64
A7
A7
0    1654
1    1702
2    1621
3    1458
Name: A7, dtype: int64
A8
A8
0    1648
1    1598
2    1731
3    1458
Name: A8, dtype: int64
A9
A9
0    1750
1    1471
2    1633
3    1581
Name: A9, dtype: int64
A10
A10
0    1720
1    1564
2    1555
3    1596
Name: A10, dtype: int64
A11
A11
0    1688
1    1564
2    1754
3    1429
Name: A11, dtype: int64
A12
A12
0    1629
1    1652
2    1716
3    1438
Name: A12, dtype: int64
A13
A13
0    1715
1    1720
2    1514
3    1486
Name: A13, dtype: int64
A14
A14
0    1648
1    1579
2    1816
3    1392
Name: A14, dtype: int64
A15
A15
0    1622
1

### EFD - Scenario 2: k = 7

In [22]:
# Perform equal-frequency discretization with k = 7 bins and time the call
k = 7
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_efd2 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  7 : 0.11752176284790039


In [23]:
## OUTPUT: frame summary of data_efd2, followed by the number of
## instances per interval for every discretized attribute
data_efd2.info()
for attr in num_col:
    bin_sizes = data_efd2.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

### EFD - Scenario 3: k = 10

In [24]:
# Perform equal-frequency discretization with k = 10 bins and time the call
k = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_efd3 = efd_disc(data, k)
end = time.perf_counter()
efd_t = end - start # elapsed seconds
print("Discretization time, EFD, k = ", k,":",efd_t) # Total time execution

Discretization time, EFD, k =  10 : 0.11288309097290039


In [25]:
## OUTPUT: frame summary of data_efd3, followed by the number of
## instances per interval for every discretized attribute
data_efd3.info()
for attr in num_col:
    bin_sizes = data_efd3.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

## Fixed Frequency Discretization - FFD

### Define function ffd_disc: modify the input of function efd
Inputs: the dataset and the interval frequency (m), i.e. the target number of instances per interval; the number of bins is n/m.

In [26]:
def ffd_disc(data, m):
    """Discretize the numeric attributes of ``data`` with a fixed interval frequency.

    Implemented on top of the equal-frequency discretiser (``efd``): the
    number of bins is derived from the dataset size as ``round(n / m)``,
    where ``n = len(data)``. The columns to transform are taken from the
    notebook-level list ``num_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to discretize (in this notebook, the frame loaded from CSV).
    m : int
        Interval frequency: target number of instances per interval.
        Must be positive.

    Returns
    -------
    The dataset after discretization, as returned by the discretiser's
    ``transform``.

    Raises
    ------
    ValueError
        If ``m`` is not positive.
    """
    # Reject non-positive frequencies up front instead of failing later with
    # a ZeroDivisionError or an invalid (negative) number of quantiles.
    if m <= 0:
        raise ValueError("m (interval frequency) must be a positive number")
    n = len(data)
    # number of bins = n/m; clamp to at least one bin so that m > 2n does
    # not round down to q = 0
    n_bins = max(1, int(round(n / m)))
    ## set up the discretisation transformer
    # (named differently from the function so the function name is not
    # shadowed inside its own body)
    discretiser = efd(q=n_bins, variables=num_col)
    ## fit the transformer
    discretiser.fit(data)
    ## after fitting, discretiser.binner_dict_ stores the interval limits
    ## identified for each variable
    ## transform the data
    return discretiser.transform(data)

### FFD - Scenario 1: m = 10

In [27]:
# Perform fixed-frequency discretization with m = 10 and time the call
m = 10
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ffd1 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds
print("Discretization time, FFD,  m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD,  m =  10 : 0.3172290325164795


In [28]:
## OUTPUT: frame summary of data_ffd1, followed by the number of
## instances per interval for every discretized attribute
data_ffd1.info()
for attr in num_col:
    bin_sizes = data_ffd1.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

### FFD - Scenario 2: m = 30

In [29]:
# Perform fixed-frequency discretization with m = 30 and time the call
m = 30
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ffd2 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds
# Label reads "FFD": this cell times ffd_disc, not the EFD method
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, EFD, m =  30 : 0.20836710929870605


In [30]:
## OUTPUT: frame summary of data_ffd2, followed by the number of
## instances per interval for every discretized attribute
data_ffd2.info()
for attr in num_col:
    bin_sizes = data_ffd2.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

### FFD - Scenario 3: m = 60

In [31]:
# Perform fixed-frequency discretization with m = 60 and time the call
m = 60
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ffd3 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  60 : 0.19794797897338867


In [32]:
## OUTPUT: frame summary of data_ffd3, followed by the number of
## instances per interval for every discretized attribute
data_ffd3.info()
for attr in num_col:
    bin_sizes = data_ffd3.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

### FFD - Scenario 4: m = 100

In [33]:
# Perform fixed-frequency discretization with m = 100 and time the call
m = 100
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted
start = time.perf_counter() # Starting time
data_ffd4 = ffd_disc(data, m)
end = time.perf_counter()
ffd_t = end - start # elapsed seconds
print("Discretization time, FFD, m = ", m, ":", ffd_t) # Total time execution

Discretization time, FFD, m =  100 : 0.16847515106201172


In [34]:
## OUTPUT: frame summary of data_ffd4, followed by the number of
## instances per interval for every discretized attribute
data_ffd4.info()

for attr in num_col:
    bin_sizes = data_ffd4.groupby(attr)[attr].count()
    print(attr, bin_sizes, sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A1      6435 non-null   int64   
 1   A2      6435 non-null   int64   
 2   A3      6435 non-null   int64   
 3   A4      6435 non-null   int64   
 4   A5      6435 non-null   int64   
 5   A6      6435 non-null   int64   
 6   A7      6435 non-null   int64   
 7   A8      6435 non-null   int64   
 8   A9      6435 non-null   int64   
 9   A10     6435 non-null   int64   
 10  A11     6435 non-null   int64   
 11  A12     6435 non-null   int64   
 12  A13     6435 non-null   int64   
 13  A14     6435 non-null   int64   
 14  A15     6435 non-null   int64   
 15  A16     6435 non-null   int64   
 16  A17     6435 non-null   int64   
 17  A18     6435 non-null   int64   
 18  A19     6435 non-null   int64   
 19  A20     6435 non-null   int64   
 20  A21     6435 non-null   int64   
 21  A22     6435 n

### Export discretized datasets

In [35]:
# EWD datasets: write each equal-width result to its own CSV, without the row index
ewd_exports = (
    ('satimage_ewd1.csv', data_ewd1),  # k=4
    ('satimage_ewd2.csv', data_ewd2),  # k=7
    ('satimage_ewd3.csv', data_ewd3),  # k=10
)
for csv_name, frame in ewd_exports:
    frame.to_csv(csv_name, index=False)

In [36]:
# EFD datasets: write each equal-frequency result to its own CSV, without the row index
efd_exports = (
    ('satimage_efd1.csv', data_efd1),  # k=4
    ('satimage_efd2.csv', data_efd2),  # k=7
    ('satimage_efd3.csv', data_efd3),  # k=10
)
for csv_name, frame in efd_exports:
    frame.to_csv(csv_name, index=False)


In [37]:
# FFD datasets: write each fixed-frequency result to its own CSV, without the row index
ffd_exports = (
    ('satimage_ffd1.csv', data_ffd1),  # m=10
    ('satimage_ffd2.csv', data_ffd2),  # m=30
    ('satimage_ffd3.csv', data_ffd3),  # m=60
    ('satimage_ffd4.csv', data_ffd4),  # m=100
)
for csv_name, frame in ffd_exports:
    frame.to_csv(csv_name, index=False)