# WELCH t-test
By: Sam<br>
Updated at: 07/07/2022<br>
Compare performance of discretizers <br>

===

Input data: instrinsic properties and model performance metrics
!!! **NB: Please update the data for metrics and export to csv before running this script!

In [1]:
# Import library
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Import evaluation data (updated at 04707.2022)
data = pd.read_csv("evaluation_0707.csv")

In [3]:
data.head()

Unnamed: 0,dataset,disc,param,inconsistency,models,accuracy,con_features,time_disc,time_train,bias,variance
0,iris,EWD,4,0.066667,ID3,0.84,4,0.016412,0.008698225,0.158,0.055
1,iris,EWD,7,0.02,ID3,0.79,4,0.015692,0.010634899,0.158,0.054
2,iris,EWD,10,0.006667,ID3,0.95,4,0.01638,0.010643005,0.053,0.014
3,iris,EFD,4,0.04,ID3,0.84,4,0.016688,0.009439945,0.158,0.049
4,iris,EFD,7,0.04,ID3,0.95,4,0.023941,0.010675907,0.053,0.07


In [4]:
data['disc'].unique()

array(['EWD', 'EFD', 'FFD', 'ChiMerge', 'DT'], dtype=object)

In [5]:
data.columns

Index(['dataset', 'disc', 'param', 'inconsistency', 'models', 'accuracy',
       'con_features', 'time_disc', 'time_train', 'bias', 'variance'],
      dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   dataset        162 non-null    object 
 1   disc           162 non-null    object 
 2   param          162 non-null    int64  
 3   inconsistency  162 non-null    float64
 4   models         162 non-null    object 
 5   accuracy       161 non-null    object 
 6   con_features   162 non-null    int64  
 7   time_disc      162 non-null    float64
 8   time_train     161 non-null    object 
 9   bias           118 non-null    object 
 10  variance       118 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 14.0+ KB


In [7]:
data['models'].unique()

array(['ID3', 'CNB', 'Knn-VDM'], dtype=object)

## 1. Test for accuracy (Completed)

### 1.1 Data preparation

In [8]:
pd.to_numeric(data['accuracy'], errors='coerce')

0      0.84
1      0.79
2      0.95
3      0.84
4      0.95
       ... 
157    0.95
158    0.91
159    0.90
160    0.91
161    0.88
Name: accuracy, Length: 162, dtype: float64

In [9]:
# No filter of algorithm
# Obtain accuracy for each discretization, convert into numeric, string values will be return as NaN
ewd_acc = pd.to_numeric(data[data['disc']=="EWD"]['accuracy'],errors='coerce')
efd_acc = pd.to_numeric(data[data['disc']=="EFD"]['accuracy'],errors='coerce')
ffd_acc = pd.to_numeric(data[data['disc']=="FFD"]['accuracy'],errors='coerce')
cm_acc = pd.to_numeric(data[data['disc']=="ChiMerge"]['accuracy'],errors='coerce')
dt_acc = pd.to_numeric(data[data['disc']=="DT"]['accuracy'],errors='coerce')

In [10]:
# Filter for CNB
# Obtain accuracy for each discretization, convert into numeric, string values will be return as NaN
ewd_acc_cnb = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="CNB")]['accuracy'],errors='coerce')
efd_acc_cnb = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="CNB")]['accuracy'],errors='coerce')
ffd_acc_cnb = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="CNB")]['accuracy'],errors='coerce')
cm_acc_cnb = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="CNB")]['accuracy'],errors='coerce')
dt_acc_cnb = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="CNB")]['accuracy'],errors='coerce')

In [11]:
# Filter for ID3
# Obtain accuracy for each discretization, convert into numeric, string values will be return as NaN
ewd_acc_id3 = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="ID3")]['accuracy'],errors='coerce')
efd_acc_id3 = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="ID3")]['accuracy'],errors='coerce')
ffd_acc_id3 = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="ID3")]['accuracy'],errors='coerce')
cm_acc_id3 = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="ID3")]['accuracy'],errors='coerce')
dt_acc_id3 = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="ID3")]['accuracy'],errors='coerce')

In [12]:
# Filter for KNN
# Obtain accuracy for each discretization, convert into numeric, string values will be return as NaN
ewd_acc_knn = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="Knn-VDM")]['accuracy'],errors='coerce')
efd_acc_knn = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="Knn-VDM")]['accuracy'],errors='coerce')
ffd_acc_knn = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="Knn-VDM")]['accuracy'],errors='coerce')
cm_acc_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['accuracy'],errors='coerce')
dt_acc_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['accuracy'],errors='coerce')

In [13]:
# # Try some pairs of discretization
# # ewd vs efd => statistic=-0.9250871938582758, pvalue=0.35954824133146124
# stats.ttest_ind(ewd_acc, efd_acc, nan_policy='omit', equal_var=False)
# # efd vs ffd => statistic=0.6569767787116787, pvalue=0.5141497794388237
# stats.ttest_ind(efd_acc, ffd_acc, nan_policy='omit')
# # efd vs ChiMerge => statistic=0.553172178811246, pvalue=0.5827129429964153
# stats.ttest_ind(efd_acc, cm_acc, nan_policy='omit', equal_var=False)

### 1.2  Welch t test, accuracy
Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html
Calculate the T-test for the means of two independent samples of scores.
**equal_varbool = False**: perform Welch’s t-test, which does not assume equal population variance [2].

Implement 4 replications: (DONE)
- Regardless algorithms
- Filter for each algorithm: CNB, ID3, Knn

#### 1.2.1 Welch t test, accuracy, Regardless of algorithm

In [14]:
# Regardless of algorithm
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_acc, efd_acc, ffd_acc, cm_acc, dt_acc]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [15]:
# Result table
wt_result = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result.drop_duplicates(subset=['p_value'], inplace=True)

In [16]:
wt_result

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.920422,0.362147
1,ewd vs ffd,-0.24526,0.807284
2,ewd vs cm,1.295407,0.202916
3,ewd vs dt,-0.950188,0.346351
5,efd vs ffd,0.659194,0.512792
6,efd vs cm,1.687247,0.100043
7,efd vs dt,-0.05893,0.953206
10,ffd vs cm,1.40209,0.168926
11,ffd vs dt,-0.695957,0.489258
15,cm vs dt,-1.702128,0.097036


#### 1.2.2 Welch t test, accuracy, only for CNB

In [17]:
# Regardless of algorithm
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_acc_cnb, efd_acc_cnb, ffd_acc_cnb, cm_acc_cnb, dt_acc_cnb]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [18]:
# Result table
wt_result_cnb = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_cnb.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_cnb

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.279164,0.783699
1,ewd vs ffd,0.423,0.67746
2,ewd vs cm,-0.795779,0.436792
3,ewd vs dt,-0.366705,0.719436
5,efd vs ffd,0.71884,0.481653
6,efd vs cm,-0.523752,0.606941
7,efd vs dt,-0.036363,0.971504
10,ffd vs cm,-1.255862,0.223643
11,ffd vs dt,-0.901816,0.379434
15,cm vs dt,0.591075,0.562126


#### 1.2.3 Welch t test, accuracy, only for ID3

In [19]:
# Regardless of algorithm
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_acc_id3, efd_acc_id3, ffd_acc_id3, cm_acc_id3, dt_acc_id3]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [20]:
# Result table
wt_result_id3 = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_id3.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_id3

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.574737,0.573551
1,ewd vs ffd,-1.02002,0.323795
2,ewd vs cm,-1.45667,0.166371
3,ewd vs dt,-2.091571,0.053834
5,efd vs ffd,-0.413756,0.684461
6,efd vs cm,-0.873616,0.395444
7,efd vs dt,-1.56127,0.137756
10,ffd vs cm,-0.525276,0.604656
11,ffd vs dt,-1.323224,0.19935
15,cm vs dt,-0.826704,0.417297


#### 1.2.4 Welch t test, accuracy, only for KNN

In [21]:
# Regardless of algorithm
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_acc_knn, efd_acc_knn, ffd_acc_knn, cm_acc_knn, dt_acc_knn]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [22]:
# Result table
wt_result_knn = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_knn.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_knn

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.611674,0.559909
1,ewd vs ffd,0.081786,0.937885
2,ewd vs cm,2.276337,0.046983
5,efd vs ffd,0.483759,0.657533
6,efd vs cm,2.437014,0.03717
10,ffd vs cm,2.180695,0.052755
15,cm vs dt,0.0,1.0


## 2. Test for Bias (Not complete)

### 2.1 Data preparation

In [23]:
pd.to_numeric(data['bias'], errors='coerce')

0      0.158
1      0.158
2      0.053
3      0.158
4      0.053
       ...  
157      NaN
158      NaN
159      NaN
160      NaN
161      NaN
Name: bias, Length: 162, dtype: float64

In [24]:
# No filter of algorithm
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias = pd.to_numeric(data[data['disc']=="EWD"]['bias'],errors='coerce')
efd_bias = pd.to_numeric(data[data['disc']=="EFD"]['bias'],errors='coerce')
ffd_bias = pd.to_numeric(data[data['disc']=="FFD"]['bias'],errors='coerce')
cm_bias = pd.to_numeric(data[data['disc']=="ChiMerge"]['bias'],errors='coerce')
dt_bias = pd.to_numeric(data[data['disc']=="DT"]['bias'],errors='coerce')

In [25]:
# Filter for CNB
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_cnb = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="CNB")]['bias'],errors='coerce')
efd_bias_cnb = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="CNB")]['bias'],errors='coerce')
ffd_bias_cnb = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="CNB")]['bias'],errors='coerce')
cm_bias_cnb = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="CNB")]['bias'],errors='coerce')
dt_bias_cnb = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="CNB")]['bias'],errors='coerce')

In [26]:
# Filter for ID3
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_id3 = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="ID3")]['bias'],errors='coerce')
efd_bias_id3 = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="ID3")]['bias'],errors='coerce')
ffd_bias_id3 = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="ID3")]['bias'],errors='coerce')
cm_bias_id3 = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="ID3")]['bias'],errors='coerce')
dt_bias_id3 = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="ID3")]['bias'],errors='coerce')

In [27]:
# Filter for KNN
# Obtain bias for each discretization, convert into numeric, string values will be return as NaN
ewd_bias_knn = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce')
efd_bias_knn = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce')
ffd_bias_knn = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="Knn-VDM")]['bias'],errors='coerce')
cm_bias_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['bias'],errors='coerce')
dt_bias_knn = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="Knn-VDM")]['bias'],errors='coerce')

### 2.2 Welch t-test, bias
Implement 4 replications:
- Regardless algorithms
- Filter for each algorithm: CNB, ID3, Knn

#### 2.2.1 Welch t-test, bias, regarless algorithms

In [28]:
# WELCH T-TEST
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_bias, efd_bias, ffd_bias, cm_bias, dt_bias]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [29]:
# Result table
wt_result_bias = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_bias.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_bias

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,0.53124,0.598868
1,ewd vs ffd,0.005088,0.995969
2,ewd vs cm,0.923007,0.362986
3,ewd vs dt,1.391556,0.172887
5,efd vs ffd,-0.583945,0.562191
6,efd vs cm,0.429372,0.669879
7,efd vs dt,0.986329,0.329795
10,ffd vs cm,1.025301,0.310479
11,ffd vs dt,1.524261,0.134461
15,cm vs dt,0.6129,0.543299


#### 2.2.2 Welch t-test, bias, only CNB

In [30]:
# WELCH T-TEST
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_bias_cnb, efd_bias_cnb, ffd_bias_cnb, cm_bias_cnb, dt_bias_cnb]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [31]:
# Result table
wt_result_bias_cnb = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_bias_cnb.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_bias_cnb

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,1.191505,0.263294
1,ewd vs ffd,0.076711,0.940749
2,ewd vs cm,0.928437,0.382739
3,ewd vs dt,1.004392,0.341432
5,efd vs ffd,-1.346509,0.196778
6,efd vs cm,-0.424913,0.676915
7,efd vs dt,-0.021132,0.983557
10,ffd vs cm,1.06981,0.299667
11,ffd vs dt,1.072768,0.312018
15,cm vs dt,0.319054,0.757678


#### 2.2.1 Welch t-test, bias, only ID3

In [32]:
ewd_bias_id3

0      0.158
1      0.158
2      0.053
54     0.140
55     0.115
56     0.105
108    0.050
109    0.021
110    0.023
Name: bias, dtype: float64

In [33]:
# WELCH T-TEST
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_bias_id3, efd_bias_id3, ffd_bias_id3, cm_bias_id3, dt_bias_id3]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [34]:
# Result table
wt_result_bias_id3 = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_bias_id3.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_bias_id3

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,0.757624,0.459887
1,ewd vs ffd,1.005856,0.330201
2,ewd vs cm,1.580883,0.139061
3,ewd vs dt,1.551224,0.138842
5,efd vs ffd,0.20116,0.843002
6,efd vs cm,0.766635,0.456381
7,efd vs dt,0.851207,0.405545
10,ffd vs cm,0.630486,0.535382
11,ffd vs dt,0.734285,0.470842
15,cm vs dt,0.260708,0.797246


#### 2.2.3 Welch t-test, bias, only KNN

In [35]:
# WELCH T-TEST
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_bias_knn, efd_bias_knn, ffd_bias_knn, cm_bias_knn, dt_bias_knn]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [36]:
# Result table
wt_result_bias_knn = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_bias_knn.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_bias_knn

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.31092,0.772565
1,ewd vs ffd,-0.682243,0.525449
2,ewd vs cm,0.359461,0.753656
3,ewd vs dt,-0.497313,0.640185
5,efd vs ffd,-0.47717,0.654378
6,efd vs cm,1.0,0.42265
7,efd vs dt,-0.267566,0.800117
10,ffd vs cm,1.201161,0.315873
11,ffd vs dt,0.184951,0.859369
15,cm vs dt,-0.980952,0.398954


## 3. Test for Variance (Completed)

### 3.1. Data preparation

In [37]:
pd.to_numeric(data['variance'], errors='coerce')

0      0.055
1      0.054
2      0.014
3      0.049
4      0.070
       ...  
157      NaN
158      NaN
159      NaN
160      NaN
161      NaN
Name: variance, Length: 162, dtype: float64

In [38]:
# No filter of algorithm
# Obtain var for each discretization, convert into numeric, string values will be return as NaN
ewd_var = pd.to_numeric(data[data['disc']=="EWD"]['variance'],errors='coerce')
efd_var = pd.to_numeric(data[data['disc']=="EFD"]['variance'],errors='coerce')
ffd_var = pd.to_numeric(data[data['disc']=="FFD"]['variance'],errors='coerce')
cm_var = pd.to_numeric(data[data['disc']=="ChiMerge"]['variance'],errors='coerce')
dt_var = pd.to_numeric(data[data['disc']=="DT"]['variance'],errors='coerce')

In [39]:
# Filter for CNB
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_var_cnb = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="CNB")]['variance'],errors='coerce')
efd_var_cnb = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="CNB")]['variance'],errors='coerce')
ffd_var_cnb = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="CNB")]['variance'],errors='coerce')
cm_var_cnb = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="CNB")]['variance'],errors='coerce')
dt_var_cnb = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="CNB")]['variance'],errors='coerce')

In [40]:
# Filter for ID3
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_var_id3 = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="ID3")]['variance'],errors='coerce')
efd_var_id3 = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="ID3")]['variance'],errors='coerce')
ffd_var_id3 = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="ID3")]['variance'],errors='coerce')
cm_var_id3 = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="ID3")]['variance'],errors='coerce')
dt_var_id3 = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="ID3")]['variance'],errors='coerce')

In [41]:
# Filter for KNN
# Obtain variance for each discretization, convert into numeric, string values will be return as NaN
ewd_var_knn = pd.to_numeric(data[(data['disc']=="EWD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce')
efd_var_knn = pd.to_numeric(data[(data['disc']=="EFD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce')
ffd_var_knn = pd.to_numeric(data[(data['disc']=="FFD") & (data['models']=="Knn-VDM")]['variance'],errors='coerce')
cm_var_knn = pd.to_numeric(data[(data['disc']=="ChiMerge") & (data['models']=="Knn-VDM")]['variance'],errors='coerce')
dt_var_knn = pd.to_numeric(data[(data['disc']=="DT") & (data['models']=="Knn-VDM")]['variance'],errors='coerce')

### 3.2. Welch t-test, variance
Implement 4 replications:
- Regardless algorithms
- Filter for each algorithm: CNB, ID3, Knn

#### 3.2.1 Welch t-test, variance, regardless algorithm

In [42]:
# WELCH VARIANCE
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_var, efd_var, ffd_var, cm_var, dt_var]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [43]:
# Result table
wt_result_var = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_var.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_var

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.272994,0.786492
1,ewd vs ffd,-1.449633,0.155235
2,ewd vs cm,0.685034,0.49768
3,ewd vs dt,-1.185021,0.245281
5,efd vs ffd,-1.249933,0.217791
6,efd vs cm,1.004596,0.320845
7,efd vs dt,-0.947528,0.349312
10,ffd vs cm,2.186553,0.033792
11,ffd vs dt,0.480751,0.633095
15,cm vs dt,-2.033086,0.048678


#### 3.2.2 Welch t-test, variance, only for CNB

In [44]:
# WELCH VARIANCE, CNB
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_var_cnb, efd_var_cnb, ffd_var_cnb, cm_var_cnb, dt_var_cnb]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [45]:
# Result table
wt_result_var_cnb = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_var_cnb.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_var_cnb

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-1.924417,0.089256
1,ewd vs ffd,-5.466387,0.000373
2,ewd vs cm,-0.950409,0.361084
3,ewd vs dt,-1.280163,0.256199
5,efd vs ffd,-3.31219,0.004416
6,efd vs cm,1.652428,0.133596
7,efd vs dt,-0.283074,0.784776
10,ffd vs cm,5.243872,0.00045
11,ffd vs dt,2.05764,0.067634
15,cm vs dt,-1.152338,0.300083


#### 3.2.3 Welch t-test, variance, only for ID3

In [46]:
# WELCH VARIANCE, ID3
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_var_id3, efd_var_id3, ffd_var_id3, cm_var_id3, dt_var_id3]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [47]:
# Result table
wt_result_var_id3 = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_var_id3.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_var_id3

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-0.659762,0.518853
1,ewd vs ffd,-0.419653,0.680207
2,ewd vs cm,0.5214,0.608166
3,ewd vs dt,-0.205142,0.841499
5,efd vs ffd,0.301564,0.766603
6,efd vs cm,1.176981,0.25374
7,efd vs dt,0.683334,0.509083
10,ffd vs cm,0.978933,0.339212
11,ffd vs dt,0.363073,0.721398
15,cm vs dt,-0.868741,0.400061


#### 3.2.4 Welch t-test, variance, only for Knn

In [48]:
# WELCH VARIANCE, knn
# COMPLETE PIPELINE
disc_key = ['ewd', 'efd', 'ffd', 'cm', 'dt']
disc_value = [ewd_var_knn, efd_var_knn, ffd_var_knn, cm_var_knn, dt_var_knn]
t_stat = []
p_value = []
disc_compare = []
# Create dictionary store discretization and series of accuracy
disc = {}
for key in disc_key:
    for value in disc_value:
        disc[key] = value
        disc_value.remove(value)
        break  
# Create loop for t_test
for i in disc_key:
    for j in disc_key:
        if i != j:
            disc_compare.append(f'{i} vs {j}')
            t_stat.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[0]))
            p_value.append((stats.ttest_ind(disc[i], disc[j], nan_policy='omit', equal_var=False)[1]))
# Convert 3 lists to dataframe
disc_compare = pd.DataFrame(disc_compare, columns=['disc_compare'])
t_stat = pd.DataFrame(t_stat, columns=['t_stat'])
p_value = pd.DataFrame(p_value, columns=['p_value'])

In [49]:
# Result table
wt_result_var_knn = pd.concat([disc_compare, t_stat, p_value], axis = 1)
# Drop duplicate
wt_result_var_knn.drop_duplicates(subset=['p_value'], inplace=True)
wt_result_var_knn

Unnamed: 0,disc_compare,t_stat,p_value
0,ewd vs efd,-1.174751,0.355935
1,ewd vs ffd,-1.44791,0.243254
2,ewd vs cm,0.823212,0.559
3,ewd vs dt,-3.163168,0.049237
5,efd vs ffd,-1.162247,0.322659
6,efd vs cm,1.349668,0.315226
7,efd vs dt,-2.004615,0.103633
10,ffd vs cm,1.634027,0.186512
11,ffd vs dt,0.374142,0.728997
15,cm vs dt,-2.848958,0.063856
