# Classification models

Dataset: iris
By: Sam
Update at: 28/06/2022

====

Summary:<br>
- Import unsupervised discretised datasets (already encoded categorical attributes)
- Split dataset: 75% training, 25% testing, seed = 30
- Perform 3 classification models: ID3, Naive Bayes, Knn-VDM
- Cross validation (accuracy): 10 folds, repeats: 3

In [18]:
import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

In [19]:
# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [20]:
# RIPPER (https://pypi.org/project/wittgenstein/) Only for binary
import wittgenstein as lw 

In [21]:
# For Naive Bayes
from sklearn.naive_bayes import CategoricalNB # Categorical Naive Bayes
from sklearn.naive_bayes import MultinomialNB # Multinominal Naive Bayes (suitable for NLP)
from mixed_naive_bayes import MixedNB # Mixed Naive Bayes for combination of both discrete & continuous feature

In [22]:
# For decision tree ID3 
# https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator # ID3 Decision Tree (https://pypi.org/project/decision-tree-id3/)
from id3 import export_graphviz

In [23]:
# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

In [24]:
# For model evaluation
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. EWD data

## 1.1 EWD, k = 4

In [26]:
# Complete code for data preperation
# Read data
df_ewd1 = pd.read_csv('iris_ewd1.csv')
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models - EWD, k=4

In [27]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.65      1.00      0.79        11
           2       1.00      0.60      0.75        15

    accuracy                           0.84        38
   macro avg       0.88      0.87      0.85        38
weighted avg       0.90      0.84      0.84        38

Time for training model ID3 - default, EWD, k = 4 is: 0.008698225021362305.


In [28]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.73      1.00      0.85        11
           2       1.00      0.73      0.85        15

    accuracy                           0.89        38
   macro avg       0.91      0.91      0.90        38
weighted avg       0.92      0.89      0.89        38

Time for training model Naive Bayes - default, EWD, k = 4 is: 0.007327079772949219.


In [29]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.71      0.91      0.80        11
           2       0.92      0.73      0.81        15

    accuracy                           0.87        38
   macro avg       0.88      0.88      0.87        38
weighted avg       0.88      0.87      0.87        38

Time for training model Knn-VDM, EWD, k = 4 is: 0.8173389434814453.


In [None]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

### Evaluation, EDW, k = 4

In [30]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [31]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.174
Average bias: 0.158
Average variance: 0.055
Sklearn 0-1 loss: 0.158


In [39]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 6 is out of bounds for axis 1 with size 6

In [32]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.107
Average bias: 0.105
Average variance: 0.035
Sklearn 0-1 loss: 0.132


## 1.2 EWD, k = 7

In [35]:
# Complete code for data preperation
# Read data
df_ewd2 = pd.read_csv('iris_ewd2.csv')
disc = 'EWD'
k = 7

df_ewd2.info()
data = df_ewd2.values
data.shape

features = df_ewd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models - EWD, k=7

In [36]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.60      0.82      0.69        11
           2       0.82      0.60      0.69        15

    accuracy                           0.79        38
   macro avg       0.81      0.81      0.79        38
weighted avg       0.81      0.79      0.79        38

Time for training model ID3 - default, EWD, k = 7 is: 0.010634899139404297.


In [40]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.79      1.00      0.88        11
           2       1.00      0.80      0.89        15

    accuracy                           0.92        38
   macro avg       0.93      0.93      0.92        38
weighted avg       0.94      0.92      0.92        38

Time for training model Naive Bayes - default, EWD, k = 7 is: 0.006596803665161133.


In [41]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.91      0.95        11
           2       0.94      1.00      0.97        15

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38

Time for training model Knn-VDM, EWD, k = 7 is: 0.8597049713134766.


In [42]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EWD, k = 7.
ID3: - Mean: 0.868889, Standard deviation: 0.105386
CNB: - Mean: 0.886667, Standard deviation: 0.092936
Knn-VDM: - Mean: 0.935556, Standard deviation: 0.053008


### Evaluation, EWD, k=7

In [43]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [44]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.162
Average bias: 0.158
Average variance: 0.054
Sklearn 0-1 loss: 0.211


In [46]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 6 is out of bounds for axis 1 with size 6

In [47]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.048
Average bias: 0.026
Average variance: 0.040
Sklearn 0-1 loss: 0.026


## 1.3 EWD, k = 10

In [48]:
# Complete code for data preperation
# Read data
df_ewd3 = pd.read_csv('iris_ewd3.csv')
disc = 'EWD'
k = 10

df_ewd3.info()
data = df_ewd3.values
data.shape

features = df_ewd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, EWD, k=10

In [49]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model ID3 - default, EWD, k = 10 is: 0.01064300537109375.


In [50]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Naive Bayes - default, EWD, k = 10 is: 0.006440877914428711.


In [51]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Knn-VDM, EWD, k = 10 is: 0.8562920093536377.


In [52]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EWD, k = 10.
ID3: - Mean: 0.955556, Standard deviation: 0.055333
CNB: - Mean: nan, Standard deviation: nan
Knn-VDM: - Mean: 0.966667, Standard deviation: 0.047920


### Evaluation, EWD, k=10

In [53]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [54]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.066
Average bias: 0.053
Average variance: 0.014
Sklearn 0-1 loss: 0.053


In [55]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 8 is out of bounds for axis 1 with size 8

In [56]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.057
Average bias: 0.053
Average variance: 0.004
Sklearn 0-1 loss: 0.053


# 2. EFD datasets

## 2.1 EFD, k = 4

In [57]:
# Complete code for data preperation
# Read data
df_efd1 = pd.read_csv('iris_efd1.csv')
disc = 'EFD'
k = 4

df_efd1.info()
data = df_efd1.values
data.shape

features = df_efd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, EFD, k=4

In [58]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.65      1.00      0.79        11
           2       1.00      0.60      0.75        15

    accuracy                           0.84        38
   macro avg       0.88      0.87      0.85        38
weighted avg       0.90      0.84      0.84        38

Time for training model ID3 - default, EFD, k = 4 is: 0.009439945220947266.


In [59]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.73      1.00      0.85        11
           2       1.00      0.80      0.89        15

    accuracy                           0.89        38
   macro avg       0.91      0.91      0.90        38
weighted avg       0.92      0.89      0.90        38

Time for training model Naive Bayes - default, EFD, k = 4 is: 0.0066530704498291016.


In [60]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.79      1.00      0.88        11
           2       1.00      0.87      0.93        15

    accuracy                           0.92        38
   macro avg       0.93      0.93      0.92        38
weighted avg       0.94      0.92      0.92        38

Time for training model Knn-VDM, EFD, k = 4 is: 0.8460838794708252.


In [61]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 4.
ID3: - Mean: 0.880000, Standard deviation: 0.094907
CNB: - Mean: 0.866667, Standard deviation: 0.091084
Knn-VDM: - Mean: 0.926667, Standard deviation: 0.075719


### Evaluation, EFD, k=4

In [62]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [63]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.178
Average bias: 0.158
Average variance: 0.049
Sklearn 0-1 loss: 0.158


In [64]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.100
Average bias: 0.105
Average variance: 0.022
Sklearn 0-1 loss: 0.105


In [65]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.095
Average bias: 0.105
Average variance: 0.071
Sklearn 0-1 loss: 0.079


## 2.2 EFD, k = 7 (iris_efd2)

In [66]:
# Complete code for data preperation
# Read data
df_efd2 = pd.read_csv('iris_efd2.csv')
disc = 'EFD'
k = 7

df_efd2.info()
data = df_efd2.values
data.shape

features = df_efd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, EFD, k=7

In [67]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.85      1.00      0.92        11
           2       1.00      0.87      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.96      0.95        38
weighted avg       0.96      0.95      0.95        38

Time for training model ID3 - default, EFD, k = 7 is: 0.010675907135009766.


In [68]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.85      1.00      0.92        11
           2       1.00      0.87      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.96      0.95        38
weighted avg       0.96      0.95      0.95        38

Time for training model Naive Bayes - default, EFD, k = 7 is: 0.007373809814453125.


In [69]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.82      0.90        11
           2       0.88      1.00      0.94        15

    accuracy                           0.95        38
   macro avg       0.96      0.94      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Knn-VDM, EFD, k = 7 is: 0.8499641418457031.


In [70]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 7.
ID3: - Mean: 0.920000, Standard deviation: 0.065320
CNB: - Mean: 0.904444, Standard deviation: 0.058836
Knn-VDM: - Mean: 0.962222, Standard deviation: 0.047713


### Evaluation, EFD, k=7 

In [71]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [72]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.108
Average bias: 0.053
Average variance: 0.070
Sklearn 0-1 loss: 0.053


In [73]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.066
Average bias: 0.053
Average variance: 0.025
Sklearn 0-1 loss: 0.053


In [74]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.071
Average bias: 0.053
Average variance: 0.044
Sklearn 0-1 loss: 0.053


## 2.3 EFD, k =10 (iris_efd3)

In [75]:
# Complete code for data preperation
# Read data
df_efd3 = pd.read_csv('iris_efd3.csv')
disc = 'EFD'
k = 10

df_efd3.info()
data = df_efd3.values
data.shape

features = df_efd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, EFD, k=10

In [76]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        12
           1       0.79      1.00      0.88        11
           2       1.00      0.87      0.93        15

    accuracy                           0.92        38
   macro avg       0.93      0.93      0.92        38
weighted avg       0.94      0.92      0.92        38

Time for training model ID3 - default, EFD, k = 10 is: 0.011312246322631836.


In [77]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Naive Bayes - default, EFD, k = 10 is: 0.008018016815185547.


In [78]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Knn-VDM, EFD, k = 10 is: 1.0276598930358887.


In [79]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, k = {k}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, EFD, k = 10.
ID3: - Mean: 0.913333, Standard deviation: 0.073333
CNB: - Mean: 0.928889, Standard deviation: 0.061904
Knn-VDM: - Mean: 0.944444, Standard deviation: 0.054659


### Evaluation, EFD, k=10

In [80]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [81]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.110
Average bias: 0.053
Average variance: 0.087
Sklearn 0-1 loss: 0.079


In [82]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.068
Average bias: 0.053
Average variance: 0.046
Sklearn 0-1 loss: 0.053


In [83]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.052
Average bias: 0.053
Average variance: 0.037
Sklearn 0-1 loss: 0.053


# 3. FFD datasets

## 3.1 FFD, m =10 (iris_ffd1)

In [84]:
# Complete code for data preperation
# Read data
df_ffd1 = pd.read_csv('iris_ffd1.csv')
disc = 'FFD'
m = 10

df_ffd1.info()
data = df_ffd1.values
data.shape

features = df_ffd1.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, FFD, m=10

In [85]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model ID3 - default, FFD, m = 10 is: 0.012059926986694336.


In [86]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.91      0.95        11
           2       0.94      1.00      0.97        15

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38

Time for training model Naive Bayes - default, FFD, m = 10 is: 0.006140232086181641.


In [87]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.91      0.95        11
           2       0.94      1.00      0.97        15

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38

Time for training model Knn-VDM, FFD, m = 10 is: 0.917590856552124.


In [88]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 10.
ID3: - Mean: 0.948889, Standard deviation: 0.063674
CNB: - Mean: 0.935556, Standard deviation: 0.058331
Knn-VDM: - Mean: 0.980000, Standard deviation: 0.035066


### Evaluation, FFD, m=10

In [89]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [90]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.084
Average bias: 0.053
Average variance: 0.032
Sklearn 0-1 loss: 0.053


In [91]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 12 is out of bounds for axis 1 with size 12

In [92]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.037
Average bias: 0.026
Average variance: 0.011
Sklearn 0-1 loss: 0.026


## 3.2 FFD, m = 30 (iris_ffd2)

In [93]:
# Complete code for data preperation
# Read data
df_ffd2 = pd.read_csv('iris_ffd2.csv')
disc = 'FFD'
m = 30

df_ffd2.info()
data = df_ffd2.values
data.shape

features = df_ffd2.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, FFD, m=30

In [94]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.92      1.00      0.96        11
           2       1.00      0.93      0.97        15

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38

Time for training model ID3 - default, FFD, m = 30 is: 0.013934850692749023.


In [95]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.90      0.82      0.86        11
           2       0.93      0.93      0.93        15

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38

Time for training model Naive Bayes - default, FFD, m = 30 is: 0.0066907405853271484.


In [96]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      0.91      0.91        11
           2       0.93      0.93      0.93        15

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

Time for training model Knn-VDM, FFD, m = 30 is: 0.9262831211090088.


In [97]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 30.
ID3: - Mean: 0.913333, Standard deviation: 0.066999
CNB: - Mean: 0.917778, Standard deviation: 0.053564
Knn-VDM: - Mean: 0.933333, Standard deviation: 0.068853


### Evaluation, FFD, m=30

In [98]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [100]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.096
Average bias: 0.053
Average variance: 0.082
Sklearn 0-1 loss: 0.026


In [101]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.122
Average bias: 0.079
Average variance: 0.061
Sklearn 0-1 loss: 0.079


In [102]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.081
Average bias: 0.053
Average variance: 0.040
Sklearn 0-1 loss: 0.053


## 3.3 FFD, m = 60 (iris_ffd3)

In [103]:
# Complete code for data preperation
# Read data
df_ffd3 = pd.read_csv('iris_ffd3.csv')
disc = 'FFD'
m = 60

df_ffd3.info()
data = df_ffd3.values
data.shape

features = df_ffd3.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, FFD, m= 60

In [104]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.75      0.82      0.78        11
           2       0.88      1.00      0.94        15

    accuracy                           0.87        38
   macro avg       0.88      0.86      0.86        38
weighted avg       0.88      0.87      0.87        38

Time for training model ID3 - default, FFD, m = 60 is: 0.010174036026000977.


In [105]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.67      1.00      0.80        12
           1       0.75      0.27      0.40        11
           2       0.88      0.93      0.90        15

    accuracy                           0.76        38
   macro avg       0.76      0.74      0.70        38
weighted avg       0.77      0.76      0.72        38

Time for training model Naive Bayes - default, FFD, m = 60 is: 0.008380889892578125.


In [106]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.69      0.82      0.75        11
           2       0.88      0.93      0.90        15

    accuracy                           0.84        38
   macro avg       0.86      0.83      0.84        38
weighted avg       0.86      0.84      0.84        38

Time for training model Knn-VDM, FFD, m = 60 is: 1.1547188758850098.


In [107]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 60.
ID3: - Mean: 0.813333, Standard deviation: 0.103852
CNB: - Mean: 0.711111, Standard deviation: 0.093227
Knn-VDM: - Mean: 0.528889, Standard deviation: 0.110129


### Evaluation, FFD, m=60

In [108]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [109]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.172
Average bias: 0.132
Average variance: 0.059
Sklearn 0-1 loss: 0.132


In [110]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.213
Average bias: 0.237
Average variance: 0.059
Sklearn 0-1 loss: 0.237


In [111]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.260
Average bias: 0.132
Average variance: 0.189
Sklearn 0-1 loss: 0.158


## 3.4 FFD, m = 100 (iris_ffd4)

In [112]:
# Complete code for data preperation
# Read data
df_ffd4 = pd.read_csv('iris_ffd4.csv')
disc = 'FFD'
m = 100

df_ffd4.info()
data = df_ffd4.values
data.shape

features = df_ffd4.drop('label', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 30)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   slength  150 non-null    int64
 1   swidth   150 non-null    int64
 2   plength  150 non-null    int64
 3   pwidth   150 non-null    int64
 4   label    150 non-null    int64
dtypes: int64(5)
memory usage: 6.0 KB
(150, 4) (150,)
Class representation - original:  Counter({0: 50, 1: 50, 2: 50})
Class representation - training data:  Counter({1: 39, 0: 38, 2: 35})
Class representation - testing data:  Counter({2: 15, 0: 12, 1: 11})


### Models, FFD, m=100

In [113]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.75      0.82      0.78        11
           2       0.88      1.00      0.94        15

    accuracy                           0.87        38
   macro avg       0.88      0.86      0.86        38
weighted avg       0.88      0.87      0.87        38

Time for training model ID3 - default, FFD, m = 100 is: 0.01341700553894043.


In [114]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.67      1.00      0.80        12
           1       0.75      0.27      0.40        11
           2       0.88      0.93      0.90        15

    accuracy                           0.76        38
   macro avg       0.76      0.74      0.70        38
weighted avg       0.77      0.76      0.72        38

Time for training model Naive Bayes - default, FFD, m = 100 is: 0.008496999740600586.


In [115]:
# Knn-VDM complete code
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train, y_train, continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train, y_train)
# Testing
y_pred_knn = knn_vdm.predict(x_test)
knn_vdm.classes_
print(classification_report(y_test, y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.75      0.86        12
           1       0.69      0.82      0.75        11
           2       0.88      0.93      0.90        15

    accuracy                           0.84        38
   macro avg       0.86      0.83      0.84        38
weighted avg       0.86      0.84      0.84        38

Time for training model Knn-VDM, FFD, m = 100 is: 1.1199231147766113.


In [116]:
# CROSS VALIDATION
import warnings
warnings.filterwarnings('ignore')

# param
num_folds = 10
num_repeat = 3
seed = 7
scores = 'accuracy'

print(f'Cross validation result, {scores}, {disc}, m = {m}.')

# Create list of algorithms
models = []
models.append(('ID3', Id3Estimator()))
#models.append(('RIPPER', lw.RIPPER()))
models.append(('CNB', CategoricalNB()))
models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
  #kfold = KFold(n_splits=num_folds, shuffle = True, random_state=10)
    kfold = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeat, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scores)
    results.append(cv_results)
    names.append(name)
    msg = '%s: - Mean: %f, Standard deviation: %f' % (name, cv_results.mean(), cv_results.std())
    print(msg)

Cross validation result, accuracy, FFD, m = 100.
ID3: - Mean: 0.813333, Standard deviation: 0.103852
CNB: - Mean: 0.711111, Standard deviation: 0.093227
Knn-VDM: - Mean: 0.528889, Standard deviation: 0.110129


### Evaluation, FFD, m=100

In [117]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [118]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.172
Average bias: 0.132
Average variance: 0.059
Sklearn 0-1 loss: 0.132


In [119]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.213
Average bias: 0.237
Average variance: 0.059
Sklearn 0-1 loss: 0.237


In [120]:
# Knn
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

Average expected loss: 0.260
Average bias: 0.132
Average variance: 0.189
Sklearn 0-1 loss: 0.158
