# Classification models

Dataset: pendigits<br>
By: Sam<br>
Update at: 29/06/2022<br>

====

Summary:<br>
- Import unsupervised discretised datasets (already encoded categorical attributes)
- Split dataset: 75% training, 15% validation, 10% testing, seed = 30
- Perform classification models: ID3, Naive Bayes. Knn-VDM is not applicable for the whole dataset due to long time training (brute force algorithm)
- No cross validation
- Evaluation performance metrics: Accuracy, F1-score (average/ macro), Bias, Variance

### About Dataset

pendigits.tra: Training	7494<br>
pendigits.tes: Testing	3498<br>
The way we used the dataset was to use first half of training for  actual training, one-fourth for validation and one-fourth for writer-dependent testing. The test set was used for writer-independent testing and is the actual quality measure.<br>
Number of Attributes: 16 input + 1 class attribute (10 classes from 0-9)<br>
The input vector size is 2xT, two times the number of points resampled. We considered spatial resampling to T=8,12,16 points in our experiments and found that T=8 gave the best trade-off between accuracy and complexity.<br>
All attributes are numeric.<br>
No missing value, balanced class<br>

In [1]:
import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

In [2]:
# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# RIPPER (https://pypi.org/project/wittgenstein/) Only for binary
import wittgenstein as lw 

In [4]:
# For Naive Bayes
from sklearn.naive_bayes import CategoricalNB # Categorical Naive Bayes
from sklearn.naive_bayes import MultinomialNB # Multinominal Naive Bayes (suitable for NLP)
from mixed_naive_bayes import MixedNB # Mixed Naive Bayes for combination of both discrete & continuous feature

In [5]:
# For decision tree ID3 
# https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator # ID3 Decision Tree (https://pypi.org/project/decision-tree-id3/)
from id3 import export_graphviz

In [6]:
# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# For model evaluation
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

# EWD data

## EWD, k = 4

In [9]:
# Complete code for data preperation
# Read data
df_ewd1 = pd.read_csv('pendigits_ewd1.csv')
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ewd1['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EWD, k=4

In [12]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       0.98      0.98      0.98       114
           1       0.95      0.82      0.88       115
           2       0.86      0.99      0.92       114
           3       0.90      0.97      0.94       106
           4       0.98      0.95      0.96       114
           5       0.93      0.95      0.94       105
           6       0.98      0.98      0.98       106
           7       0.96      0.94      0.95       114
           8       0.97      0.97      0.97       106
           9       0.94      0.91      0.92       106

    accuracy                           0.95      1100
   macro avg       0.95      0.95      0.95      1100
weighted avg       0.95      0.95      0.95      1100

Time for training model ID3 - default, EWD, k = 4 is: 0.637890100479126.


In [13]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       114
           1       0.70      0.68      0.69       115
           2       0.80      0.93      0.86       114
           3       0.84      0.96      0.89       106
           4       0.99      0.94      0.96       114
           5       0.85      0.57      0.68       105
           6       0.94      0.99      0.96       106
           7       0.97      0.88      0.92       114
           8       0.86      0.82      0.84       106
           9       0.76      0.92      0.83       106

    accuracy                           0.86      1100
   macro avg       0.86      0.86      0.86      1100
weighted avg       0.86      0.86      0.86      1100

Time for training model Naive Bayes - default, EWD, k = 4 is: 0.02654099464416504.


In [10]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        50
           1       0.85      0.79      0.81        42
           2       0.86      0.96      0.91        46
           3       0.83      0.98      0.90        50
           4       0.98      0.81      0.89        58
           5       0.81      0.88      0.85        50
           6       0.84      1.00      0.91        46
           7       0.89      0.92      0.91        53
           8       1.00      0.84      0.91        44
           9       0.92      0.80      0.86        61

    accuracy                           0.89       500
   macro avg       0.90      0.89      0.89       500
weighted avg       0.90      0.89      0.89       500

Time for training model Knn-VDM, EWD, k = 4 is: 481.4514729976654.


In [18]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EWD, k = 4.
ID3, validation accuracy:, 0.9526699029126213
CNB, validation accuracy:, 0.8549757281553398


### Evaluation, EWD, k=4

In [11]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [20]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.066
Average bias: 0.050
Average variance: 0.046
Sklearn 0-1 loss: 0.055


In [21]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.141
Average bias: 0.140
Average variance: 0.012
Sklearn 0-1 loss: 0.139


In [12]:
# Knn - SAMPLE 500
import time
start = time.time() # For measuring time execution

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
knn_vdm, x_train[:500,:], y_train[:500], x_test[:500,:], y_test[:500],
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

end = time.time()
print(f'Time for evaluation model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyboardInterrupt: 

## EWD, k = 7

In [13]:
# Complete code for data preperation
# Read data
df_ewd2 = pd.read_csv('pendigits_ewd2.csv')
disc = 'EWD'
k = 7

df_ewd2.info()
data = df_ewd2.values
data.shape

features = df_ewd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ewd2['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EWD, k=7

In [24]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.')


              precision    recall  f1-score   support

           0       0.99      0.97      0.98       114
           1       0.95      0.90      0.93       115
           2       0.92      0.96      0.94       114
           3       0.94      0.95      0.94       106
           4       0.97      0.99      0.98       114
           5       0.94      0.94      0.94       105
           6       0.98      0.99      0.99       106
           7       0.97      0.96      0.97       114
           8       0.99      0.97      0.98       106
           9       0.93      0.93      0.93       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, EWD, k = 7 is: 0.6859009265899658.


In [25]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.')

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       114
           1       0.74      0.69      0.71       115
           2       0.80      0.94      0.86       114
           3       0.88      0.95      0.91       106
           4       1.00      0.96      0.98       114
           5       0.90      0.58      0.71       105
           6       0.97      0.98      0.98       106
           7       0.96      0.89      0.92       114
           8       0.87      0.82      0.84       106
           9       0.75      0.97      0.84       106

    accuracy                           0.87      1100
   macro avg       0.88      0.87      0.87      1100
weighted avg       0.88      0.87      0.87      1100

Time for training model Naive Bayes - default, EWD, k = 7 is: 0.018597841262817383.


In [14]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.84      0.76      0.80        42
           2       0.85      0.96      0.90        46
           3       0.84      0.96      0.90        50
           4       0.98      0.93      0.96        58
           5       0.91      0.82      0.86        50
           6       0.94      0.98      0.96        46
           7       0.87      0.98      0.92        53
           8       1.00      0.89      0.94        44
           9       0.91      0.84      0.87        61

    accuracy                           0.91       500
   macro avg       0.91      0.91      0.91       500
weighted avg       0.92      0.91      0.91       500

Time for training model Knn-VDM, EWD, k = 7 is: 482.4279270172119.


In [27]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
#models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EWD, k = 7.
ID3, validation accuracy:, 0.9672330097087378
CNB, validation accuracy:, 0.8713592233009708


### Evaluation, EWD, k= 7

In [28]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [29]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.052
Average bias: 0.021
Average variance: 0.044
Sklearn 0-1 loss: 0.042


In [30]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.129
Average bias: 0.129
Average variance: 0.011
Sklearn 0-1 loss: 0.129


In [31]:
# # Knn - SAMPLE 500 - WARNING: LONG TIME (>> 1 HOURS)
# import time
# start = time.time() # For measuring time execution

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train[:500,:], y_train[:500], x_test[:500,:], y_test[:500],
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# end = time.time()
# print(f'Time for evaluation model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

## EWD, k = 10

In [17]:
# Complete code for data preperation
# Read data
df_ewd3 = pd.read_csv('pendigits_ewd3.csv')
disc = 'EWD'
k = 10

df_ewd3.info()
data = df_ewd3.values
data.shape

features = df_ewd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ewd3['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EWD, k=10

In [33]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.')


              precision    recall  f1-score   support

           0       0.98      0.98      0.98       114
           1       0.82      0.88      0.85       115
           2       0.92      0.89      0.90       114
           3       0.96      0.94      0.95       106
           4       0.97      0.96      0.96       114
           5       0.94      0.94      0.94       105
           6       0.98      0.98      0.98       106
           7       0.95      0.97      0.96       114
           8       0.97      0.96      0.97       106
           9       0.92      0.91      0.91       106

    accuracy                           0.94      1100
   macro avg       0.94      0.94      0.94      1100
weighted avg       0.94      0.94      0.94      1100

Time for training model ID3 - default, EWD, k = 10 is: 0.7593410015106201.


In [34]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.')

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       114
           1       0.75      0.69      0.71       115
           2       0.81      0.93      0.87       114
           3       0.88      0.98      0.93       106
           4       1.00      0.96      0.98       114
           5       0.88      0.58      0.70       105
           6       0.98      0.98      0.98       106
           7       0.96      0.89      0.92       114
           8       0.85      0.82      0.84       106
           9       0.74      0.97      0.84       106

    accuracy                           0.87      1100
   macro avg       0.88      0.87      0.87      1100
weighted avg       0.88      0.87      0.87      1100

Time for training model Naive Bayes - default, EWD, k = 10 is: 0.017505884170532227.


In [18]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyError: 1.0

In [35]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EWD, k = 10.
ID3, validation accuracy:, 0.9641990291262136
CNB, validation accuracy:, 0.8774271844660194


### Evaluation, EWD, k= 10

In [36]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [37]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.052
Average bias: 0.023
Average variance: 0.044
Sklearn 0-1 loss: 0.059


In [38]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.127
Average bias: 0.126
Average variance: 0.014
Sklearn 0-1 loss: 0.127


In [39]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# EFD

## EFD, k = 4

In [19]:
# Complete code for data preperation
# Read data
df_efd1 = pd.read_csv('pendigits_efd1.csv')
disc = 'EFD'
k = 4

df_efd1.info()
data = df_efd1.values
data.shape

features = df_efd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_efd1['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EFD, k=4

In [41]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.')


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       114
           1       0.91      0.90      0.90       115
           2       0.92      0.96      0.94       114
           3       0.93      0.96      0.94       106
           4       0.97      0.96      0.97       114
           5       0.95      0.90      0.92       105
           6       0.98      1.00      0.99       106
           7       0.96      0.94      0.95       114
           8       0.96      0.95      0.96       106
           9       0.93      0.94      0.93       106

    accuracy                           0.95      1100
   macro avg       0.95      0.95      0.95      1100
weighted avg       0.95      0.95      0.95      1100

Time for training model ID3 - default, EFD, k = 4 is: 0.6160540580749512.


In [42]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.')

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       114
           1       0.71      0.69      0.70       115
           2       0.81      0.89      0.85       114
           3       0.88      0.95      0.91       106
           4       0.98      0.96      0.97       114
           5       0.84      0.55      0.67       105
           6       0.99      0.95      0.97       106
           7       0.95      0.84      0.89       114
           8       0.81      0.77      0.79       106
           9       0.72      0.96      0.82       106

    accuracy                           0.85      1100
   macro avg       0.86      0.85      0.85      1100
weighted avg       0.86      0.85      0.85      1100

Time for training model Naive Bayes - default, EFD, k = 4 is: 0.020608901977539062.


In [20]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        50
           1       0.69      0.88      0.77        42
           2       0.93      0.91      0.92        46
           3       0.94      0.98      0.96        50
           4       0.98      0.93      0.96        58
           5       0.94      0.88      0.91        50
           6       0.92      1.00      0.96        46
           7       0.93      0.94      0.93        53
           8       0.97      0.84      0.90        44
           9       0.95      0.89      0.92        61

    accuracy                           0.92       500
   macro avg       0.92      0.92      0.92       500
weighted avg       0.93      0.92      0.92       500

Time for training model Knn-VDM, EFD, k = 4 is: 485.34148502349854.


In [43]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EFD, k = 4.
ID3, validation accuracy:, 0.9550970873786407
CNB, validation accuracy:, 0.8555825242718447


### Evaluation, EFD, k=4

In [46]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [47]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.062
Average bias: 0.026
Average variance: 0.051
Sklearn 0-1 loss: 0.050


In [48]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.150
Average bias: 0.147
Average variance: 0.015
Sklearn 0-1 loss: 0.149


In [49]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## EFD, k = 7

In [21]:
# Complete code for data preperation
# Read data
df_efd2 = pd.read_csv('pendigits_efd2.csv')
disc = 'EFD'
k = 7

df_efd2.info()
data = df_efd2.values
data.shape

features = df_efd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_efd2['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EFD, k=7

In [51]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.')


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       114
           1       0.91      0.92      0.92       115
           2       0.97      0.95      0.96       114
           3       0.96      0.96      0.96       106
           4       0.98      0.98      0.98       114
           5       0.94      0.94      0.94       105
           6       1.00      1.00      1.00       106
           7       0.93      0.97      0.95       114
           8       0.99      0.95      0.97       106
           9       0.93      0.93      0.93       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, EFD, k = 7 is: 0.6947641372680664.


In [52]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.')

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       114
           1       0.75      0.70      0.72       115
           2       0.81      0.91      0.86       114
           3       0.84      0.95      0.89       106
           4       1.00      0.96      0.98       114
           5       0.88      0.58      0.70       105
           6       0.98      0.98      0.98       106
           7       0.93      0.88      0.90       114
           8       0.86      0.81      0.83       106
           9       0.73      0.95      0.83       106

    accuracy                           0.87      1100
   macro avg       0.87      0.87      0.86      1100
weighted avg       0.87      0.87      0.86      1100

Time for training model Naive Bayes - default, EFD, k = 7 is: 0.01694202423095703.


In [22]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        50
           1       0.80      0.93      0.86        42
           2       0.95      0.91      0.93        46
           3       0.86      1.00      0.93        50
           4       0.95      0.90      0.92        58
           5       0.98      0.86      0.91        50
           6       0.94      0.96      0.95        46
           7       0.89      0.96      0.93        53
           8       0.98      0.91      0.94        44
           9       0.95      0.87      0.91        61

    accuracy                           0.92       500
   macro avg       0.93      0.93      0.92       500
weighted avg       0.93      0.92      0.92       500

Time for training model Knn-VDM, EFD, k = 7 is: 481.2034821510315.


In [54]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EFD, k = 7.
ID3, validation accuracy:, 0.9557038834951457
CNB, validation accuracy:, 0.8756067961165048


### Evaluation, EFD, k=7

In [55]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [56]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.057
Average bias: 0.022
Average variance: 0.047
Sklearn 0-1 loss: 0.040


In [57]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.132
Average bias: 0.134
Average variance: 0.011
Sklearn 0-1 loss: 0.134


In [58]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## EFD, k =10

In [23]:
# Complete code for data preperation
# Read data
df_efd3 = pd.read_csv('pendigits_efd3.csv')
disc = 'EFD'
k = 10

df_efd3.info()
data = df_efd3.values
data.shape

features = df_efd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_efd3['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, EFD, k=10

In [60]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.')


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       114
           1       0.93      0.90      0.91       115
           2       0.92      0.96      0.94       114
           3       0.93      0.96      0.94       106
           4       0.98      0.98      0.98       114
           5       0.96      0.93      0.95       105
           6       0.98      0.97      0.98       106
           7       0.95      0.99      0.97       114
           8       0.98      0.94      0.96       106
           9       0.94      0.93      0.94       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, EFD, k = 10 is: 0.780156135559082.


In [61]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       114
           1       0.77      0.71      0.74       115
           2       0.82      0.93      0.87       114
           3       0.87      0.96      0.91       106
           4       0.99      0.97      0.98       114
           5       0.91      0.61      0.73       105
           6       0.97      0.99      0.98       106
           7       0.96      0.88      0.92       114
           8       0.90      0.85      0.87       106
           9       0.75      1.00      0.85       106

    accuracy                           0.88      1100
   macro avg       0.89      0.88      0.88      1100
weighted avg       0.89      0.88      0.88      1100

Time for training model Naive Bayes - default, EFD, k = 10 is: 0.018495798110961914.


In [24]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       0.78      0.93      0.85        42
           2       0.93      0.85      0.89        46
           3       0.86      0.98      0.92        50
           4       0.96      0.91      0.94        58
           5       0.96      0.88      0.92        50
           6       0.94      0.96      0.95        46
           7       0.88      0.96      0.92        53
           8       0.97      0.86      0.92        44
           9       0.95      0.89      0.92        61

    accuracy                           0.92       500
   macro avg       0.92      0.92      0.92       500
weighted avg       0.93      0.92      0.92       500

Time for training model Knn-VDM, EFD, k = 10 is: 496.08610677719116.


In [65]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, EFD, k = 10.
ID3, validation accuracy:, 0.9587378640776699
CNB, validation accuracy:, 0.8786407766990292


### Evaluation, EFD, k= 10

In [66]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [67]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.056
Average bias: 0.024
Average variance: 0.048
Sklearn 0-1 loss: 0.044


In [68]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.123
Average bias: 0.117
Average variance: 0.016
Sklearn 0-1 loss: 0.117


In [None]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

In [69]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# FFD

## FFD, m =10

In [25]:
# Complete code for data preperation
# Read data
df_ffd1 = pd.read_csv('pendigits_ffd1.csv')
disc = 'FFD'
m = 10

df_ffd1.info()
data = df_ffd1.values
data.shape

features = df_ffd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ffd1['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, FFD, m=10

In [71]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      0.99      1.00       114
           1       0.90      0.93      0.91       115
           2       0.96      0.92      0.94       114
           3       0.93      0.94      0.94       106
           4       0.99      0.98      0.99       114
           5       0.97      0.96      0.97       105
           6       0.97      0.98      0.98       106
           7       0.93      0.97      0.95       114
           8       0.99      0.97      0.98       106
           9       0.94      0.93      0.94       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, FFD, m = 10 is: 2.4367880821228027.


In [72]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       114
           1       0.77      0.70      0.74       115
           2       0.80      0.91      0.85       114
           3       0.82      0.97      0.89       106
           4       1.00      0.96      0.98       114
           5       0.96      0.63      0.76       105
           6       0.95      0.98      0.96       106
           7       0.93      0.88      0.90       114
           8       0.88      0.72      0.79       106
           9       0.75      0.94      0.83       106

    accuracy                           0.86      1100
   macro avg       0.87      0.86      0.86      1100
weighted avg       0.87      0.86      0.86      1100

Time for training model Naive Bayes - default, FFD, m = 10 is: 0.01919388771057129.


In [26]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 76.0

In [74]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, FFD, k = 10.
ID3, validation accuracy:, 0.9629854368932039
CNB, validation accuracy:, 0.8671116504854369


### Evaluation, FFD, m= 10

In [75]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [76]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.052
Average bias: 0.017
Average variance: 0.048
Sklearn 0-1 loss: 0.041


In [77]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.146
Average bias: 0.140
Average variance: 0.043
Sklearn 0-1 loss: 0.138


In [78]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 30, pendigits_ffd2

In [27]:
# Complete code for data preperation
# Read data
df_ffd2 = pd.read_csv('pendigits_ffd2.csv')
disc = 'FFD'
m = 30

df_ffd2.info()
data = df_ffd2.values
data.shape

features = df_ffd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ffd2['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, FFD, m=30

In [80]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      0.99      1.00       114
           1       0.89      0.93      0.91       115
           2       0.96      0.91      0.94       114
           3       0.93      0.94      0.94       106
           4       0.99      0.98      0.99       114
           5       0.97      0.96      0.97       105
           6       0.97      0.98      0.98       106
           7       0.93      0.97      0.95       114
           8       0.99      0.97      0.98       106
           9       0.94      0.93      0.94       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, FFD, m = 30 is: 2.528325080871582.


In [81]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       114
           1       0.76      0.70      0.73       115
           2       0.80      0.91      0.85       114
           3       0.83      0.97      0.90       106
           4       1.00      0.96      0.98       114
           5       0.94      0.63      0.75       105
           6       0.95      0.98      0.96       106
           7       0.93      0.88      0.90       114
           8       0.88      0.71      0.79       106
           9       0.75      0.94      0.83       106

    accuracy                           0.86      1100
   macro avg       0.87      0.86      0.86      1100
weighted avg       0.87      0.86      0.86      1100

Time for training model Naive Bayes - default, FFD, m = 30 is: 0.021018028259277344.


In [29]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 70.0

In [83]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
#models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, FFD, k = 10.
ID3, validation accuracy:, 0.9635922330097088
CNB, validation accuracy:, 0.8695388349514563


### Evaluation, FFD, m= 30

In [84]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [85]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.052
Average bias: 0.016
Average variance: 0.048
Sklearn 0-1 loss: 0.042


In [86]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.146
Average bias: 0.140
Average variance: 0.042
Sklearn 0-1 loss: 0.139


In [87]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 60, pendigits_ffd3

In [30]:
# Complete code for data preperation
# Read data
df_ffd3 = pd.read_csv('pendigits_ffd3.csv')
disc = 'FFD'
m = 60

df_ffd3.info()
data = df_ffd3.values
data.shape

features = df_ffd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ffd3['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, FFD, m=60

In [90]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           0       1.00      0.99      1.00       114
           1       0.89      0.93      0.91       115
           2       0.96      0.91      0.94       114
           3       0.93      0.94      0.94       106
           4       0.99      0.98      0.99       114
           5       0.97      0.96      0.97       105
           6       0.97      0.98      0.98       106
           7       0.93      0.97      0.95       114
           8       0.99      0.97      0.98       106
           9       0.94      0.93      0.94       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, FFD, m = 60 is: 2.300124168395996.


In [91]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       114
           1       0.76      0.71      0.74       115
           2       0.81      0.91      0.86       114
           3       0.84      0.97      0.90       106
           4       1.00      0.96      0.98       114
           5       0.94      0.64      0.76       105
           6       0.95      0.98      0.97       106
           7       0.93      0.88      0.90       114
           8       0.88      0.70      0.78       106
           9       0.74      0.94      0.83       106

    accuracy                           0.86      1100
   macro avg       0.87      0.86      0.86      1100
weighted avg       0.87      0.86      0.86      1100

Time for training model Naive Bayes - default, FFD, m = 60 is: 0.019541025161743164.


In [31]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 66.0

In [93]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, FFD, k = 10.
ID3, validation accuracy:, 0.9635922330097088
CNB, validation accuracy:, 0.8725728155339806


### Evaluation, FFD, m=60

In [94]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [95]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.053
Average bias: 0.017
Average variance: 0.048
Sklearn 0-1 loss: 0.042


In [96]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.142
Average bias: 0.140
Average variance: 0.040
Sklearn 0-1 loss: 0.138


In [97]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 100 (pendigits_ffd4)

In [32]:
# Complete code for data preperation
# Read data
df_ffd4 = pd.read_csv('pendigits_ffd4.csv')
disc = 'FFD'
m = 100

df_ffd4.info()
data = df_ffd4.values
data.shape

features = df_ffd4.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split data:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=1 - train_ratio, random_state = 30, stratify = Y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 30, stratify = y_test)

# Check representation of class
print('Class representation - original: ', Counter(df_ffd4['class'])) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ',Counter(y_test)) 
print('Class representation - validation data: ',Counter(y_val))

print(x_train.shape, x_val.shape, x_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10992 entries, 0 to 10991
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      10992 non-null  int64
 1   A2      10992 non-null  int64
 2   A3      10992 non-null  int64
 3   A4      10992 non-null  int64
 4   A5      10992 non-null  int64
 5   A6      10992 non-null  int64
 6   A7      10992 non-null  int64
 7   A8      10992 non-null  int64
 8   A9      10992 non-null  int64
 9   A10     10992 non-null  int64
 10  A11     10992 non-null  int64
 11  A12     10992 non-null  int64
 12  A13     10992 non-null  int64
 13  A14     10992 non-null  int64
 14  A15     10992 non-null  int64
 15  A16     10992 non-null  int64
 16  class   10992 non-null  int64
dtypes: int64(17)
memory usage: 1.4 MB
(10992, 16) (10992,)
Class representation - original:  Counter({2: 1144, 4: 1144, 1: 1143, 0: 1143, 7: 1142, 6: 1056, 8: 1055, 5: 1055, 9: 1055, 3: 1055})
Class representation - training 

### Models, FFD, m=100

In [99]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       114
           1       0.91      0.92      0.92       115
           2       0.95      0.93      0.94       114
           3       0.93      0.94      0.93       106
           4       0.99      0.98      0.99       114
           5       0.98      0.95      0.97       105
           6       0.97      1.00      0.99       106
           7       0.93      0.97      0.95       114
           8       0.99      0.97      0.98       106
           9       0.93      0.92      0.92       106

    accuracy                           0.96      1100
   macro avg       0.96      0.96      0.96      1100
weighted avg       0.96      0.96      0.96      1100

Time for training model ID3 - default, FFD, m = 100 is: 2.0468451976776123.


In [100]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           0       0.85      0.92      0.89       114
           1       0.77      0.72      0.74       115
           2       0.82      0.91      0.86       114
           3       0.86      0.97      0.91       106
           4       1.00      0.96      0.98       114
           5       0.93      0.67      0.78       105
           6       0.96      0.98      0.97       106
           7       0.95      0.89      0.92       114
           8       0.89      0.74      0.80       106
           9       0.76      0.97      0.85       106

    accuracy                           0.87      1100
   macro avg       0.88      0.87      0.87      1100
weighted avg       0.88      0.87      0.87      1100

Time for training model Naive Bayes - default, FFD, m = 100 is: 0.018578767776489258.


In [33]:
# Knn-VDM complete code - TRY SAMPLE 500
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 62.0

In [102]:
# VALIDATION
scores = 'accuracy'
print(f'Validation result, {scores}, {disc}, k = {k}.')
# Create list of algorithms
models = []
models.append(('ID3', model_id3))
models.append(('CNB', model_nb))
# models.append(('Knn-VDM', KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')))

# Evaluate each model in turn
# get predicted prices on validation data
for name, model in models:
    val_pred = model.predict(x_val)
    val_results = metrics.accuracy_score(y_val, val_pred)
    print(f'{name}, validation accuracy:, {val_results}')

Validation result, accuracy, FFD, k = 10.
ID3, validation accuracy:, 0.9654126213592233
CNB, validation accuracy:, 0.8780339805825242


### Evaluation, FFD, m=100

In [103]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [104]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.053
Average bias: 0.015
Average variance: 0.049
Sklearn 0-1 loss: 0.042


In [105]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.135
Average bias: 0.128
Average variance: 0.035
Sklearn 0-1 loss: 0.126


In [106]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))