# Classification models

Dataset: satimage<br>
By: Sam<br>
Update at: 29/06/2022

====

Summary:

- Import unsupervised discretised datasets (already encoded categorical attributes)
- Split dataset: No random split, split exactly as teh intial order (please read About Dataset)
- Perform classification models: ID3, Naive Bayes. Knn-VDM is not applicable for the whole dataset due to long time training (brute force algorithm)
- No cross validation (specified in the About dataset)
- Evaluation performance metrics: Bias, Variance


### About Dataset
NUMBER OF ATTRIBUTES: 36 (= 4 spectral bands x 9 pixels in neighbourhood) the pixels read out in sequence left-to-right and top-to-bottom. 

    - A1-A4: 4 top-left
    - A5-A8: 4 top middle
    - A9-A12: 4 top-right
    => central pixel are given by attributes 17,18,19 and 20

NUMBER OF EXAMPLES:

	- training set     4435
	- test set         2000
    
ATTRIBUTES: The attributes are numerical, in the range 0 to 255.
CLASS: 
	There are 6 decision classes: 1,2,3,4,5 and 7.

**!!! NB. There are no examples with class 6 in this dataset-they have all been removed because of doubts about the 
	validity of this class.
    
**!!! NB. DO NOT USE CROSS-VALIDATION WITH THIS DATASET !!!
- Just train and test only once with the above training and test sets.
- The data is given in random order and certain lines of data have been removed so you cannot reconstruct the original image from this dataset.

In [1]:
import pandas as pd
from pandas import read_csv
from pandas import set_option
import numpy as np
from numpy import arange
## EDA
from collections import Counter

In [2]:
# Pre-processing
from sklearn.preprocessing import OrdinalEncoder
# Cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score # 1 metric
from sklearn.model_selection import cross_validate # more than 1 metric
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# RIPPER (https://pypi.org/project/wittgenstein/) Only for binary
import wittgenstein as lw 

In [4]:
# For Naive Bayes
from sklearn.naive_bayes import CategoricalNB # Categorical Naive Bayes
from sklearn.naive_bayes import MultinomialNB # Multinominal Naive Bayes (suitable for NLP)
from mixed_naive_bayes import MixedNB # Mixed Naive Bayes for combination of both discrete & continuous feature

In [5]:
# For decision tree ID3 
# https://stackoverflow.com/questions/61867945/python-import-error-cannot-import-name-six-from-sklearn-externals
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
from id3 import Id3Estimator # ID3 Decision Tree (https://pypi.org/project/decision-tree-id3/)
from id3 import export_graphviz

In [6]:
# Knn-VDM 3
from vdm3 import ValueDifferenceMetric
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# For model evaluation
from sklearn.metrics import classification_report
from sklearn import metrics
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. EWD data

## 1.1 EWD, k = 4 (satimage_ewd1)

In [9]:
# Complete code for data preperation
# Read data
df_ewd1 = pd.read_csv('satimage_ewd1.csv')
disc = 'EWD'
k = 4

df_ewd1.info()
data = df_ewd1.values
data.shape

features = df_ewd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

In [10]:
# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

(4435, 36) (4435,)
(2000, 36) (2000,)
Class representation - original:  Counter({1: 1533, 7: 1508, 3: 1358, 5: 707, 2: 703, 4: 626})
Class representation - training data:  Counter({1: 1072, 7: 1038, 3: 961, 2: 479, 5: 470, 4: 415})
Class representation - testing data:  Counter({7: 470, 1: 461, 3: 397, 5: 237, 2: 224, 4: 211})


In [11]:
pd.DataFrame(x_train, columns=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,3,3,3,2,2,2,2,1,2,2,...,3,2,3,3,3,2,2,3,2,1
1,2,2,2,1,2,2,2,1,2,2,...,3,2,2,3,2,1,2,2,2,1
2,2,2,2,1,2,2,2,1,2,2,...,2,1,2,2,2,1,2,2,2,1
3,2,2,2,1,2,2,2,1,2,2,...,2,1,2,2,2,1,2,2,2,1
4,2,2,2,1,2,2,2,1,2,2,...,2,1,2,2,2,1,2,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4430,1,1,2,2,1,1,2,2,1,1,...,2,2,1,2,2,2,1,2,2,1
4431,1,1,2,2,1,1,2,2,1,2,...,2,2,1,2,2,1,1,2,2,1
4432,1,1,2,2,1,2,2,1,1,2,...,2,1,1,2,2,1,1,2,2,1
4433,1,2,2,1,1,2,2,1,2,2,...,2,1,1,2,2,1,1,2,2,1


In [12]:
# pd.DataFrame(x_train, columns=None).to_csv('x_train_edw1.csv', index=False)

In [13]:
# pd.DataFrame(x_test, columns=None).to_csv('x_test_edw1.csv', index=False)

In [14]:
# pd.DataFrame(y_train, columns=None).to_csv('y_train_edw1.csv', index=False)

In [15]:
# pd.DataFrame(y_test, columns=None).to_csv('y_test_edw1.csv', index=False)

### Models, EWD, k=4

In [16]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.95      0.93      0.94       461
           2       0.94      0.96      0.95       224
           3       0.85      0.88      0.87       397
           4       0.51      0.50      0.50       211
           5       0.77      0.78      0.77       237
           7       0.79      0.77      0.78       470

    accuracy                           0.82      2000
   macro avg       0.80      0.81      0.80      2000
weighted avg       0.82      0.82      0.82      2000

Time for training model ID3 - default, EWD, k = 4 is: 1.379641056060791.


In [17]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.90      0.76      0.83       461
           2       0.99      0.90      0.94       224
           3       0.88      0.82      0.85       397
           4       0.45      0.74      0.56       211
           5       0.60      0.59      0.59       237
           7       0.76      0.73      0.74       470

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.75      2000
weighted avg       0.79      0.76      0.77      2000

Time for training model Naive Bayes - default, EWD, k = 4 is: 0.02179098129272461.


In [20]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           2       0.97      0.96      0.97       117
           3       0.87      0.99      0.92       205
           4       0.64      0.56      0.60        57
           5       0.62      0.71      0.66        41
           7       0.74      0.50      0.60        80

    accuracy                           0.83       500
   macro avg       0.77      0.74      0.75       500
weighted avg       0.83      0.83      0.82       500

Time for training model Knn-VDM, EWD, k = 4 is: 597.7573320865631.


### Evaluation, EDW, k = 4

In [15]:
from sklearn.metrics import zero_one_loss
#This library is used to decompose bias and variance in our models
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')

In [20]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.185
Average bias: 0.140
Average variance: 0.119
Sklearn 0-1 loss: 0.175


In [21]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.239
Average bias: 0.239
Average variance: 0.012
Sklearn 0-1 loss: 0.241


In [14]:
# # Knn - SAMPLE 500 !!! TO LONG
# import time
# start = time.time() # For measuring time execution

# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train[:500,:], y_train[:500], x_test[:500,:], y_test[:500],
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# end = time.time()
# print(f'Time for evaluation model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

## 1.2 EWD, k = 7 (satimage_ewd2)

In [23]:
# Complete code for data preperation
# Read data
df_ewd2 = pd.read_csv('satimage_ewd2.csv')
disc = 'EWD'
k = 7

df_ewd2.info()
data = df_ewd2.values
data.shape

features = df_ewd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, EWD, k=7

In [24]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.94      0.95      0.94       461
           2       0.95      0.96      0.95       224
           3       0.86      0.86      0.86       397
           4       0.50      0.46      0.48       211
           5       0.84      0.81      0.82       237
           7       0.79      0.82      0.80       470

    accuracy                           0.83      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.83      0.83      0.83      2000

Time for training model ID3 - default, EWD, k = 7 is: 1.5133397579193115.


In [25]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.90      0.85      0.87       461
           2       0.99      0.92      0.95       224
           3       0.91      0.85      0.88       397
           4       0.47      0.70      0.57       211
           5       0.65      0.68      0.67       237
           7       0.83      0.76      0.80       470

    accuracy                           0.80      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.82      0.80      0.81      2000

Time for training model Naive Bayes - default, EWD, k = 7 is: 0.01701498031616211.


In [16]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyError: 0.0

### Evaluation, EWD, k=7

In [27]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.174
Average bias: 0.115
Average variance: 0.129
Sklearn 0-1 loss: 0.165


In [28]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.201
Average bias: 0.200
Average variance: 0.012
Sklearn 0-1 loss: 0.200


In [29]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## 1.3 EWD, k = 10, satimage_ewd3

In [28]:
# Complete code for data preperation
# Read data
df_ewd3 = pd.read_csv('satimage_ewd3.csv')
disc = 'EWD'
k = 10

df_ewd3.info()
data = df_ewd3.values
data.shape

features = df_ewd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, EWD, k=10

In [20]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.94      0.95      0.94       461
           2       0.94      0.95      0.94       224
           3       0.85      0.84      0.85       397
           4       0.54      0.51      0.52       211
           5       0.78      0.78      0.78       237
           7       0.79      0.81      0.80       470

    accuracy                           0.83      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.83      0.83      0.83      2000

Time for training model ID3 - default, EWD, k = 10 is: 1.7287828922271729.


In [21]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.87      0.89       461
           2       0.99      0.91      0.95       224
           3       0.90      0.88      0.89       397
           4       0.48      0.70      0.57       211
           5       0.68      0.70      0.69       237
           7       0.84      0.74      0.79       470

    accuracy                           0.81      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.83      0.81      0.82      2000

Time for training model Naive Bayes - default, EWD, k = 10 is: 0.021162986755371094.


In [29]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyError: 9.0

### Evaluation, EWD, k=10

In [34]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.174
Average bias: 0.105
Average variance: 0.137
Sklearn 0-1 loss: 0.172


In [35]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 9 is out of bounds for axis 1 with size 9

In [36]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# 2. EFD

## 2.1 EFD, k = 4 (satimage_efd1)

In [26]:
# Complete code for data preperation
# Read data
df_efd1 = pd.read_csv('satimage_efd1.csv')
disc = 'EFD'
k = 4

df_efd1.info()
data = df_efd1.values
data.shape

features = df_efd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, EFD, k=4

In [38]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.92      0.94      0.93       461
           2       0.94      0.94      0.94       224
           3       0.83      0.84      0.83       397
           4       0.54      0.56      0.55       211
           5       0.79      0.76      0.78       237
           7       0.82      0.80      0.81       470

    accuracy                           0.83      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.83      0.83      0.83      2000

Time for training model ID3 - default, EFD, k = 4 is: 1.3556320667266846.


In [39]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.84      0.87       461
           2       0.94      0.89      0.91       224
           3       0.90      0.88      0.89       397
           4       0.50      0.69      0.58       211
           5       0.62      0.71      0.66       237
           7       0.85      0.73      0.79       470

    accuracy                           0.80      2000
   macro avg       0.79      0.79      0.78      2000
weighted avg       0.82      0.80      0.81      2000

Time for training model Naive Bayes - default, EFD, k = 4 is: 0.01708078384399414.


In [27]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           2       0.98      0.98      0.98       117
           3       0.94      0.98      0.96       205
           4       0.70      0.77      0.73        57
           5       0.67      0.90      0.77        41
           7       0.88      0.57      0.70        80

    accuracy                           0.89       500
   macro avg       0.84      0.84      0.83       500
weighted avg       0.89      0.89      0.88       500

Time for training model Knn-VDM, EFD, k = 4 is: 599.2019200325012.


### Evaluation, EFD, k=4

In [41]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.185
Average bias: 0.121
Average variance: 0.138
Sklearn 0-1 loss: 0.173


In [42]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.200
Average bias: 0.200
Average variance: 0.012
Sklearn 0-1 loss: 0.202


In [43]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## 2.2 EFD, k = 7 (satimage_efd2)

In [30]:
# Complete code for data preperation
# Read data
df_efd2 = pd.read_csv('satimage_efd2.csv')
disc = 'EFD'
k = 7

df_efd2.info()
data = df_efd2.values
data.shape

features = df_efd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, EFD, k=7

In [45]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.95      0.92      0.94       461
           2       0.95      0.95      0.95       224
           3       0.85      0.88      0.86       397
           4       0.56      0.51      0.53       211
           5       0.79      0.79      0.79       237
           7       0.80      0.83      0.81       470

    accuracy                           0.83      2000
   macro avg       0.82      0.81      0.81      2000
weighted avg       0.83      0.83      0.83      2000

Time for training model ID3 - default, EFD, k = 7 is: 1.5397441387176514.


In [46]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.90      0.85      0.88       461
           2       1.00      0.91      0.95       224
           3       0.90      0.88      0.89       397
           4       0.48      0.73      0.58       211
           5       0.67      0.70      0.68       237
           7       0.85      0.73      0.79       470

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.79      2000
weighted avg       0.83      0.80      0.81      2000

Time for training model Naive Bayes - default, EFD, k = 7 is: 0.018496036529541016.


In [31]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           2       0.98      0.97      0.97       117
           3       0.90      1.00      0.94       205
           4       0.80      0.65      0.72        57
           5       0.57      0.85      0.69        41
           7       0.90      0.55      0.68        80

    accuracy                           0.87       500
   macro avg       0.83      0.80      0.80       500
weighted avg       0.88      0.87      0.86       500

Time for training model Knn-VDM, EFD, k = 7 is: 582.9510357379913.


### Evaluation, EFD, k=7

In [48]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.170
Average bias: 0.096
Average variance: 0.135
Sklearn 0-1 loss: 0.165


In [49]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.196
Average bias: 0.195
Average variance: 0.013
Sklearn 0-1 loss: 0.196


In [50]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## EFD, k =10

In [32]:
# Complete code for data preperation
# Read data
df_efd3 = pd.read_csv('satimage_efd3.csv')
disc = 'EFD'
k = 10

df_efd3.info()
data = df_efd3.values
data.shape

features = df_efd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, EFD, k=10

In [52]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {k} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.94      0.95      0.95       461
           2       0.94      0.96      0.95       224
           3       0.87      0.85      0.86       397
           4       0.53      0.54      0.53       211
           5       0.82      0.76      0.79       237
           7       0.79      0.81      0.80       470

    accuracy                           0.83      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.83      0.83      0.83      2000

Time for training model ID3 - default, EFD, k = 10 is: 1.8621201515197754.


In [53]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, k = {k} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.89      0.90       461
           2       0.99      0.91      0.95       224
           3       0.91      0.89      0.90       397
           4       0.48      0.72      0.58       211
           5       0.69      0.73      0.71       237
           7       0.85      0.72      0.78       470

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.80      2000
weighted avg       0.84      0.81      0.82      2000

Time for training model Naive Bayes - default, EFD, k = 10 is: 0.01699519157409668.


In [33]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, k = {k} is: {end - start}.') # Total time execution

KeyError: 0.0

### Evaluation, EFD, k=10

In [55]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.174
Average bias: 0.102
Average variance: 0.137
Sklearn 0-1 loss: 0.166


In [56]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.186
Average bias: 0.184
Average variance: 0.014
Sklearn 0-1 loss: 0.186


In [57]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

# FFD

## FFD, m =10, satimage_ffd1

In [34]:
# Complete code for data preperation
# Read data
df_ffd1 = pd.read_csv('satimage_ffd1.csv')
disc = 'FFD'
m = 10

df_ffd1.info()
data = df_ffd1.values
data.shape

features = df_ffd1.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, FFD, m=10

In [59]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.95      0.96      0.96       461
           2       0.98      0.94      0.96       224
           3       0.85      0.86      0.86       397
           4       0.52      0.53      0.52       211
           5       0.88      0.80      0.84       237
           7       0.82      0.84      0.83       470

    accuracy                           0.85      2000
   macro avg       0.83      0.82      0.83      2000
weighted avg       0.85      0.85      0.85      2000

Time for training model ID3 - default, FFD, k = 10 is: 3.8470358848571777.


In [60]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

IndexError: index 74 is out of bounds for axis 1 with size 74

In [35]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 29.0

### Evaluation, FFD, m=10

In [62]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.165
Average bias: 0.097
Average variance: 0.131
Sklearn 0-1 loss: 0.153


In [63]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

IndexError: index 52 is out of bounds for axis 1 with size 52

In [64]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 30, satimage_ffd2

In [36]:
# Complete code for data preperation
# Read data
df_ffd2 = pd.read_csv('satimage_ffd2.csv')
disc = 'FFD'
m = 30

df_ffd2.info()
data = df_ffd2.values
data.shape

features = df_ffd2.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, FFD, m=30

In [66]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.95      0.96      0.96       461
           2       0.95      0.94      0.94       224
           3       0.85      0.86      0.86       397
           4       0.52      0.53      0.52       211
           5       0.88      0.79      0.83       237
           7       0.82      0.84      0.83       470

    accuracy                           0.85      2000
   macro avg       0.83      0.82      0.82      2000
weighted avg       0.85      0.85      0.85      2000

Time for training model ID3 - default, FFD, k = 30 is: 3.83943510055542.


In [67]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.89      0.90       461
           2       0.99      0.90      0.94       224
           3       0.89      0.90      0.90       397
           4       0.48      0.71      0.57       211
           5       0.75      0.70      0.72       237
           7       0.84      0.75      0.80       470

    accuracy                           0.82      2000
   macro avg       0.81      0.81      0.80      2000
weighted avg       0.84      0.82      0.82      2000

Time for training model Naive Bayes - default, FFD, m = 30 is: 0.01922774314880371.


In [37]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 19.0

### Evaluation, FFD, m=30

In [69]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.165
Average bias: 0.098
Average variance: 0.132
Sklearn 0-1 loss: 0.154


In [70]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.182
Average bias: 0.183
Average variance: 0.021
Sklearn 0-1 loss: 0.182


In [71]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 60, satimage_ffd3

In [38]:
# Complete code for data preperation
# Read data
df_ffd3 = pd.read_csv('satimage_ffd3.csv')
disc = 'FFD'
m = 60

df_ffd3.info()
data = df_ffd3.values
data.shape

features = df_ffd3.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, FFD, m=60

In [11]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, k = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.94      0.95      0.95       461
           2       0.95      0.94      0.94       224
           3       0.85      0.86      0.86       397
           4       0.52      0.53      0.53       211
           5       0.85      0.79      0.82       237
           7       0.83      0.84      0.84       470

    accuracy                           0.84      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.84      0.84      0.84      2000

Time for training model ID3 - default, FFD, k = 60 is: 4.08695387840271.


In [12]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.89      0.90       461
           2       0.99      0.91      0.94       224
           3       0.89      0.89      0.89       397
           4       0.48      0.71      0.57       211
           5       0.74      0.70      0.72       237
           7       0.85      0.75      0.80       470

    accuracy                           0.82      2000
   macro avg       0.81      0.81      0.80      2000
weighted avg       0.84      0.82      0.82      2000

Time for training model Naive Bayes - default, FFD, m = 60 is: 0.023862123489379883.


In [39]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 16.0

### Evaluation, FFD, m=60

In [16]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.165
Average bias: 0.098
Average variance: 0.131
Sklearn 0-1 loss: 0.156


In [17]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.181
Average bias: 0.182
Average variance: 0.021
Sklearn 0-1 loss: 0.182


In [18]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))

## FFD, m = 100, satimage_ffd4

In [40]:
# Complete code for data preperation
# Read data
df_ffd4 = pd.read_csv('satimage_ffd4.csv')
disc = 'FFD'
m = 100

df_ffd4.info()
data = df_ffd4.values
data.shape

features = df_ffd4.drop('class', axis = 1).columns

# separate the data into X and y
X = data[:, : len(features)]
Y = data[:,-1]

print(X.shape, Y.shape)

# Split train test (!!NB: Split dataset exactly as the original order, see about dataset)
x_train = X[: 4435, :]
x_test = X[4435:, :] 
y_train = Y[: 4435]
y_test = Y[4435:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Check representation of class
print('Class representation - original: ', Counter(Y)) 
print('Class representation - training data: ', Counter(y_train)) 
print('Class representation - testing data: ', Counter(y_test)) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A1      6435 non-null   int64
 1   A2      6435 non-null   int64
 2   A3      6435 non-null   int64
 3   A4      6435 non-null   int64
 4   A5      6435 non-null   int64
 5   A6      6435 non-null   int64
 6   A7      6435 non-null   int64
 7   A8      6435 non-null   int64
 8   A9      6435 non-null   int64
 9   A10     6435 non-null   int64
 10  A11     6435 non-null   int64
 11  A12     6435 non-null   int64
 12  A13     6435 non-null   int64
 13  A14     6435 non-null   int64
 14  A15     6435 non-null   int64
 15  A16     6435 non-null   int64
 16  A17     6435 non-null   int64
 17  A18     6435 non-null   int64
 18  A19     6435 non-null   int64
 19  A20     6435 non-null   int64
 20  A21     6435 non-null   int64
 21  A22     6435 non-null   int64
 22  A23     6435 non-null   int64
 23  A24     6435 

### Models, FFD, m=100

In [20]:
# ID3 - Default
import time
start = time.time() # For measuring time execution

model_id3 = Id3Estimator()
model_id3.fit(x_train, y_train)
# Testing
y_pred_id3 = model_id3.predict(x_test)
print(classification_report(y_test, y_pred_id3))

end = time.time()
print(f'Time for training model ID3 - default, {disc}, m = {m} is: {end - start}.') # Total time execution


              precision    recall  f1-score   support

           1       0.93      0.96      0.95       461
           2       0.94      0.92      0.93       224
           3       0.88      0.87      0.87       397
           4       0.56      0.57      0.56       211
           5       0.83      0.78      0.80       237
           7       0.83      0.84      0.84       470

    accuracy                           0.85      2000
   macro avg       0.83      0.82      0.83      2000
weighted avg       0.85      0.85      0.85      2000

Time for training model ID3 - default, FFD, k = 100 is: 3.48462176322937.


In [21]:
# Naive Bayes - Default
import time
start = time.time() # For measuring time execution
model_nb = CategoricalNB()
model_nb.fit(x_train, y_train)
# Testing
y_pred_nb = model_nb.predict(x_test)
model_nb.classes_
print(classification_report(y_test, y_pred_nb))
end = time.time()
print(f'Time for training model Naive Bayes - default, {disc}, m = {m} is: {end - start}.') # Total time execution

              precision    recall  f1-score   support

           1       0.91      0.88      0.90       461
           2       0.98      0.91      0.94       224
           3       0.90      0.90      0.90       397
           4       0.48      0.71      0.57       211
           5       0.74      0.71      0.72       237
           7       0.85      0.75      0.79       470

    accuracy                           0.82      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.84      0.82      0.82      2000

Time for training model Naive Bayes - default, FFD, m = 100 is: 0.016165971755981445.


In [41]:
# Knn-VDM complete code - 500 instance
import time
start = time.time() # For measuring time execution

# specific the continuous columns index if any
vdm = ValueDifferenceMetric(x_train[:500,:], y_train[:500], continuous = None)
vdm.fit()
# Knn model, n_neigbour = 3, metrics = vdm
knn_vdm = KNeighborsClassifier(n_neighbors=3, metric=vdm.get_distance, algorithm='brute')
## Fit model
knn_vdm.fit(x_train[:500,:], y_train[:500])
# Testing
y_pred_knn = knn_vdm.predict(x_test[:500,:])
knn_vdm.classes_
print(classification_report(y_test[:500], y_pred_knn))

end = time.time()
print(f'Time for training model Knn-VDM, {disc}, m = {m} is: {end - start}.') # Total time execution

KeyError: 10.0

### Evaluation, FFD, m=100

In [23]:
# ID3
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_id3, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_id3))

Average expected loss: 0.166
Average bias: 0.095
Average variance: 0.132
Sklearn 0-1 loss: 0.152


In [24]:
# Naive Bayes
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
model_nb, x_train, y_train, x_test, y_test,
loss='0-1_loss',
random_seed=123)
#---
print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_nb))

Average expected loss: 0.181
Average bias: 0.181
Average variance: 0.020
Sklearn 0-1 loss: 0.183


In [25]:
# # Knn
# avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
# knn_vdm, x_train, y_train, x_test, y_test,
# loss='0-1_loss',
# random_seed=123)
# #---
# print('Average expected loss: %.3f' % avg_expected_loss)
# print('Average bias: %.3f' % avg_bias)
# print('Average variance: %.3f' % avg_var)
# print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_test,y_pred_knn))