# Comparação de Classificadores

Classificadores a serem testados: KNN, SVM linear, SVM RBF, Decision Tree, Random Forest e Neural Network.

Métricas de avaliação: Accuracy e Confusion Matrix.

Links úteis:

http://scikit-learn.org/stable/modules/classes.html

http://scikit-learn.org/stable/modules/model_evaluation.html

In [1]:
# Updating scikit-image.
!pip install -U scikit-image

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-image
  Downloading scikit_image-0.19.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-image
  Attempting uninstall: scikit-image
    Found existing installation: scikit-image 0.18.3
    Uninstalling scikit-image-0.18.3:
      Successfully uninstalled scikit-image-0.18.3
Successfully installed scikit-image-0.19.3


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

from skimage.feature import hog, daisy
from torchvision.datasets import MNIST

%matplotlib inline

def mount_dataset(dataset, n_samples):
    """Build image, feature and label arrays from the first samples of a dataset.

    Args:
        dataset: Indexable collection whose item ``i`` is a sequence holding
            the image at position 0 (anything ``np.array`` accepts) and the
            label at position 1.
        n_samples: Number of leading samples (indices ``0 .. n_samples - 1``)
            to read from ``dataset``.

    Returns:
        A tuple ``(imgs, feats, labs)`` of numpy arrays with one entry per
        sample: the image, its flattened feature vector and its label.
    """

    img_list = []
    feat_list = []
    lab_list = []

    # Iterating over the first `n_samples` samples and extracting features.
    for i in range(n_samples):

        # Fetch the sample once: indexing twice would make the dataset load
        # (and possibly decode) the same item two times.
        sample = dataset[i]

        img = np.array(sample[0]) # From PIL to numpy image.
        lab = sample[1] # Recovering sample label.

        # Linearized pixels as features (naive).
        feat = img.ravel()

        '''TO DO: Extraia características usando descritores mais representativos
        que os pixels linearizados (i.e. HOG, Daisy).'''

        # No-op for the pixel baseline; keeps the feature vector 1-D if a
        # multi-dimensional descriptor is substituted above.
        feat = feat.ravel()

        # Updating lists.
        img_list.append(img)
        feat_list.append(feat)
        lab_list.append(lab)

    imgs = np.asarray(img_list)
    feats = np.asarray(feat_list)
    labs = np.asarray(lab_list)

    return imgs, feats, labs

# Display name of each classifier; entries are index-aligned with
# `classifiers` below (the two lists are zipped together when evaluating).
names = ['KNN', 'Naive Bayes']

# Presetting classifiers.
'''TO DO: Instancie os outros classificadores descritos no cabeçalho desse
notebook. Lembre-se de adicionar o nome do classificador na lista `names'
e de importar os pacotes necessários (i.e. sklearn.ensemble, sklearn.svm,
etc). Para mais informações sobre os algoritmos de classificação do sklearn:
<https://scikit-learn.org/stable/supervised_learning.html>'''

# Estimator instances, listed in the same order as `names`.
classifiers = [KNeighborsClassifier(n_neighbors=5), GaussianNB()]

'''TO DO: Modifique os parâmetros dos algoritmos e insira novos classificadores
modernos (i.e. AdaBoost, Quadratic Discriminant Analysis, XGBoost, etc).'''

# MNIST training and test splits from torchvision, both rooted at the
# current directory. Only the training split is created with download=True.
trn_dataset = MNIST(root='./', train=True, download=True)
tst_dataset = MNIST(root='./', train=False)

# Number of target classes (used for the confusion-matrix tick labels).
n_classes = 10

''' TO DO: Teste em outros datasets um pouco mais desafiadores como o EMNIST,
KMNIST e FashionMNIST. Todos eles estão disponíveis no torchvision com uso bem
similar à interface do MNIST. Para mais informações sobre como carregar
datasets do pytorch, acesse:
<https://pytorch.org/tutorials/beginner/basics/data_tutorial.html>'''

# Build the image, feature and label arrays from the first 1000 samples of
# each split.
trn_imgs, trn_feats, trn_labs = mount_dataset(trn_dataset, n_samples=1000)
tst_imgs, tst_feats, tst_labs = mount_dataset(tst_dataset, n_samples=1000)

'''TO DO: Teste com 100, 1000 e 10000 samples no treino. Observe como se
comportam os diferentes algoritmos de aprendizado com mais e menos amostras.'''

'''TO DO: Separe 20% do conjunto de treino para realizar validação e execute um
grid search para achar os melhores parâmetros de cada classificador.'''


def _report_shapes(title, imgs, feats, labs):
    """Print the shapes of one split's image, feature and label arrays."""
    print(title)
    print('    image tensor', imgs.shape)
    print('    feature tensor', feats.shape)
    print('    label tensor', labs.shape)


_report_shapes('Training set', trn_imgs, trn_feats, trn_labs)

_report_shapes('Test set', tst_imgs, tst_feats, tst_labs)

# Class ids used for both the confusion matrices and their tick labels.
# Passing them explicitly to `confusion_matrix` pins the matrix to
# n_classes x n_classes; otherwise its size follows the classes that happen
# to occur in the data and may not match the number of display labels.
class_ids = list(range(n_classes))
class_names = [str(c) for c in class_ids]

# Fit every classifier on the training features, then report accuracy and
# row-normalized confusion matrices for the training and test sets.
for clf_name, clf in zip(names, classifiers):

    print('------------------------------------')
    print('------------------------------------')
    print('------------------------------------')
    print('    ', 'Classifier', clf_name)

    # Fitting classifier to train data.
    clf.fit(trn_feats, trn_labs)

    # Obtaining class prediction for training data.
    prd_trn = clf.predict(trn_feats)

    # Obtaining class prediction for unseen data.
    prd_tst = clf.predict(tst_feats)

    # Computing error metrics in the training data.
    acc_trn = metrics.accuracy_score(trn_labs, prd_trn)
    cm_trn = metrics.confusion_matrix(trn_labs, prd_trn,
                                      labels=class_ids, normalize='true')

    # Computing error metrics in the unseen data.
    acc_tst = metrics.accuracy_score(tst_labs, prd_tst)
    cm_tst = metrics.confusion_matrix(tst_labs, prd_tst,
                                      labels=class_ids, normalize='true')

    # Printing error metrics.
    print('        Accuracy Train: %.4f, Test: %.4f' % (acc_trn, acc_tst))

    # Displaying confusion matrices side by side (train on the left).
    disp_trn = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_trn,
                                              display_labels=class_names)
    disp_tst = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_tst,
                                              display_labels=class_names)

    fig, ax = plt.subplots(1, 2, figsize=(20, 10))

    disp_trn.plot(ax=ax[0])
    ax[0].set_title('Train')

    disp_tst.plot(ax=ax[1])
    ax[1].set_title('Test')

    plt.show()

    '''TO DO: Avalie a partir das métricas disponíveis os desempenhos dos
    algoritmos nos conjuntos de treino e teste, observando quais algoritmos
    com quais parâmetros eles overfitaram ou underfitaram.'''

In [None]:
# Number of test images displayed per classifier (one grid row each).
n_show_samples = 10

# Random ordering of the test samples; the first `n_show_samples` entries
# select the images shown, the same ones for every classifier.
perm = np.random.permutation(tst_feats.shape[0])

# squeeze=False keeps `ax` two-dimensional even when there is a single
# classifier or a single row, so `ax[row, col]` indexing below always works.
fig, ax = plt.subplots(n_show_samples, len(classifiers),
                       figsize=(3 * len(classifiers), 3 * n_show_samples),
                       squeeze=False)

# Iterating over classifiers: one grid column per classifier.
for clf_cnt, (clf_name, clf) in enumerate(zip(names, classifiers)):

    # Predicting from subset of `n_show_samples' samples.
    tst_prds = clf.predict(tst_feats[perm[:n_show_samples], :])

    # Plotting images: zip stops at the number of predictions, pairing each
    # one with the test index it was computed from.
    for i, (idx, prd) in enumerate(zip(perm, tst_prds)):

        img = tst_imgs[idx]
        lab = tst_labs[idx]

        ax[i, clf_cnt].imshow(img, cmap='gray')
        ax[i, clf_cnt].set_yticks([])
        ax[i, clf_cnt].set_xticks([])
        ax[i, clf_cnt].set_title('%s: Class %d, Pred %d' % (clf_name, lab, prd))

plt.tight_layout()
plt.show()