# Avaliação dos modelos

#### 1 - Bibliotecas

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors

import graphviz
from sklearn.tree import export_graphviz

#### 2 - Carregando a base de dados sobre frutas

In [None]:
# Load the fruit measurements table from the local CSV folder.
fruits = pd.read_table('./CSV/fruit_data_with_colors.txt')

# Keep only two feature columns (height, width) so the problem stays 2-D;
# the 'fruit_label' column is the classification target.
X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']

# Last expression of the cell: the notebook displays the first rows.
fruits.head()


## Validação Cruzada

#### 3 - Validação Cruzada

In [None]:
from sklearn.model_selection import cross_val_score

# Evaluate a 5-nearest-neighbours classifier with cv=5 cross-validation on the
# two-feature fruit data, passed as plain NumPy arrays.
X, y = X_fruits_2d.values, y_fruits_2d.values
clf = KNeighborsClassifier(n_neighbors=5)
cv_scores = cross_val_score(clf, X, y, cv=5)

# One score per fold, followed by their average.
print('Validação cruzada:', cv_scores)
print('Média: {:.3f}'.format(cv_scores.mean()))

#### 4 - Curva de Validação

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

# Four gamma candidates, logarithmically spaced between 1e-3 and 1e3.
param_range = np.logspace(-3, 3, 4)
# Score an SVC (C fixed at 1) for every gamma candidate using cv=5
# cross-validation; training and validation scores are returned separately.
train_scores, test_scores = validation_curve(SVC(C=1), X, y,
                                            param_name='gamma',
                                            param_range=param_range, cv=5)

print(train_scores)

print(test_scores)

#### 5 - Plot da Validação

In [None]:
# Adapted from the scikit-learn example "Plotting Validation Curves":
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

# Collapse the per-fold scores (axis 1) into one mean and one standard
# deviation per gamma value.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
# Raw string: '\g' is not a valid Python escape sequence and triggers a
# SyntaxWarning on recent interpreters when written in a normal string.
plt.xlabel(r'$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

# Mean training score on a logarithmic gamma axis, with a +/- one standard
# deviation band around it.
plt.semilogx(param_range, train_scores_mean, label='Escore de Treinamento',
             color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color='darkorange', lw=lw)

# Same presentation for the cross-validation score.
plt.semilogx(param_range, test_scores_mean, label='Escore da validação cruzada',
             color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color='navy', lw=lw)

plt.legend(loc='best')
plt.show()

## Árvores de Decisão


#### 6 - Importar dataset Íris e executar árvore de decisão

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Split the iris samples into training and test portions (seeded split) and
# fit a decision tree with default settings on the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=3)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Report accuracy on both portions so the gap between them is visible.
print('Acurácia da árvore de decisão no conjunto de treinamento: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Acurácia da árvore de decisão no conjunto de teste: {:.2f}'.format(
    clf.score(X_test, y_test)))

#### 7 - Profundidade da árvore (evitar overfitting)

In [None]:
# Second tree, constrained so that every leaf keeps at least 10 training
# samples; fitted on the same training split as the first tree.
clf2 = DecisionTreeClassifier(min_samples_leaf=10)
clf2.fit(X_train, y_train)

# Same accuracy report as for the unconstrained tree.
print('Acurácia da árvore de decisão no conjunto de treinamento: {:.2f}'.format(
    clf2.score(X_train, y_train)))
print('Acurácia da árvore de decisão no conjunto de teste: {:.2f}'.format(
    clf2.score(X_test, y_test)))

#### 8 - Visualizando árvores de decisão

In [None]:
def plot_decision_tree(clf, feature_names, class_names):
    """Return a Graphviz rendering of a fitted decision tree.

    Args:
        clf: Fitted decision-tree estimator accepted by ``export_graphviz``.
        feature_names: Names used to label the split conditions.
        class_names: Names used to label the predicted class of each node.

    Returns:
        A ``graphviz.Source`` object built from the exported DOT description
        (nodes are colour-filled, impurity values are omitted).
    """
    # out_file=None makes export_graphviz return the DOT text directly, so no
    # scratch file is written to the working directory and read back with a
    # possibly different default text encoding.
    dot_graph = export_graphviz(
        clf,
        out_file=None,
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        impurity=False,
    )
    return graphviz.Source(dot_graph)

# Render the unconstrained tree; as the last expression of the cell, the
# returned graph is displayed by the notebook.
plot_decision_tree(clf, iris.feature_names, iris.target_names)

#### 9 - Pré-podagem

In [None]:
# Render the tree trained with min_samples_leaf=10 for comparison with the
# unconstrained one.
plot_decision_tree(clf2, iris.feature_names, iris.target_names)

#### 10 - Importância da Característica

In [None]:
def plot_feature_importances(clf, feature_names):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Args:
        clf: Fitted estimator exposing ``feature_importances_``.
        feature_names: One label per feature, used as the y-axis tick labels.

    The bars are drawn on the current Matplotlib figure; nothing is returned.
    """
    bar_positions = np.arange(len(feature_names))
    plt.barh(bar_positions, clf.feature_importances_)
    plt.yticks(bar_positions, feature_names)
    plt.xlabel("Importância")
    plt.ylabel("Característica")

In [None]:
# Wide, short figure so the horizontal bars and their labels fit comfortably.
plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf2, iris.feature_names)
plt.show()

# Also print the raw importance values of the min_samples_leaf=10 tree.
print('Importâncias: {}'.format(clf2.feature_importances_))

## Classificadores dummy e Base de dados pré-carregadas

#### 11 - Carregando base de dados (digits)

In [None]:
from sklearn.datasets import load_digits

dataset = load_digits()
# NOTE: X and y are rebound here; from this cell on they refer to the digits
# data, no longer to the fruit data.
X, y = dataset.data, dataset.target

# Print how many samples each class has (bincount counts occurrences of each
# integer label).
for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name,class_count)

#### 12 - Transformando a base de dados para não-balanceada

In [None]:
# Work on a copy so the original multi-class labels in `y` stay untouched.
y_binary_imbalanced = y.copy()
# Every label other than 1 becomes 0, leaving a binary "is it a 1?" target.
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

# Compare the labels at positions 1..29 before and after the transformation.
print('Original:\t', y[1:30])
print('Novo:\t', y_binary_imbalanced[1:30])

#### 13 - Verificando proporções da base de dados não-balanceada

In [None]:
# Count the samples of class 0 and class 1; displayed as the cell output.
np.bincount(y_binary_imbalanced)

#### 14 - Treinando um classificador SVC

In [None]:
# Seeded split of the digits features against the binary labels; this rebinds
# the X_train/X_test/y_train/y_test names used by the following cells.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

from sklearn.svm import SVC

# RBF-kernel SVC with C=1; the cell output is its accuracy on the test split.
svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

#### 15 - Classificadores "bobos" (Dummy)

In [None]:
from sklearn.dummy import DummyClassifier

# IPython-only syntax (not plain Python): shows the DummyClassifier help.
DummyClassifier?

# Baseline classifier using the 'most_frequent' strategy, fitted on the
# training split.
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)

y_dummy_predictions = dummy_majority.predict(X_test)

# Last expression of the cell: the notebook displays the predicted labels.
y_dummy_predictions

#### 16 - Escore de teste do classificador dummy

In [None]:
# Test-split score of the most-frequent-class baseline, shown as cell output.
dummy_majority.score(X_test, y_test)

#### 17 - Carregar novo classificador SVC linear e verificar escore de teste 

In [None]:
# Rebind `svm` to a linear-kernel SVC (C=1) and display its test-split score.
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

### Matrizes de Confusão (Confusion matrices)


True Negative (TN)  | False Positive (FP)

False Negative (FN) | True Positive (TP)


#### 18 - Matrizes de confusão binária

In [None]:
from sklearn.metrics import confusion_matrix

# Re-fit the most-frequent-class baseline and build its confusion matrix
# (true labels first, predictions second).
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)
confusion = confusion_matrix(y_test, y_majority_predicted)

print('Classe mais frequente\n', confusion)

#### 19 - Matriz de confusão - classificador Dummy

In [None]:
# Baseline using the 'stratified' strategy. No random_state is passed, so
# the printed matrix is presumably not reproducible between runs — confirm.
dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train)
y_classprop_predicted = dummy_classprop.predict(X_test)
confusion = confusion_matrix(y_test, y_classprop_predicted)

print('Estratificado\n', confusion)

#### 20 - Matriz de confusão - SVC

In [None]:
# Linear-kernel SVC (C=1): keep its test predictions in `svm_predicted`
# (reused by a later report cell) and print its confusion matrix.
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_predicted = svm.predict(X_test)
confusion = confusion_matrix(y_test, svm_predicted)

print('SVC (kernel linear, C=1)\n', confusion)

#### 21 - Matriz de confusão - Regressão Logística

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; `lr` and `lr_predicted` are
# reused by later cells.
lr = LogisticRegression().fit(X_train, y_train)
lr_predicted = lr.predict(X_test)
confusion = confusion_matrix(y_test, lr_predicted)

print('Regressão Logística\n', confusion)

#### 22 - Matriz de confusão - Árvore de decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree; `tree_predicted` is reused by the metric and
# report cells that follow.
dt = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
tree_predicted = dt.predict(X_test)
confusion = confusion_matrix(y_test, tree_predicted)

# Take the depth shown in the heading from the estimator itself so the label
# cannot drift from the value actually used to build the tree.
print('Árvore de decisão (max_depth = {})\n'.format(dt.max_depth), confusion)

## Métricas de avaliação para classificação binária


### Acurácia = (TP + TN) / (TP + TN + FP + FN)
###  Precisão = TP / (TP + FP) aka PPV (Positive predictive value)
###  Recall = TP / (TP + FN)  aka TPR (True Positive Rate)
###  F1 = 2 * Precisão * Recall / (Precisão + Recall) 

#### 23 - Computando métricas

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Apply each metric to the decision-tree test predictions and print it with
# its own label, in the order accuracy, precision, recall, F1.
for template, metric in (
    ('Acurácia: {:.2f}', accuracy_score),
    ('Precisão: {:.2f}', precision_score),
    ('Recall: {:.2f}', recall_score),
    ('F1: {:.2f}', f1_score),
):
    print(template.format(metric(y_test, tree_predicted)))

#### 24 - Reporte combinado

In [None]:
from sklearn.metrics import classification_report

# Combined per-class report for the decision-tree predictions; class 0 is
# shown as 'not 1' and class 1 as '1'.
print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))

#### 25 - Outros reportes combinados

In [None]:
# Print the combined report for each model evaluated so far, using the test
# predictions stored by the earlier confusion-matrix cells.
for heading, predictions in (
    ('Estratificado (dummy)\n', y_classprop_predicted),
    ('SVM\n', svm_predicted),
    ('Regressão Logística\n', lr_predicted),
    ('Árvore de Decisão\n', tree_predicted),
):
    print(heading,
          classification_report(y_test, predictions, target_names=['not 1', '1']))

### Funções de Decisão


#### 26 - Função de decisão

In [None]:
# Same seeded split as before, then re-fit the logistic regression and take
# its decision-function scores for the test samples.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test)
# Pair the true label with the score for the first 20 test samples.
y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20]))

# Last expression of the cell: the notebook displays the (label, score) pairs.
y_score_list

#### 27 - Função Proba

In [None]:
# Same seeded split, this time collecting predicted probabilities instead of
# decision-function scores.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test)
# Column 1 of predict_proba is paired with the true label for the first 20
# test samples.
y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1]))

# Last expression of the cell: the notebook displays the pairs.
y_proba_list

#### 28 - Curvas de Precisão-Recall

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall values over the decision thresholds, computed from the
# logistic-regression decision scores held in `y_scores_lr`.
precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)
# Index of the threshold nearest to zero, and the precision/recall values at
# that index, so the point can be highlighted on the curve.
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
# Precision is drawn on the x axis and recall on the y axis.
plt.plot(precision, recall, label='Curva de Precisão-Recall')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
plt.xlabel('Precisão', fontsize=16)
plt.ylabel('Recall', fontsize=16)
# Use the axes that already hold the curve: on current Matplotlib a bare
# plt.axes() call adds a new, empty axes instead of returning the existing one.
plt.gca().set_aspect('equal')
plt.show()

#### 29 - Área abaixo da curva ROC (Característica de Operação do Receptor)


In [None]:
from sklearn.metrics import roc_curve, auc

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# Re-fit the logistic regression, then derive the ROC curve and the area
# under it from the decision-function scores of the test samples.
y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr, tpr_lr, lw=3, label='Regressão Logística (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('Taxa FP', fontsize=16)
plt.ylabel('Taxa TP', fontsize=16)
plt.title('Curva ROC (classificador de digitos)', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
# Use the axes that already hold the curve: on current Matplotlib a bare
# plt.axes() call adds a new, empty axes instead of returning the existing one.
plt.gca().set_aspect('equal')
plt.show()

#### 30 - Curvas ROC

In [None]:
from matplotlib import cm

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
# Fit one SVC per gamma value and overlay their ROC curves on the same figure.
for g in [0.01, 0.1, 0.20, 1]:
    svm = SVC(gamma=g).fit(X_train, y_train)
    y_score_svm = svm.decision_function(X_test)
    fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)
    roc_auc_svm = auc(fpr_svm, tpr_svm)
    accuracy_svm = svm.score(X_test, y_test)
    # Print accuracy next to AUC so the two measures can be compared per gamma.
    print("gamma = {:.2f}  accuracy = {:.2f}   AUC = {:.2f}".format(g, accuracy_svm,
                                                                    roc_auc_svm))
    plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7,
             label='SVM (gamma = {:0.2f}, area = {:0.2f})'.format(g, roc_auc_svm))

plt.xlabel('Taxa FP', fontsize=16)
plt.ylabel('Taxa TP (Recall)', fontsize=16)
# Thin dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--')
plt.legend(loc="lower right", fontsize=11)
plt.title('Curvas ROC', fontsize=16)
# Use the axes that already hold the curves: on current Matplotlib a bare
# plt.axes() call adds a new, empty axes instead of returning the existing one.
plt.gca().set_aspect('equal')

plt.show()