**оценка классификатора**


Евгений Борисов borisov.e@solarl.ru

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

---

In [None]:
# Load the wine table; the file carries no header row.
data = pd.read_csv('../data/uci/wine.csv.gz', header=None)
print(len(data))

# Name the columns: the class label first, then the thirteen measurements.
data.columns = ['class'] + [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Show two random rows as a quick sanity check of the column assignment.
data.sample(2)

In [None]:
# Rows per original class, counted on the non-null 'Alcohol' values.
data.groupby('class')[['Alcohol']].count()

In [None]:
# Binary target: classes 1 and 3 map to 1, class 2 maps to 0.
data['y'] = data['class'].map({1:1,2:0,3:1}) 

In [None]:
# Rows per binary target value, counted on the non-null 'Alcohol' values.
data.groupby('y')[['Alcohol']].count()

---

In [None]:
# Feature matrix: the thirteen measurement columns as a plain array.
X = data.loc[:, [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]].values

# Target vector taken from the binary 'y' column.
y = data.loc[:, 'y'].values

X.shape, y.shape

---

In [None]:
# Hold out 80% of the rows for evaluation and fit on the remaining 20%.
# NOTE(review): test_size=0.8 leaves only a small training set - confirm this
# is intentional and not a swapped train/test fraction.
# random_state pins the shuffle so the metrics computed below are the same on
# every run; stratify=y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=42, stratify=y,
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Remove the full dataset objects from the namespace; the cells shown below
# only use the train/test arrays.
%xdel X
%xdel y
%xdel data

---

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: the tree permutes features at each split, so without it
# tied splits can produce a different tree (and different metrics) per run.
clf = DecisionTreeClassifier(random_state=42)

# Alternative model to try instead of the tree:
# from sklearn.neighbors import KNeighborsClassifier
# clf = KNeighborsClassifier(metric='euclidean')

clf.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows.
r = clf.predict(X_test)


---

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=r)

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics on the test rows.
print(classification_report(y_true=y_test, y_pred=r))

In [None]:
from sklearn.metrics import confusion_matrix

# Counts of (true label, predicted label) pairs on the test rows.
cm = confusion_matrix(y_true=y_test, y_pred=r)
print(cm)

In [None]:
import itertools

# Draw the confusion matrix as a heat map with the count written in each cell.
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

plt.title('Confusion matrix')
# plt.colorbar()

classes = ['neg', 'pos']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Cells above half of the largest count get white text, the rest black,
# so the numbers stay readable against the colour map.
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
# Run tight_layout only after the axis labels exist; called earlier it
# computes the margins without them.
plt.tight_layout()
plt.show()

---

In [None]:
# Per-class scores for the held-out rows; column 1 is used for the ROC curve below.
# NOTE(review): an unpruned tree presumably yields scores of only 0 or 1 here,
# which would leave the ROC curve with very few points - confirm.
p = clf.predict_proba(X_test)

In [None]:
from sklearn.metrics import auc, roc_curve

# ROC points computed from column 1 of the predicted scores, then the area under them.
fpr, tpr, thresholds = roc_curve(y_test, p[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.grid(True)
plt.title('Receiver operating characteristic example')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

---