# Classifiers evaluation

http://scikit-learn.org/stable/modules/multiclass.html

In [1]:
from scipy.io import arff
import pandas as pd
import itertools

%pylab inline

Populating the interactive namespace from numpy and matplotlib


### ARFF file loading

In [2]:
import os

# Location of the iris ARFF sample. The default is the original Weka 3.8.1
# install path; set the IRIS_ARFF_PATH environment variable to point at a copy
# elsewhere instead of editing the notebook.
iris_arff_path = os.environ.get('IRIS_ARFF_PATH',
                                '/Applications/weka-3-8-1/data/iris.arff')
if not os.path.isfile(iris_arff_path):
    # Fail with the same exception type as before, but say how to fix it.
    raise FileNotFoundError(
        "iris.arff not found at '{}'; set IRIS_ARFF_PATH to its location".format(iris_arff_path))

# Only the first element of loadarff's result (the records) is used below.
arff_data = arff.loadarff(iris_arff_path)
df = pd.DataFrame(arff_data[0])
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Applications/weka-3-8-1/data/iris.arff'

### Sample conversion

In [None]:
# Separate the class column (prediction target) from the feature columns.
# Labels loaded from ARFF typically arrive as bytes on Python 3, and
# astype('str') would turn b'Iris-setosa' into the literal text
# "b'Iris-setosa'". Bytes are therefore decoded explicitly; any other value
# falls back to str(), which is what astype('str') did before.
target = df['class'].map(
    lambda label: label.decode('utf-8') if isinstance(label, bytes) else str(label))
data = df.drop(columns=['class'])

### Confusion matrix plot function

http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
 

In [None]:
def plot_confusion_matrix(matrix, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Print a one-line banner and draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    matrix : 2-D numpy array
        Confusion matrix; rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence
        Tick labels, given in the same order as the rows/columns of `matrix`.
    normalize : bool
        When True, every row is divided by its sum before plotting and cells
        are annotated with two decimals.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap passed to `imshow`.

    The plot is drawn on the current pyplot figure; nothing is returned.
    """
    if normalize:
        row_totals = matrix.sum(axis=1)[:, np.newaxis].astype('float')
        # A row without samples would divide by zero and fill the row with NaN;
        # dividing by 1 instead leaves such a row at 0.
        row_totals[row_totals == 0] = 1.0
        matrix = matrix.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; anything else (normalized or an already
    # floating-point matrix) gets two decimals instead of failing on 'd'.
    fmt = 'd' if matrix.dtype.kind in 'iu' else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = matrix.max() / 2.
    for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
        plt.text(j, i, format(matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if matrix[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Baseline

http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics


In [None]:
from sklearn import dummy, metrics

# Majority-class baseline: fitted and scored on the full sample, it shows the
# accuracy obtained by always predicting the most frequent class.
baseline = dummy.DummyClassifier(strategy='most_frequent')
baseline.fit(data, target)
base_predictions = baseline.predict(data)

accuracy = metrics.accuracy_score(target, base_predictions)
print("Accuracy = {:.3f}".format(accuracy))

print(metrics.classification_report(target, base_predictions))

# confusion_matrix orders rows/columns by sorted label, while Series.unique()
# returns labels in order of appearance. Build the sorted label list once and
# hand it to both the matrix and the plot so ticks always match the cells.
baseline_class_names = sorted(set(target) | set(base_predictions))
plot_confusion_matrix(
    metrics.confusion_matrix(target, base_predictions, labels=baseline_class_names),
    classes=baseline_class_names,
    title='Iris most frequent baseline')


# Train-test splitting

http://scikit-learn.org/stable/modules/cross_validation.html

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split


In [None]:
from sklearn import model_selection as ms

# Hold out 30% of the instances for testing. random_state pins the shuffle so
# that a fresh "Restart & Run All" reproduces the same split and therefore the
# same metrics below (1 matches the seed used for the tree models).
train_data, test_data, train_labels, test_labels = ms.train_test_split(
    data, target, test_size=0.3, random_state=1)
print('Train data size: {} instances \nTest data size: {} instances'.format(len(train_data), len(test_data)))

### Decision tree training

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
from sklearn import tree

# Train the decision tree on the training part of the split. fit() hands back
# the estimator itself, so construction and fitting are chained; the bare name
# on the last line keeps the fitted model as the cell output.
iris_split_tree_model = tree.DecisionTreeClassifier(
    min_samples_leaf=2,
    random_state=1,
).fit(train_data, train_labels)
iris_split_tree_model

In [None]:
def print_tree(tree):
    """
    Print the depth of a fitted decision tree followed by one line per node.

    `tree` must expose a `tree_` attribute providing `max_depth`, `node_count`,
    `children_left`, `children_right`, `feature` and `threshold`. Inside this
    function the parameter name hides the `tree` module imported from sklearn.

    Each node line is indented with one tab per depth level. A node whose left
    and right child ids are equal is reported as a leaf; any other node is
    reported as a test on `X[:, feature] <= threshold`. Nothing is returned.
    """
    print('Tree depth: {} \n'.format(tree.tree_.max_depth))
    n_nodes = tree.tree_.node_count
    children_left = tree.tree_.children_left
    children_right = tree.tree_.children_right
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold

    # The tree structure can be traversed to compute various properties such
    # as the depth of each node and whether or not it is a leaf.
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        # If we have a test node, both children still have to be visited
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    print("The binary tree structure has %s nodes and has "
          "the following tree structure:"
          % n_nodes)
    for i in range(n_nodes):
        # One tab per level; int() turns the numpy integer into a plain int
        # before the string repetition.
        indent = int(node_depth[i]) * "\t"
        if is_leaves[i]:
            print("%snode=%s leaf node." % (indent, i))
        else:
            print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
                  "node %s."
                  % (indent,
                     i,
                     children_left[i],
                     feature[i],
                     threshold[i],
                     children_right[i],
                     ))

In [None]:
# Show the node-by-node structure of the tree fitted on the training split.
print_tree(iris_split_tree_model)

### Evaluate train-test tree classifier


In [None]:
# Score the tree on the held-out test part of the split.
split_tree_predictions = iris_split_tree_model.predict(test_data)

split_tree_accuracy = metrics.accuracy_score(test_labels, split_tree_predictions)

print("Accuracy = {:.3f}".format(split_tree_accuracy))

print(metrics.classification_report(test_labels, split_tree_predictions))

# confusion_matrix orders rows/columns by sorted label, whereas
# test_labels.unique() follows order of appearance in the shuffled split, so
# the tick labels could be attached to the wrong rows. Use one sorted label
# list (everything seen in truth or prediction) for both matrix and ticks.
split_tree_class_names = sorted(set(test_labels) | set(split_tree_predictions))
plot_confusion_matrix(
    metrics.confusion_matrix(test_labels, split_tree_predictions, labels=split_tree_class_names),
    classes=split_tree_class_names,
    title='Iris test train split tree')


# Cross validation

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate

http://scikit-learn.org/stable/modules/model_evaluation.html#multimetric-scoring



### Use scoring 

In [None]:
folds = 3

iris_cv_tree_model = tree.DecisionTreeClassifier(min_samples_leaf=2, random_state=1)

# Accuracy needs no extra arguments; the per-class metrics are averaged with
# class-support weights.
acc_scorer = metrics.make_scorer(metrics.accuracy_score)
recall_scorer, prec_scorer, f1_scorer = (
    metrics.make_scorer(metric_fn, average='weighted')
    for metric_fn in (metrics.recall_score, metrics.precision_score, metrics.f1_score)
)

scoring = {
    'accuracy': acc_scorer,
    'recall': recall_scorer,
    'precision': prec_scorer,
    'f1': f1_scorer,
}

# Multi-metric cross-validation on the full sample; train scores are requested
# as well so they can be compared with the test scores fold by fold.
cv_tree_model_scores = ms.cross_validate(
    iris_cv_tree_model, data, target,
    scoring=scoring, cv=folds, return_train_score=True,
)

# Print every float inside the result arrays with three decimals.
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})

print('Evaluation results for {} folds'.format(folds))

for score_name in cv_tree_model_scores:
    print('{}: {}'.format(score_name, cv_tree_model_scores[score_name]))



### Get tree cross-validation predictions

In [None]:
iris_cv_predict_tree_model = tree.DecisionTreeClassifier(random_state=1, min_samples_leaf=2)

# Every instance is predicted by a model trained on the other folds. cv is
# passed explicitly so the fold count matches the scoring cell above and does
# not depend on the library default (3 in older scikit-learn releases, 5 since
# 0.22).
cv_tree_predictions = ms.cross_val_predict(iris_cv_predict_tree_model, data, target, cv=folds)

cv_tree_accuracy = metrics.accuracy_score(target, cv_tree_predictions)
print("Accuracy = {:.3f}".format(cv_tree_accuracy))

print(metrics.classification_report(target, cv_tree_predictions))

# The tick labels must describe the data being evaluated (target), not the
# unrelated test_labels split used previously, and must follow the sorted label
# order that confusion_matrix uses for its rows/columns.
cv_tree_class_names = sorted(set(target) | set(cv_tree_predictions))
plot_confusion_matrix(
    metrics.confusion_matrix(target, cv_tree_predictions, labels=cv_tree_class_names),
    classes=cv_tree_class_names,
    title='Iris cross-validation tree')

# Grid search for Support vector machine

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

http://scikit-learn.org/stable/modules/svm.html

http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [None]:
# Fresh 80/20 split for the grid search. random_state pins the shuffle so the
# selected hyper-parameters and the test metrics are reproducible across runs.
train_data, test_data, train_labels, test_labels = ms.train_test_split(
    data, target, test_size=0.2, random_state=1)

In [None]:
from sklearn import svm

folds = 3

acc_scorer = metrics.make_scorer(metrics.accuracy_score)

# Two sub-grids: the RBF kernel is tuned over gamma and C, the linear kernel
# over C only.
hyper_params_svc = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

classifier_svc = svm.SVC()

svm_grid = ms.GridSearchCV(
    estimator=classifier_svc,
    param_grid=hyper_params_svc,            # candidates compared via cross-validation
    scoring=acc_scorer,
    cv=ms.StratifiedKFold(n_splits=folds),
    refit=True,                             # retrain the best combination on all training data
)

# fit() returns the search object itself, so the "best model" name is simply
# another reference to the fitted grid search.
svm_grid.fit(train_data, train_labels)
svm_grid_best_model = svm_grid

print("Best hyper-parameters for accuracy:")
print(svm_grid_best_model.best_params_)


In [None]:
# Score the refitted best SVC on the held-out 20% test part.
predictions_svm = svm_grid_best_model.predict(test_data)

svm_grid_best_model_accuracy = metrics.accuracy_score(test_labels, predictions_svm)
print("Accuracy = {:.3f}".format(svm_grid_best_model_accuracy))

print(metrics.classification_report(test_labels, predictions_svm))

# confusion_matrix orders rows/columns by sorted label, whereas
# test_labels.unique() follows order of appearance in the shuffled split. Use
# one sorted label list for both the matrix and the tick labels.
svm_class_names = sorted(set(test_labels) | set(predictions_svm))
plot_confusion_matrix(
    metrics.confusion_matrix(test_labels, predictions_svm, labels=svm_class_names),
    classes=svm_class_names,
    title='Iris best hyper-params SVC')
