# Evaluation

In [108]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [109]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [110]:
# Load the iris dataset and pull out the feature matrix, the integer
# targets and the class names as separate variables.
iris = load_iris()
X = iris.data
y = iris.target
labels = iris.target_names

In [111]:
# Hold out a test split; the fixed seed keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Dummy classifier 

In [112]:
from sklearn.dummy import DummyClassifier

# Baseline strategies to compare real models against, as
# (printed label, DummyClassifier keyword arguments) pairs.
# The two strategies that draw random predictions are seeded so that
# re-running the cell reproduces the same numbers.
dummy_strategies = [
    # always predicts the most frequent label in the training set
    ("most frequent", dict(strategy='most_frequent')),
    # generates predictions by respecting the training set's class distribution
    ("stratified", dict(strategy='stratified', random_state=42)),
    # generates predictions uniformly at random
    ("uniform", dict(strategy='uniform', random_state=42)),
    # always predicts a constant label (here: class 2)
    ("constant", dict(strategy='constant', constant=2)),
]

# Fit each baseline on the training split and report its first 20 test
# predictions together with train/test accuracy.
for strategy_name, strategy_params in dummy_strategies:
    clf = DummyClassifier(**strategy_params).fit(X_train, y_train)
    print("###########################")
    print("Dummy, {}".format(strategy_name))
    print("Prediction: ", clf.predict(X_test)[:20])
    print("train accuracy= {:.3%}".format(clf.score(X_train, y_train)))
    print("test accuracy= {:.3%}".format(clf.score(X_test, y_test)))


###########################
Dummy, most frequent
Prediction:  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
train accuracy= 34.821%
test accuracy= 28.947%
###########################
Dummy, stratified
Prediction:  [1 0 2 1 0 0 1 2 2 2 1 1 0 2 0 2 2 0 2 0]
train accuracy= 36.607%
test accuracy= 31.579%
###########################
Dummy, uniform
Prediction:  [2 1 2 1 0 0 2 0 2 0 1 0 1 1 2 0 2 1 1 1]
train accuracy= 33.036%
test accuracy= 28.947%
###########################
Dummy, uniform
Prediction:  [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
train accuracy= 33.929%
test accuracy= 31.579%


### Metrics

In [113]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [115]:
# Entropy-based decision tree on the iris training split, evaluated on the test split.
max_depth = 4
clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=42)
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)

print ('accuracy = {:.2}'.format(accuracy_score(y_test, y_predicted)))
# average=None reports one score per class instead of a single aggregate.
for metric_label, metric_fn in (
    ('recall = ', recall_score),
    ('precision = ', precision_score),
    ('f1_score  = ', f1_score),
):
    print (metric_label, metric_fn(y_test, y_predicted, average=None))


accuracy = 1.0
recall =  [1. 1. 1.]
precision =  [1. 1. 1.]
f1_score  =  [1. 1. 1.]


In [134]:
from sklearn.datasets import load_breast_cancer

In [135]:
# Load scikit-learn's bundled breast-cancer dataset; its fields are unpacked in the next cell.
cancer = load_breast_cancer()

In [136]:
# Rebind the working variables to the breast-cancer data, then make a seeded
# train/test split of it.
X = cancer.data
y = cancer.target
labels = cancer.target_names
features = cancer.feature_names
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [141]:
# Depth-3 decision tree on the breast-cancer split. random_state is fixed
# because the tree's feature ordering / tie-breaking is randomized, so an
# unseeded fit can give slightly different scores on each run.
clf = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
# The metrics are called with their default averaging (no `average` argument),
# each yielding a single score that is printed with two significant digits.
print ('accuracy = {:.2}'.format(accuracy_score(y_test, y_predicted)))
print ('precision = {:.2}'.format(precision_score(y_test, y_predicted)))
print ('recall = {:.2}'.format(recall_score(y_test, y_predicted)))
print ('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))

accuracy = 0.95
precision = 0.96
recall = 0.97
f1_score  = 0.96


In [158]:
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier

In [159]:
# Load the wine dataset, rebind the working variables to it and make a
# seeded train/test split.
wine = load_wine()
X = wine.data
y = wine.target
labels = wine.target_names
features = wine.feature_names
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
# Last expression of the cell: display the class names.
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [163]:
# Random forest on the wine split. The forest is built from random bootstrap
# samples and feature subsets, so random_state is fixed to make the reported
# scores reproducible across runs.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print ('accuracy = {:.2}'.format(accuracy_score(y_test, y_predicted)))
# average='weighted' combines the per-class scores weighted by class support,
# giving a single number for this three-class problem.
print ('precision = ', precision_score(y_test, y_predicted, average='weighted'))
print ('recall = ', recall_score(y_test, y_predicted, average='weighted'))
print ('f1_score  = ', f1_score(y_test, y_predicted, average='weighted'))

accuracy = 0.98
precision =  0.9794871794871796
recall =  0.9777777777777777
f1_score  =  0.9779047619047618
