# Practical session ML: Classification

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn import datasets, neighbors, metrics, tree

def class_accs(predicted_labels, test_labels):
    """Compute the per-class accuracy of binary predictions.

    Parameters
    ----------
    predicted_labels : array-like
        Predicted label for every test instance (list, tuple or NumPy array).
    test_labels : array-like
        True label for every test instance. A label equal to 0 counts as
        class 0; every other label is treated as class 1.

    Returns
    -------
    tuple of float
        ``(accuracy0, accuracy1)``: the fraction of true class-0 instances
        predicted as 0, and the fraction of the remaining instances
        predicted as 1. A class without any test instance yields ``nan``.

    Raises
    ------
    ValueError
        If the two label sequences do not have the same shape.
    """
    # Converting up front lets plain Python lists be masked like arrays;
    # indexing a list with a list of positions would raise TypeError.
    predicted = np.asarray(predicted_labels)
    actual = np.asarray(test_labels)
    if predicted.shape != actual.shape:
        raise ValueError(
            'predicted_labels and test_labels must have the same shape, '
            'got {} and {}'.format(predicted.shape, actual.shape))

    is_class0 = actual == 0

    def _accuracy(mask, label):
        # Fraction of the masked predictions that equal the expected label.
        if not mask.any():
            # No instance of this class: the accuracy is undefined.
            return float('nan')
        return float(np.mean(predicted[mask] == label))

    return _accuracy(is_class0, 0), _accuracy(~is_class0, 1)

def print_metrics(predicted_labels, test_labels):
    """Print F1, overall accuracy and per-class accuracy for predictions.

    The F1 score and the overall accuracy come from ``metrics.f1_score`` and
    ``metrics.accuracy_score`` (called with their default arguments); the
    per-class accuracies come from ``class_accs``. Nothing is returned: the
    report is written to standard output, followed by an empty line.
    """
    overall_f1 = metrics.f1_score(test_labels, predicted_labels)
    overall_accuracy = metrics.accuracy_score(test_labels, predicted_labels)
    class0_accuracy, class1_accuracy = class_accs(predicted_labels, test_labels)

    # One (template, value) pair per report line, in output order.
    report = (
        ('\tF1 = {}', overall_f1),
        ('\tAccuracy = {}', overall_accuracy),
        ('\t\tclass 0: {}', class0_accuracy),
        ('\t\tclass 1: {}', class1_accuracy),
    )
    for template, value in report:
        print(template.format(value))
    print()

Load the Wisconsin breast cancer data set.

In [2]:
# load the Wisconsin breast cancer data set
wisconsin = datasets.load_breast_cancer()

# explore the data set: size of the feature matrix and the class names
n_instances, n_features = wisconsin['data'].shape
print('Data set contains {} instances with {} features.'.format(n_instances, n_features))
print('The different classes are {}.'.format(wisconsin['target_names']))

# number of samples per class (target value 0 and target value 1)
count0 = sum(1 for target in wisconsin['target'] if target == 0)
count1 = sum(1 for target in wisconsin['target'] if target == 1)
print('{} contains {} samples, {} contains {}.'.format(
    wisconsin['target_names'][0], count0,
    wisconsin['target_names'][1], count1))

Data set contains 569 instances with 30 features.
The different classes are ['malignant' 'benign'].
malignant contains 212 samples, benign contains 357.


As an example, we will train a decision tree classifier on this data set:

In [3]:
# Decision tree with default settings, fitted on the complete data set.
# fit() stays the last expression so the notebook displays the estimator.
dt = tree.DecisionTreeClassifier()
dt.fit(X=wisconsin['data'], y=wisconsin['target'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

## Illustration

Divide the data set into five folds for cross-validation. Use stratified CV, meaning that the original class distribution is preserved in each fold.

In [4]:
# split into classes: target 0 goes to malignant, everything else to benign
malignant = [features
             for features, target in zip(wisconsin['data'], wisconsin['target'])
             if target == 0]
benign = [features
          for features, target in zip(wisconsin['data'], wisconsin['target'])
          if not target == 0]

# fixed seed so the shuffled order is the same on every run
random.seed(1)
for class_samples in (malignant, benign):
    random.shuffle(class_samples)

In [5]:
# try k-NN classification with k-fold CV
folds = 5
k = 3
bal_accs = []
for i in range(folds):
    # Fold boundaries are computed separately per class, so every fold holds
    # the same share of each class (stratification).
    start_malignant, end_malignant = (j * len(malignant) // folds for j in (i, i + 1))
    start_benign, end_benign = (j * len(benign) // folds for j in (i, i + 1))

    # held-out slice of each class: the test data of this fold
    test_malignant = malignant[start_malignant:end_malignant]
    test_benign = benign[start_benign:end_benign]
    test_set = test_malignant + test_benign
    test_labels = [0] * len(test_malignant) + [1] * len(test_benign)

    # everything outside the held-out slice: the training data of this fold
    training_malignant = malignant[:start_malignant] + malignant[end_malignant:]
    training_benign = benign[:start_benign] + benign[end_benign:]
    training_set = training_malignant + training_benign
    training_labels = [0] * len(training_malignant) + [1] * len(training_benign)

    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn.fit(training_set, training_labels)
    predicted_labels = knn.predict(test_set)

    # balanced accuracy of the fold = mean of the two per-class accuracies
    acc0, acc1 = class_accs(predicted_labels, test_labels)
    bal_accs.append((acc0 + acc1) / 2.)
print('Balanced accuracy: {}'.format(np.mean(bal_accs)))

Balanced accuracy: 0.9129005297937496


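Other classifiers are created in the same way. For a decision tree we can use the following code (an SVM would be created analogously, e.g. with `sklearn.svm.SVC`, which is not imported above):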

In [6]:
# create a decision tree (unfitted, constructed without any arguments)
dt = tree.DecisionTreeClassifier()
# Template for training it; left commented out so the cell runs on its own.
# training_set and training_labels must be defined before uncommenting.
# dt.fit(training_set, training_labels)



## Exercise 1

Compare the performance of kNN and Decision Trees on the Wisconsin data set. Vary the internal parameters (number of nearest neighbors, impurity measure, etc.). Consult the [API documentation](http://scikit-learn.org/stable/modules/classes.html) for more details.