# Imports

In [1]:
import numpy as np
import sklearn.metrics
from sklearn import linear_model
from sklearn.datasets import load_breast_cancer

# Load Data

"Breast Cancer" is a tiny dataset for binary classification

In [2]:
# return_X_y=True makes the loader return a (data, target) pair instead of a Bunch.
# Per the scikit-learn dataset docs, label 0 = malignant and label 1 = benign,
# so the "positive" class (1) in the metrics further down is the benign one.
features, targets = load_breast_cancer(return_X_y=True)

In [3]:
# Summarise the feature matrix: header, dimensions, then the array itself
# (NumPy abbreviates large arrays with "..." when printing).
print('Features')
print('shape:', np.shape(features))
print('data:', features, sep='\n')

Features
shape: (569, 30)
data:
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]


In [4]:
# Summarise the label vector: header, dimensions, then the 0/1 labels.
print('Targets')
print('shape:', np.shape(targets))
print('data:', targets, sep='\n')

Targets
shape: (569,)
data:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0

# Model

Create a very simple logistic regression classifier.

In [5]:
# Logistic regression with the liblinear solver; all other hyper-parameters
# are left at their defaults.
model = linear_model.LogisticRegression(solver='liblinear')
# Fit on the full dataset (no train/test split is made in this notebook).
# fit() is the last expression, so the cell displays the fitted estimator.
model.fit(features, targets)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

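Predicted outputs (computed on the same data the model was trained on, so the metrics below describe training-set fit, not generalization)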

In [6]:
# Predict hard class labels (0/1) for the same samples the model was fitted on.
predictions = model.predict(features)
print(predictions)

[0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1
 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 0 0 

# Metrics

**Confusion Matrix**

```python
'      Confusion matrix layout'
'          PREDICTED LABEL'
'               0     1  '
'TRUE  0'  [[  TN    FP  ]
'LABEL 1'   [  FN    TP  ]]
```

Confusion matrix in sklearn

In [7]:
# Reference result from scikit-learn: rows are true labels, columns are predictions.
cm = sklearn.metrics.confusion_matrix(y_true=targets, y_pred=predictions)
print('Confusion Matrix:', cm, sep='\n')

Confusion Matrix:
[[198  14]
 [  9 348]]


Confusion matrix in pure numpy

In [8]:
def confusion_matrix(y_true, y_pred, result=None, nb_classes=None, norm='none'):
    """Compute a confusion matrix (rows = true class, columns = predicted class).

    Only ``.shape``, ``.max()``, ``==``, boolean-mask indexing and ``.sum()``
    are used on the inputs, so NumPy arrays work; the function is intended to
    accept PyTorch tensors as well (not verified here).

    Args:
        y_true: array of integer class labels (ground truth).
        y_pred: array of predicted integer class labels, same shape as ``y_true``.
        result: optional pre-allocated ``(nb_classes, nb_classes)`` array. When
            given, its cells are overwritten in place (not accumulated) and it
            is used instead of allocating a new matrix.
        nb_classes: number of classes. When ``None`` it is inferred as the
            largest label found in ``y_true``/``y_pred`` plus one.
        norm: ``'none'`` returns raw counts, ``'row'`` divides each row by its
            sum (rows sum to 1), ``'col'`` divides each column by its sum
            (columns sum to 1). A row/column whose sum is zero yields NaN.

    Returns:
        The count matrix for ``norm='none'`` (``result`` itself if it was
        supplied), otherwise a new normalised float matrix.

    Raises:
        AssertionError: if ``y_true`` and ``y_pred`` differ in shape.
        ValueError: if ``norm`` is not one of ``'none'``, ``'row'``, ``'col'``.
    """
    assert y_true.shape == y_pred.shape

    if nb_classes is None:
        nb_classes = int(max(y_true.max(), y_pred.max())) + 1

    if result is None:
        # np.int64 rather than np.long: the np.long alias was removed in
        # NumPy 1.24, which made the original line raise AttributeError.
        counts = np.zeros((nb_classes, nb_classes), dtype=np.int64)
    else:
        counts = result

    for true_class_idx in range(nb_classes):
        # Predictions made for the samples whose true label is this class.
        y_pred_for_class = y_pred[y_true == true_class_idx]
        for pred_class_idx in range(nb_classes):
            counts[true_class_idx, pred_class_idx] = (
                y_pred_for_class == pred_class_idx
            ).sum()

    if norm == 'none':
        return counts  # raw counts
    elif norm == 'row':
        return counts / counts.sum(axis=1, keepdims=True)  # rows sum to 1
    elif norm == 'col':
        return counts / counts.sum(axis=0, keepdims=True)  # cols sum to 1
    else:
        raise ValueError('norm must be "none", "row" or "col"')

In [9]:
# Same inputs as the scikit-learn call, now through the pure-NumPy
# confusion_matrix function defined in this notebook.
cm = confusion_matrix(y_true=targets, y_pred=predictions)
print(cm)

[[198  14]
 [  9 348]]


Confusion matrix manually for 2-class problem

In [12]:
# Split the predictions by ground-truth label, then count each outcome.
pred_for_neg = predictions[targets == 0]  # predictions where the true label is 0
pred_for_pos = predictions[targets == 1]  # predictions where the true label is 1
TN = (pred_for_neg == 0).sum()
FP = (pred_for_neg == 1).sum()
FN = (pred_for_pos == 0).sum()
TP = (pred_for_pos == 1).sum()
# Row-major fill gives the [[TN, FP], [FN, TP]] layout.
cm = np.array([TN, FP, FN, TP]).reshape(2, 2)
print(cm)

[[198  14]
 [  9 348]]


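Row-normalized confusion matrix: each row (true class) is divided by its total, so entry (i, j) is the fraction of class-i samples predicted as class j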

In [13]:
# Divide every row by its own total (keepdims keeps the sums as a column
# vector so they broadcast across the row).
cm_true = np.divide(cm, cm.sum(axis=1, keepdims=True))
print(cm_true)

[[0.93396226 0.06603774]
 [0.02521008 0.97478992]]


Per-class recall: the diagonal of the row-normalized matrix is the fraction of each true class that was predicted correctly

In [14]:
# Diagonal entries = share of each true class that was classified correctly.
np.diagonal(cm_true)

array([0.93396226, 0.97478992])

**Precision and Recall**

In sklearn

In [15]:
# Precision and recall use scikit-learn's default positive label (1).
print('Accuracy:  ', sklearn.metrics.accuracy_score(y_true=targets, y_pred=predictions))
print('Precision: ', sklearn.metrics.precision_score(y_true=targets, y_pred=predictions))
print('Recall:    ', sklearn.metrics.recall_score(y_true=targets, y_pred=predictions))

Accuracy:   0.9595782073813708
Precision:  0.9613259668508287
Recall:     0.9747899159663865


In numpy

In [16]:
# each cm row is actual class
assert cm.shape == (2, 2)
(TN, FP) = cm[0]
(FN, TP) = cm[1]

print('Accuracy: ', (TP+TN) / np.sum(cm))
print('Precision:', TP / (TP+FP))
print('Recall:   ', TP / (TP+FN))

Accuracy:  0.9595782073813708
Precision: 0.9613259668508287
Recall:    0.9747899159663865


And manually from confusion matrix

In [17]:
# Correct predictions sit on the diagonal: accuracy = trace / total count.
print('Accuracy: ', np.trace(cm) / np.sum(cm))

Accuracy:  0.9595782073813708
