<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Sample_wise_and_Class_wise_Accuracy_for_Multilabel_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## import libraries

import torch
import numpy as np
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, hamming_loss

Ref: https://mmuratarat.github.io/2020-01-25/multilabel_classification_metrics

# Sample Wise Accuracy 


# 1. Subset Accuracy / Exact Match Ratio: 
The set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.

In [None]:
## Example 1: a flat vector of 8 logits whose hard predictions all match gt

# Logits -> sigmoid probabilities -> hard 0/1 predictions.
pred_logits = torch.tensor([1.0, 0.3976, -1.0, 1.0, -1.0, -1.0, -1.0, -0.913])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
gt = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# Convert the prediction tensor to a NumPy array before scoring it.
acc = accuracy_score(y_true=gt, y_pred=pred.detach().cpu().numpy())
print('Accuracy: ', acc)

pred_logits:  tensor([ 1.0000,  0.3976, -1.0000,  1.0000, -1.0000, -1.0000, -1.0000, -0.9130])
pred_sigmoid:  tensor([0.7311, 0.5981, 0.2689, 0.7311, 0.2689, 0.2689, 0.2689, 0.2864])
pred:  tensor([1., 1., 0., 1., 0., 0., 0., 0.])
gt:  [1. 1. 0. 1. 0. 0. 0. 0.]
Accuracy:  1.0


In [None]:
## Example 2: same flat logits, but gt now disagrees with pred in two positions

# Logits -> sigmoid probabilities -> hard 0/1 predictions.
pred_logits = torch.tensor([1.0, 0.3976, -1.0, 1.0, -1.0, -1.0, -1.0, -0.913])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
# Ground truth adjusted so that only 6 of the 8 entries agree with pred.
gt = np.array([1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# Convert the prediction tensor to a NumPy array before scoring it.
acc = accuracy_score(y_true=gt, y_pred=pred.detach().cpu().numpy())
print('Accuracy: ', acc)

pred_logits:  tensor([ 1.0000,  0.3976, -1.0000,  1.0000, -1.0000, -1.0000, -1.0000, -0.9130])
pred_sigmoid:  tensor([0.7311, 0.5981, 0.2689, 0.7311, 0.2689, 0.2689, 0.2689, 0.2864])
pred:  tensor([1., 1., 0., 1., 0., 0., 0., 0.])
gt:  [1. 0. 0. 1. 0. 0. 1. 0.]
Accuracy:  0.75


In [None]:
## Example 3 (1 sample with 8 classes prediction)

# Same values as the flat example, but wrapped in an extra dimension: shape (1, 8),
# i.e. one sample carrying 8 labels.
pred_logits = torch.tensor([[1.0, 0.3976, -1.0, 1.0, -1.0, -1.0, -1.0, -0.913]])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
# Ground truth adjusted so that 6 of the 8 labels are predicted correctly.
gt = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# For 2-D label-indicator input accuracy_score reports subset accuracy: the single
# sample is not an exact match, so the result is 0.
acc = accuracy_score(y_true=gt, y_pred=pred.detach().cpu().numpy())
print('Accuracy: ', acc)

pred_logits:  tensor([[ 1.0000,  0.3976, -1.0000,  1.0000, -1.0000, -1.0000, -1.0000, -0.9130]])
pred_sigmoid:  tensor([[0.7311, 0.5981, 0.2689, 0.7311, 0.2689, 0.2689, 0.2689, 0.2864]])
pred:  tensor([[1., 1., 0., 1., 0., 0., 0., 0.]])
gt:  [[1. 0. 0. 1. 0. 0. 1. 0.]]
Accuracy:  0.0


In [None]:
## Example 4 (Multilabel: 3 samples, 4 classes)

# One row per sample, one column per class.
pred_logits = torch.tensor([
    [1.0, 0.3976, -1.0, -0.913],
    [1.0, 0.3976, -1.0, 1.0],
    [-1.0, 1.0, -1.0, -1.0],
])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
# Ground truth identical to the hard predictions for every sample.
gt = np.array([
    [1.0, 1.0, 0.0, 0.0],
    [1.0, 1.0, 0.0, 1.0],
    [0.0, 1.0, 0.0, 0.0],
])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# Convert the prediction tensor to a NumPy array before scoring it.
acc = accuracy_score(y_true=gt, y_pred=pred.detach().cpu().numpy())
print('Accuracy: ', acc)

pred_logits:  tensor([[ 1.0000,  0.3976, -1.0000, -0.9130],
        [ 1.0000,  0.3976, -1.0000,  1.0000],
        [-1.0000,  1.0000, -1.0000, -1.0000]])
pred_sigmoid:  tensor([[0.7311, 0.5981, 0.2689, 0.2864],
        [0.7311, 0.5981, 0.2689, 0.7311],
        [0.2689, 0.7311, 0.2689, 0.2689]])
pred:  tensor([[1., 1., 0., 0.],
        [1., 1., 0., 1.],
        [0., 1., 0., 0.]])
gt:  [[1. 1. 0. 0.]
 [1. 1. 0. 1.]
 [0. 1. 0. 0.]]
Accuracy:  1.0


In [None]:
## Example 5 (Multilabel: 3 samples, 4 classes)

# One row per sample, one column per class.
pred_logits = torch.tensor([
    [1.0, 0.3976, -1.0, -0.913],
    [1.0, 0.3976, -1.0, 1.0],
    [-1.0, 1.0, -1.0, -1.0],
])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
# First sample adjusted so that one of its labels is predicted wrongly;
# only 2 of the 3 samples are now exact matches.
gt = np.array([
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 0.0, 1.0],
    [0.0, 1.0, 0.0, 0.0],
])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# Convert the prediction tensor to a NumPy array before scoring it.
acc = accuracy_score(y_true=gt, y_pred=pred.detach().cpu().numpy())
print('Accuracy: ', acc)

pred_logits:  tensor([[ 1.0000,  0.3976, -1.0000, -0.9130],
        [ 1.0000,  0.3976, -1.0000,  1.0000],
        [-1.0000,  1.0000, -1.0000, -1.0000]])
pred_sigmoid:  tensor([[0.7311, 0.5981, 0.2689, 0.2864],
        [0.7311, 0.5981, 0.2689, 0.7311],
        [0.2689, 0.7311, 0.2689, 0.2689]])
pred:  tensor([[1., 1., 0., 0.],
        [1., 1., 0., 1.],
        [0., 1., 0., 0.]])
gt:  [[1. 0. 0. 0.]
 [1. 1. 0. 1.]
 [0. 1. 0. 0.]]
Accuracy:  0.6666666666666666


# 2. Example based / Instance based accuracy:

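Accuracy for each instance is defined as the proportion of correctly predicted labels to the total number of labels (predicted or actual) for that instance, i.e. $|Y_i \cap \hat{Y}_i| \, / \, |Y_i \cup \hat{Y}_i|$. Overall accuracy is the average across all instances. Note that the code below instead applies `accuracy_score` to each sample's label vector, which gives the fraction of label positions predicted correctly (true negatives included): for the first sample it reports 3/4 = 0.75, whereas the intersection-over-union definition would give 1/2.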

ref: https://www.researchgate.net/profile/Rachana-Buch/publication/327110772_A_Survey_on_Multi_Label_Classification/links/5bf56905299bf1124fe4aef2/A-Survey-on-Multi-Label-Classification.pdf

In [None]:
## Example 6 (Multilabel: 3 samples, 4 classes)

# One row per sample, one column per class.
pred_logits = torch.tensor([
    [1.0, 0.3976, -1.0, -0.913],
    [1.0, 0.3976, -1.0, 1.0],
    [-1.0, 1.0, -1.0, -1.0],
])
pred_sigmoid = pred_logits.sigmoid()
pred = pred_sigmoid.round()
# First sample adjusted so that one label is predicted wrongly;
# 11 of the 12 individual labels are still correct.
gt = np.array([
    [1.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 0.0, 1.0],
    [0.0, 1.0, 0.0, 0.0],
])

# Show every intermediate stage next to the ground truth.
for label, value in (
    ('pred_logits: ', pred_logits),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred),
    ('gt: ', gt),
):
    print(label, value)

# Score each sample on its own label vector, then average the per-sample scores.
acc_sample = [
    accuracy_score(gt_row, pred_row.detach().cpu().numpy())
    for gt_row, pred_row in zip(gt, pred)
]
print('Accuracy per sample:', acc_sample)
print('Accuracy: ', np.nanmean(acc_sample))

pred_logits:  tensor([[ 1.0000,  0.3976, -1.0000, -0.9130],
        [ 1.0000,  0.3976, -1.0000,  1.0000],
        [-1.0000,  1.0000, -1.0000, -1.0000]])
pred_sigmoid:  tensor([[0.7311, 0.5981, 0.2689, 0.2864],
        [0.7311, 0.5981, 0.2689, 0.7311],
        [0.2689, 0.7311, 0.2689, 0.2689]])
pred:  tensor([[1., 1., 0., 0.],
        [1., 1., 0., 1.],
        [0., 1., 0., 0.]])
gt:  [[1. 0. 0. 0.]
 [1. 1. 0. 1.]
 [0. 1. 0. 0.]]
Accuracy per sample: [0.75, 1.0, 1.0]
Accuracy:  0.9166666666666666


# Class-wise Accuracy

# 1. Class-wise confusion matrix: 
Compute class-wise (default) or sample-wise (samplewise=True) multilabel confusion matrix to evaluate the accuracy of a classification, and output confusion matrices for each class or sample.

In [None]:
# Class-wise accuracy from the per-class confusion matrices:
#   accuracy = (TP + TN) / (TP + TN + FP + FN)
gt = np.array([
    [1, 0, 1, 0, 1],
    [0, 1, 0, 1, 0],
    [1, 0, 0, 1, 0],
])
pred = np.array([
    [-1.32, 0.65, 1.12, 0.02, -1.03],
    [-1.01, -0.5, 0.0, 0.2, 0.04],
    [1.23, 0.4, 0.5, 0.8, -0.4],
])
# Logits -> probabilities -> 0/1 indicators (strictly above the threshold counts as 1).
pred_sigmoid = torch.sigmoid(torch.from_numpy(pred)).numpy()
threshold = 0.5
pred_indicator = (pred_sigmoid > threshold).astype(float)

for label, value in (
    ('pred_logits: ', pred),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred_indicator),
    ('gt: ', gt),
):
    print(label, value)

# samplewise=False -> one 2x2 confusion matrix per class.
matrix = multilabel_confusion_matrix(gt, pred_indicator, samplewise=False)
print('class_wise_confusion_matrix:', matrix)
cls_acc = []
for class_matrix in matrix:
    # Each 2x2 matrix is laid out as [[TN, FP], [FN, TP]].
    (TN, FP), (FN, TP) = class_matrix
    class_wise_accu = (TN + TP) / (TN + TP + FN + FP)
    cls_acc.append(class_wise_accu)
print('class_wise_acc:', cls_acc)
print('acc:', np.nanmean(cls_acc))

pred_logits:  [[-1.32  0.65  1.12  0.02 -1.03]
 [-1.01 -0.5   0.    0.2   0.04]
 [ 1.23  0.4   0.5   0.8  -0.4 ]]
pred_sigmoid:  [[0.21081829 0.65701046 0.75398872 0.50499983 0.2630841 ]
 [0.26697985 0.37754067 0.5        0.549834   0.50999867]
 [0.77381857 0.59868766 0.62245933 0.68997448 0.40131234]]
pred:  [[0. 1. 1. 1. 0.]
 [0. 0. 0. 1. 1.]
 [1. 1. 1. 1. 0.]]
gt:  [[1 0 1 0 1]
 [0 1 0 1 0]
 [1 0 0 1 0]]
class_wise_confusion_matrix: [[[1 0]
  [1 1]]

 [[0 2]
  [1 0]]

 [[1 1]
  [0 1]]

 [[0 1]
  [0 2]]

 [[1 1]
  [1 0]]]
class_wise_acc: [0.6666666666666666, 0.0, 0.6666666666666666, 0.6666666666666666, 0.3333333333333333]
acc: 0.4666666666666667


# Question: Is the overall class-wise accuracy equal to the overall sample-wise accuracy?

In [None]:
## Example 1: compare the mean sample-wise accuracy with the mean class-wise accuracy
gt = np.array([
    [1, 0, 1, 0, 1],
    [1, 1, 0, 0, 0],
    [1, 0, 1, 0, 0],
])
pred = np.array([
    [-1.32, 0.65, 1.12, 0.02, -1.03],
    [-1.01, -0.5, 0.0, 0.2, 0.04],
    [1.23, 0.4, 0.5, 0.8, -0.4],
])
# Logits -> probabilities -> 0/1 indicators (strictly above the threshold counts as 1).
pred_sigmoid = torch.sigmoid(torch.from_numpy(pred)).numpy()
threshold = 0.5
pred_indicator = (pred_sigmoid > threshold).astype(float)

print('#################### Sample-wise Accuracy ####################')
# One score per row: the fraction of that sample's labels predicted correctly.
acc_sample = [
    accuracy_score(gt_row, pred_row)
    for gt_row, pred_row in zip(gt, pred_indicator)
]
print('Accuracy per sample:', acc_sample)
print('Overall Accuracy (sample-wise): ', np.nanmean(acc_sample))

print('\n#################### Class-wise Accuracy ####################')
# samplewise=False -> one 2x2 confusion matrix per class.
matrix = multilabel_confusion_matrix(gt, pred_indicator, samplewise=False)
cls_acc = []
for class_matrix in matrix:
    # Each 2x2 matrix is laid out as [[TN, FP], [FN, TP]].
    (TN, FP), (FN, TP) = class_matrix
    class_wise_accu = (TN + TP) / (TN + TP + FN + FP)
    cls_acc.append(class_wise_accu)
# cls_acc holds one value per class, so it is labelled per class (it was
# previously printed under the misleading label 'Accuracy per sample:').
print('Accuracy per class:', cls_acc)
print('Overall Accuracy (class-wise):', np.nanmean(cls_acc))

#################### Sample-wise Accuracy ####################
Accuracy per sample: [0.2, 0.2, 0.6]
Overall Accuracy (sample-wise):  0.3333333333333333

#################### Class-wise Accuracy ####################
Accuracy per sample: [0.3333333333333333, 0.0, 1.0, 0.0, 0.3333333333333333]
Overall Accuracy (class-wise): 0.3333333333333333


In [None]:
## Example 2: same comparison with a different set of logits
gt = np.array([
    [1, 0, 1, 0, 1],
    [1, 1, 0, 0, 0],
    [1, 0, 1, 0, 0],
])
pred = np.array([
    [-1.32, 0.65, -1.12, 0.02, -1.03],
    [1.01, -0.5, 0.0, 1.56, -0.04],
    [1.23, -0.4, 0.5, 0.8, 0.4],
])
# Logits -> probabilities -> 0/1 indicators (strictly above the threshold counts as 1).
pred_sigmoid = torch.sigmoid(torch.from_numpy(pred)).numpy()
threshold = 0.5
pred_indicator = (pred_sigmoid > threshold).astype(float)

print('#################### Sample-wise Accuracy ####################')
# One score per row: the fraction of that sample's labels predicted correctly.
acc_sample = [
    accuracy_score(gt_row, pred_row)
    for gt_row, pred_row in zip(gt, pred_indicator)
]
print('Accuracy per sample:', acc_sample)
print('Overall Accuracy (sample-wise): ', np.nanmean(acc_sample))

print('\n#################### Class-wise Accuracy ####################')
# samplewise=False -> one 2x2 confusion matrix per class.
matrix = multilabel_confusion_matrix(gt, pred_indicator, samplewise=False)
cls_acc = []
for class_matrix in matrix:
    # Each 2x2 matrix is laid out as [[TN, FP], [FN, TP]].
    (TN, FP), (FN, TP) = class_matrix
    class_wise_accu = (TN + TP) / (TN + TP + FN + FP)
    cls_acc.append(class_wise_accu)
# cls_acc holds one value per class, so it is labelled per class (it was
# previously printed under the misleading label 'Accuracy per sample:').
print('Accuracy per class:', cls_acc)
print('Overall Accuracy (class-wise):', np.nanmean(cls_acc))

#################### Sample-wise Accuracy ####################
Accuracy per sample: [0.0, 0.6, 0.6]
Overall Accuracy (sample-wise):  0.39999999999999997

#################### Class-wise Accuracy ####################
Accuracy per sample: [0.6666666666666666, 0.3333333333333333, 0.6666666666666666, 0.0, 0.3333333333333333]
Overall Accuracy (class-wise): 0.39999999999999997


# Other evaluation metrics: Hamming score / Hamming loss
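The Hamming score is the fraction of individual labels that are predicted correctly out of all labels (samples × classes). It is therefore interchangeable with the label-wise accuracy averaged over samples or classes above, but not with the subset accuracy / exact match ratio.

The Hamming loss is the fraction of labels that are incorrectly predicted, i.e. Hamming loss = 1 − Hamming score.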

ref: https://wiki.hasty.ai/metrics/hamming-score#:~:text=Hamming%20Score%20is%20the%20fraction,in%20fact%20they%20are%20interchangeable.

In [None]:
# Ground-truth indicators and raw logits: 3 samples x 5 classes.
gt = np.array([
    [1, 0, 1, 0, 1],
    [1, 1, 0, 0, 0],
    [1, 0, 1, 0, 0],
])
pred = np.array([
    [-1.32, 0.65, -1.12, 0.02, -1.03],
    [1.01, -0.5, 0.0, 1.56, -0.04],
    [1.23, -0.4, 0.5, 0.8, 0.4],
])
# Logits -> probabilities -> 0/1 indicators (strictly above the threshold counts as 1).
pred_sigmoid = torch.sigmoid(torch.from_numpy(pred)).numpy()
threshold = 0.5
pred_indicator = (pred_sigmoid > threshold).astype(float)

for label, value in (
    ('pred_logits: ', pred),
    ('pred_sigmoid: ', pred_sigmoid),
    ('pred: ', pred_indicator),
    ('gt: ', gt),
):
    print(label, value)

# Fraction of (sample, class) entries where the prediction differs from gt.
print('Hamming Loss', hamming_loss(y_true=gt, y_pred=pred_indicator))

# Disabled alternative, kept for reference: the share of ground-truth positives
# (gt == 1) that were also predicted as 1.
# true_class_index = np.argwhere(gt==1)
# print('true_class_index', true_class_index) # the shape is (7, 2)

# correct = 0
# for i in range(true_class_index.shape[0]):
#   if pred_indicator[true_class_index[i][0], true_class_index[i][1]] == 1.0:
#     correct += 1
# print('correct', correct)
# print('total true classes', true_class_index.shape[0])
# acc = correct / true_class_index.shape[0]
# print('acc', acc)


pred_logits:  [[-1.32  0.65 -1.12  0.02 -1.03]
 [ 1.01 -0.5   0.    1.56 -0.04]
 [ 1.23 -0.4   0.5   0.8   0.4 ]]
pred_sigmoid:  [[0.21081829 0.65701046 0.24601128 0.50499983 0.2630841 ]
 [0.73302015 0.37754067 0.5        0.82635335 0.49000133]
 [0.77381857 0.40131234 0.62245933 0.68997448 0.59868766]]
pred:  [[0. 1. 0. 1. 0.]
 [1. 0. 0. 1. 0.]
 [1. 0. 1. 1. 1.]]
gt:  [[1 0 1 0 1]
 [1 1 0 0 0]
 [1 0 1 0 0]]
Hamming Loss 0.6
true_class_index [[0 0]
 [0 2]
 [0 4]
 [1 0]
 [1 1]
 [2 0]
 [2 2]]
correct 3
total true classes 7
acc 0.42857142857142855
