In [1]:
import pickle
import pandas as pd
from utils import *

# Model Metrics

In [29]:
def calculate_metrics(file, true_labels, to_glob=False):
    """
    Print evaluation metrics for one results file, or for every results file
    found in a directory.

    Printed per results file: macro-average ROC AUC, per-class ROC AUC, the
    classification report and the confusion matrix.

    :param file: path to a results file, or, when ``to_glob`` is True, a
        directory that is searched for files matching ``*results.json``
    :param true_labels: ground truth labels; assumed to be ordered the same
        way as the predictions returned by ``get_predictions`` (TODO confirm)
    :param to_glob: if True, treat ``file`` as a directory and report on every
        ``*results.json`` file directly inside it
    :return: None; all metrics are printed
    """
    if to_glob:
        # glob order depends on the filesystem; sort so reports are reproducible.
        run_names = sorted(glob.glob(os.path.join(file, '*results.json')))
    else:
        run_names = [file]

    # A distinct loop name avoids clobbering the ``file`` argument.
    for run_file in run_names:
        all_pred_classes, all_pred_probs = get_predictions(run_file)
        # Predictions and labels may differ in length; score only the common prefix.
        num_videos = min(len(all_pred_classes), len(true_labels))
        labels = true_labels[:num_videos]
        pred_classes = all_pred_classes[:num_videos]
        pred_probs = all_pred_probs[:num_videos]

        # Use the macro average computed by calculate_auc instead of
        # re-deriving it here with a hard-coded class count.
        macro_avg_auc, roc_aucs = calculate_auc(pred_probs, labels)
        print("\nfile", run_file)
        print("macro average roc_auc:", macro_avg_auc)
        print("roc_auc per class:", np.array(roc_aucs))
        print("\n", classification_report(labels, pred_classes))
        print("\n", confusion_matrix(labels, pred_classes))
        
def calculate_auc(pred_probs, labels, num_classes=4):
    """
    Calculate per-class and macro-average ROC AUC from clip-level predictions.

    Every clip of a video is scored against that video's label, so a video
    with k clips contributes k samples to each per-class ROC curve.

    :param pred_probs: one entry per video, each an iterable of per-clip
        class-probability vectors with at least ``num_classes`` entries
    :param labels: integer ground-truth class per video, indexable by the
        same positions as ``pred_probs``
    :param num_classes: number of prediction classes
    :return: tuple ``(macro_avg_auc, roc_aucs)``; ``roc_aucs`` is the list of
        per-class AUCs ordered by class index and ``macro_avg_auc`` is their
        unweighted mean
    """
    clip_labels = []
    clip_probs = []
    for video_idx in range(len(pred_probs)):
        video_label = labels[video_idx]
        # Repeat the video-level label once per clip prediction.
        for clip_prob in pred_probs[video_idx]:
            clip_labels.append(video_label)
            clip_probs.append(clip_prob)
    clip_probs = np.array(clip_probs)
    # Row c of the identity matrix is the one-hot encoding of class c.
    one_hot_true_labels = np.eye(num_classes)[clip_labels]

    # roc_curve / auc are provided by the star import from utils
    # (presumably sklearn.metrics -- verify against utils).
    roc_aucs = []
    for class_idx in range(num_classes):
        fpr, tpr, _ = roc_curve(one_hot_true_labels[:, class_idx],
                                clip_probs[:, class_idx])
        roc_aucs.append(auc(fpr, tpr))

    # Divide by the actual class count; a hard-coded 4 gives a wrong macro
    # average whenever num_classes differs from the default.
    return sum(roc_aucs) / num_classes, roc_aucs

In [28]:
# Example of results for all classes (with real dataset).
# Ground-truth labels are read from the 'Label' column of the labels CSV.
exam_df = pd.read_csv('data/gait_labels.csv')
y = exam_df['Label'].tolist()
# to_glob=True: report on every '*results.json' file directly inside jobs/default.
calculate_metrics('jobs/default', y, to_glob=True)


file jobs/default/results.json
macro average roc_auc: 0.827447540233394
roc_auc per class: [0.94863137 0.85840494 0.67765027 0.82510358]

               precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       0.80      0.80      0.80         5
           2       0.67      0.50      0.57         4
           3       0.75      0.75      0.75         4

    accuracy                           0.78        18
   macro avg       0.76      0.76      0.76        18
weighted avg       0.77      0.78      0.77        18


 [[5 0 0 0]
 [1 4 0 0]
 [0 1 2 1]
 [0 0 1 3]]
