In [None]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
from sklearn.metrics import precision_score, recall_score, f1_score
from scipy import stats
import numpy as np
import math
import matplotlib.pyplot as plt

In [None]:
def select_labels_by_threshold(scores, threshold=0.3):
    """Return the label indices whose score reaches ``threshold``, best score first.

    Args:
        scores: iterable of per-label scores; the position of a score is its label index.
        threshold: minimum score (inclusive) a label needs to be selected.

    Returns:
        List of label indices ordered by decreasing score. Labels with equal
        scores keep their original relative order (the sort is stable).
    """
    by_score_desc = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [label for label, score in by_score_desc if score >= threshold]

In [None]:
def select_labels_by_k(scores, k=3):
    """Return the indices of the ``k`` highest-scoring labels, best score first.

    Args:
        scores: iterable of per-label scores; the position of a score is its label index.
        k: number of labels to keep (used as a slice bound, so a ``k`` larger
            than the number of labels simply returns all of them).

    Returns:
        List of at most ``k`` label indices ordered by decreasing score.
    """
    top_pairs = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:k]
    return [label for label, _ in top_pairs]

In [None]:
def select_labels_by_dcg(scores, min_diff=0.1):
    """Select the top-ranked labels whose marginal DCG contribution is large enough.

    Labels are ranked by decreasing score. Walking down the ranking, the label at
    0-based position ``j`` adds ``(2**score - 1) / log2(j + 2)`` to the running DCG.
    Selection stops at the first label whose contribution is below ``min_diff``;
    that label and everything ranked after it are discarded.

    Args:
        scores: iterable of per-label scores; the position of a score is its label index.
        min_diff: minimum DCG increase a label must bring to be selected.

    Returns:
        List of selected label indices ordered by decreasing score. If every
        label contributes at least ``min_diff`` all labels are returned; an
        empty ``scores`` yields an empty list.
    """
    ranked_labels = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    prev_dcg = 0
    # Default cutoff keeps the whole ranking; it only shrinks when a label's
    # contribution drops below min_diff.
    cutoff = len(ranked_labels)
    for position, (_, score) in enumerate(ranked_labels):
        new_dcg = prev_dcg + ((pow(2, score) - 1) / math.log2(position + 2))
        if new_dcg - prev_dcg < min_diff:
            cutoff = position
            break
        prev_dcg = new_dcg

    # Slice with the cutoff, not the loop variable: the loop variable is one short
    # when the loop finishes without breaking and is undefined for empty input.
    return [label for label, _ in ranked_labels[:cutoff]]

In [None]:
def _precision_single(y_true, y_pred):
    
    precision = 0
    for l in y_pred:
        if l in y_true:
            precision += 1
    
    if len(y_pred) > 0:
        precision /= len(y_pred)
    else:
        precision = np.nan
    return precision

In [None]:
def _recall_single(y_true, y_pred):
    
    recall = 0
    for l in y_true:
        if l in y_pred:
            recall += 1
    
    if len(y_true) > 0:
        recall /= len(y_true)
    else:
        recall = np.nan
    return recall

In [None]:
def calculate_label_frequencies(y_true):
    """Return, for every label position, the mean of that column over all rows.

    Args:
        y_true: non-empty sequence of rows; the first row fixes the number of
            labels. Presumably rows are 0/1 indicator vectors, in which case the
            result is each label's relative frequency - confirm with callers.

    Returns:
        List with one value per label: the column sum divided by ``len(y_true)``.
    """
    n_rows = len(y_true)
    column_totals = [0] * len(y_true[0])
    for row in y_true:
        for position, value in enumerate(row):
            column_totals[position] += value
    return [total / n_rows for total in column_totals]

In [None]:
def _get_bias(metric_array_x, metric_array_y, print_plot=True, xlabel='', ylabel=''):
    
    labels = [i for i,x in enumerate(metric_array_x)]
    sorted_arrays = sorted(list(zip(metric_array_x, metric_array_y, labels)))
    metric_array_x = [x for x,y,l in sorted_arrays]
    metric_array_y = [y for x,y,l in sorted_arrays]
    labels = [l for x,y,l in sorted_arrays]
    
    z = np.polyfit(metric_array_x, metric_array_y, 1)
    p = np.poly1d(z)
    
    angle = math.atan(z[0])
    intercept_on_y = p(0)
    imbalance_bias_coefficient, p_value = stats.pearsonr(metric_array_x, metric_array_y)
    
    if print_plot:
        fig, ax = plt.subplots()
        title = ('Angle with x axis: ' + str(round(math.degrees(angle),2)) + '°' +
                 '\nPearson Correlation: ' + str(round(imbalance_bias_coefficient,2)) +
                 '\nIntercept on y axis: ' + str(round(intercept_on_y,2)))
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_xlim([0,1])
        ax.set_ylim([0,1])
        ax.scatter(metric_array_x, metric_array_y)
        for x, y, l in zip(metric_array_x, metric_array_y, labels):
            ax.annotate(l, (x, y))
        ax.plot(metric_array_x, p(metric_array_x),"r--")
        plt.show()
    
    return imbalance_bias_coefficient, intercept_on_y

In [None]:
def compute_imbalance_bias_metrics(metrics, training_frequencies, print_plots=True):
    """Measure how per-label performance relates to label frequency in training.

    For each of R-precision, precision, recall and F-score, the per-label values
    are passed to ``_get_bias`` together with ``training_frequencies``; its two
    return values are stored as ``<metric>_IBC`` and ``<metric>_at_null_frequency``.

    Args:
        metrics: dict mapping each label to a dict with the keys ``'R_precision'``,
            ``'precision'``, ``'recall'`` and ``'fscore'``. An optional
            ``'average'`` entry is ignored. The dict is not modified.
        training_frequencies: one frequency per label; presumably ordered like
            the sorted label keys of ``metrics`` - confirm with the caller.
        print_plots: forwarded to ``_get_bias`` as ``print_plot``.

    Returns:
        Dict with keys ``R_precision_IBC``, ``R_precision_at_null_frequency``,
        ``precision_IBC``, ``precision_at_null_frequency``, ``recall_IBC``,
        ``recall_at_null_frequency``, ``fscore_IBC`` and ``fscore_at_null_frequency``.
    """
    # Skip the aggregate entry without popping it: mutating the caller's dict
    # made a second call on the same dict fail with KeyError. Filtering before
    # sorting also avoids comparing the 'average' string with non-string label keys.
    label_keys = sorted(key for key in metrics if key != 'average')

    # (key in the per-label dicts, y-axis label used in the plot)
    metric_specs = (('R_precision', 'R-Precision'),
                    ('precision', 'Precision'),
                    ('recall', 'Recall'),
                    ('fscore', 'F-score'))

    # Collect every series up front so a missing key fails before anything is plotted.
    per_metric_values = {metric_key: [metrics[label][metric_key] for label in label_keys]
                         for metric_key, _ in metric_specs}

    bias_metrics = {}
    for metric_key, axis_label in metric_specs:
        coefficient, at_null_frequency = _get_bias(training_frequencies,
                                                   per_metric_values[metric_key],
                                                   print_plot=print_plots,
                                                   xlabel='frequency in training',
                                                   ylabel=axis_label)
        bias_metrics[metric_key + '_IBC'] = coefficient
        bias_metrics[metric_key + '_at_null_frequency'] = at_null_frequency
    return bias_metrics

In [None]:
def compute_binary_classification_metrics_per_class(y_true, y_score, included_labels=range(14), 
                                                    label_selection_method='dcg', dcg_min_diff=0.1, threshold=0.3, k=3):
    """Compute one-vs-rest metrics for each label plus a frequency-weighted average.

    For every sample, labels are selected from the scores in two ways: the top R
    labels, where R is the sample's number of true labels (used for R-precision),
    and the labels chosen by ``label_selection_method``. Each label in
    ``included_labels`` is then evaluated as a binary problem over all samples.

    Args:
        y_true: indexable sequence of per-sample indicator rows (truthy = label present).
        y_score: indexable sequence of per-sample score rows, aligned with ``y_true``.
        included_labels: label indices to evaluate.
        label_selection_method: ``'dcg'``, ``'threshold'`` or ``'k'``.
        dcg_min_diff: ``min_diff`` passed to ``select_labels_by_dcg``.
        threshold: ``threshold`` passed to ``select_labels_by_threshold``.
        k: ``k`` passed to ``select_labels_by_k``.

    Returns:
        Dict mapping each label to a dict with ``R_precision``, ``precision``,
        ``recall``, ``fscore``, ``true_frequency``, ``R_predicted_frequency`` and
        ``predicted_frequency``, plus an ``'average'`` entry holding the four
        scores averaged with ``true_frequency`` as weight (0.0 when the total
        weight is zero).

    Raises:
        AssertionError: if the method is unknown or its parameter is ``None``.
    """
    assert label_selection_method in ('dcg', 'threshold', 'k')
    # Only reject a missing parameter: 0 is a legitimate threshold / min_diff / k.
    if label_selection_method == 'dcg':
        assert dcg_min_diff is not None
    elif label_selection_method == 'threshold':
        assert threshold is not None
    elif label_selection_method == 'k':
        assert k is not None

    included_labels = list(included_labels)
    n_samples = len(y_true)

    # Label selection depends only on the sample, so do it once per sample
    # instead of once per (label, sample) pair.
    true_labels_per_sample = []
    R_predicted_labels_per_sample = []
    predicted_labels_per_sample = []
    for i in range(n_samples):
        yt = y_true[i]
        ys = y_score[i]

        all_true_labels = [l for l, v in enumerate(yt) if v]
        all_R_predicted_labels = select_labels_by_k(ys, k=len(all_true_labels))

        if label_selection_method == 'dcg':
            all_predicted_labels = select_labels_by_dcg(ys, min_diff=dcg_min_diff)
        elif label_selection_method == 'threshold':
            all_predicted_labels = select_labels_by_threshold(ys, threshold=threshold)
        elif label_selection_method == 'k':
            all_predicted_labels = select_labels_by_k(ys, k=k)

        true_labels_per_sample.append(all_true_labels)
        R_predicted_labels_per_sample.append(all_R_predicted_labels)
        predicted_labels_per_sample.append(all_predicted_labels)

    metrics = {l: {} for l in included_labels}

    for binary_label in included_labels:
        # One-vs-rest view of the sample-level selections for this label.
        true_labels_binary = [1 if binary_label in labels else 0
                              for labels in true_labels_per_sample]
        R_predicted_labels_binary = [1 if binary_label in labels else 0
                                     for labels in R_predicted_labels_per_sample]
        predicted_labels_binary = [1 if binary_label in labels else 0
                                   for labels in predicted_labels_per_sample]

        label_metrics = metrics[binary_label]
        label_metrics['R_precision'] = precision_score(true_labels_binary, R_predicted_labels_binary)
        label_metrics['precision'] = precision_score(true_labels_binary, predicted_labels_binary)
        label_metrics['recall'] = recall_score(true_labels_binary, predicted_labels_binary)
        label_metrics['fscore'] = f1_score(true_labels_binary, predicted_labels_binary)
        label_metrics['true_frequency'] = true_labels_binary.count(1) / n_samples
        label_metrics['R_predicted_frequency'] = R_predicted_labels_binary.count(1) / n_samples
        label_metrics['predicted_frequency'] = predicted_labels_binary.count(1) / n_samples

    # Average weighted by how often each label is true. Snapshot the label keys
    # before the 'average' entry is added to the same dict.
    evaluated_labels = list(metrics)
    total_true_frequency = sum(metrics[l]['true_frequency'] for l in evaluated_labels)

    average = {}
    for score_name in ('R_precision', 'precision', 'recall', 'fscore'):
        weighted_sum = sum(metrics[l][score_name] * metrics[l]['true_frequency']
                           for l in evaluated_labels)
        # No included label ever occurs: report 0.0 instead of dividing by zero.
        average[score_name] = weighted_sum / total_true_frequency if total_true_frequency > 0 else 0.0
    metrics['average'] = average

    return metrics

In [None]:
def compute_classification_metrics(y_true, y_score, label_selection_method='dcg', dcg_min_diff=0.1, threshold=0.3, k=3):
    """Compute sample-averaged R-precision, precision, recall and F-score.

    For each sample the true labels are the positions with a truthy value in the
    ``y_true`` row. R-precision uses the top R scored labels, R being the number
    of true labels; the other scores use the labels chosen by
    ``label_selection_method``. A per-sample value that is undefined (NaN) is
    left out of the corresponding average.

    Args:
        y_true: sized sequence of per-sample indicator rows.
        y_score: sequence of per-sample score rows, aligned with ``y_true``.
        label_selection_method: ``'dcg'``, ``'threshold'`` or ``'k'``.
        dcg_min_diff: ``min_diff`` passed to ``select_labels_by_dcg``.
        threshold: ``threshold`` passed to ``select_labels_by_threshold``.
        k: ``k`` passed to ``select_labels_by_k``.

    Returns:
        Dict with ``avg_R_precision``, ``avg_precision``, ``avg_recall``,
        ``avg_fscore``, ``avg_n_true_labels`` and ``avg_n_predicted_labels``.
        An average with no contributing sample stays 0.

    Raises:
        AssertionError: if the method is unknown or its parameter is ``None``.
    """
    assert label_selection_method in ('dcg', 'threshold', 'k')
    # Only reject a missing parameter: 0 is a legitimate threshold / min_diff / k.
    if label_selection_method == 'dcg':
        assert dcg_min_diff is not None
    elif label_selection_method == 'threshold':
        assert threshold is not None
    elif label_selection_method == 'k':
        assert k is not None

    avg_R_precision = 0
    avg_precision = 0
    avg_recall = 0
    avg_fscore = 0
    avg_n_true_labels = 0
    avg_n_predicted_labels = 0

    # Each score has its own sample count because undefined values are skipped.
    R_precision_normalizer = 0
    precision_normalizer = 0
    recall_normalizer = 0
    fscore_normalizer = 0

    for yt, ys in zip(y_true, y_score):
        true_labels = [l for l, v in enumerate(yt) if v]
        avg_n_true_labels += len(true_labels)

        R_predicted_labels = select_labels_by_k(ys, k=len(true_labels))
        R_p = _precision_single(true_labels, R_predicted_labels)
        if not np.isnan(R_p):
            avg_R_precision += R_p
            R_precision_normalizer += 1

        if label_selection_method == 'dcg':
            predicted_labels = select_labels_by_dcg(ys, min_diff=dcg_min_diff)
        elif label_selection_method == 'threshold':
            predicted_labels = select_labels_by_threshold(ys, threshold=threshold)
        elif label_selection_method == 'k':
            predicted_labels = select_labels_by_k(ys, k=k)

        avg_n_predicted_labels += len(predicted_labels)

        p = _precision_single(true_labels, predicted_labels)
        p_defined = not np.isnan(p)
        if p_defined:
            avg_precision += p
            precision_normalizer += 1

        r = _recall_single(true_labels, predicted_labels)
        r_defined = not np.isnan(r)
        if r_defined:
            avg_recall += r
            recall_normalizer += 1

        # The F-score needs both components; it is 0 when both are 0.
        if p_defined and r_defined:
            f = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
            avg_fscore += f
            fscore_normalizer += 1

    if R_precision_normalizer > 0:
        avg_R_precision /= R_precision_normalizer
    if precision_normalizer > 0:
        avg_precision /= precision_normalizer
    if recall_normalizer > 0:
        avg_recall /= recall_normalizer
    if fscore_normalizer > 0:
        avg_fscore /= fscore_normalizer

    # Guard the label-count averages the same way so empty input does not
    # raise ZeroDivisionError.
    n_samples = len(y_true)
    if n_samples > 0:
        avg_n_true_labels /= n_samples
        avg_n_predicted_labels /= n_samples

    return {'avg_R_precision': avg_R_precision,
            'avg_precision': avg_precision,
            'avg_recall': avg_recall,
            'avg_fscore': avg_fscore,
            'avg_n_true_labels': avg_n_true_labels,
            'avg_n_predicted_labels': avg_n_predicted_labels}

In [None]:
def normalized_coverage_error(y_true, y_score):
    """Rescale the coverage error between its best and worst possible values.

    The best value is taken as the average row sum of ``y_true`` and the worst
    as the number of labels (length of the first row).

    Args:
        y_true: indexable sequence of per-sample indicator rows.
        y_score: per-sample score rows, aligned with ``y_true``.

    Returns:
        ``(coverage_error - best) / (worst - best)``.
    """
    raw_cov_err = coverage_error(y_true, y_score)
    lower_bound = np.average([sum(row) for row in y_true])
    upper_bound = len(y_true[0])
    return (raw_cov_err - lower_bound) / (upper_bound - lower_bound)

In [None]:
"""
Computes the average exposure assigned by the ranking to the ground truth labels. Works with
one single array of y_true and y_score, in a non-aggregated way.
Follows the definition of exposure by Singh et al., 2018.
"""
def _avg_exposure_single(y_true, y_score, normalize=True):
    
    true_labels = [l for l,v in enumerate(y_true) if v]
    ranked_labels = sorted([(l,s) for l,s in enumerate(y_score)], reverse=True, key=lambda x:x[-1])

    avg_exp = 0
    for j,(l,s) in enumerate(ranked_labels):
        if l in true_labels:
            avg_exp += 1 / math.log2(2+j)
    
    if normalize:
        best_possible_exp = sum([(1 / math.log2(2+j)) for j in range(len(true_labels))])
        worst_possible_exp = sum([(1 / math.log2(2+j)) for j in range(len(ranked_labels)-1,len(ranked_labels)-len(true_labels)-1,-1)])

        avg_exp = (avg_exp - worst_possible_exp) / (best_possible_exp-worst_possible_exp)

    return avg_exp

In [None]:
def avg_exposure_aggregate(y_true, y_score):
    """Average the per-sample exposure over all samples.

    Args:
        y_true: sequence of per-sample indicator rows.
        y_score: sequence of per-sample score rows, aligned with ``y_true``.

    Returns:
        Mean of ``_avg_exposure_single`` over the samples. Per-sample values
        that are NaN are left out of the mean; ``np.nan`` is returned when no
        sample contributes (for example on empty input).
    """
    exposure_sum = 0
    n_contributing = 0
    for yt, ys in zip(y_true, y_score):
        sample_exposure = _avg_exposure_single(yt, ys)
        # Ignore undefined values so a single one cannot poison the whole
        # average (same convention as the per-sample precision and recall).
        if not np.isnan(sample_exposure):
            exposure_sum += sample_exposure
            n_contributing += 1

    if n_contributing == 0:
        # Nothing to average: report "undefined" rather than dividing by zero.
        return np.nan
    return exposure_sum / n_contributing

In [None]:
def compute_ranking_metrics(y_true, y_score):
    """Collect the ranking-based metrics for a set of samples.

    Args:
        y_true: per-sample indicator rows.
        y_score: per-sample score rows, aligned with ``y_true``.

    Returns:
        Dict with ``lraps`` (label ranking average precision), ``lrl`` (label
        ranking loss), ``cov_err`` (normalized coverage error) and ``avg_exp``
        (aggregated average exposure).
    """
    return {
        'lraps': label_ranking_average_precision_score(y_true, y_score),
        'lrl': label_ranking_loss(y_true, y_score),
        'cov_err': normalized_coverage_error(y_true, y_score),
        'avg_exp': avg_exposure_aggregate(y_true, y_score),
    }

In [None]:
def print_metrics(metrics):
    """Print every metric as ``<full name> : <value rounded to 2 decimals>``.

    Args:
        metrics: mapping from metric key to a numeric value. Every key must be
            one of the known metric keys listed below.

    Raises:
        KeyError: if a key has no registered full name.
    """
    display_names = {
        # ranking metrics
        'lraps': 'Label Ranking Average Precision',
        'lrl': 'Label Ranking Loss',
        'cov_err': 'Coverage Error',
        'avg_exp': 'Average Exposure',
        # sample-averaged classification metrics
        'avg_R_precision': 'Average R-Precision',
        'avg_precision': 'Average Precision',
        'avg_recall': 'Average Recall',
        'avg_fscore': 'Average F-score',
        'avg_n_true_labels': 'Average number of true labels',
        'avg_n_predicted_labels': 'Average number of predicted labels',
        # per-class metrics
        'R_precision': 'R-Precision',
        'precision': 'Precision',
        'recall': 'Recall',
        'fscore': 'F-score',
        'true_frequency': 'Frequency in the ground truth (%)',
        'R_predicted_frequency': 'Frequency in the predictions @ R',
        'predicted_frequency': 'Frequency in the predictions',
        # imbalance bias metrics
        'R_precision_IBC': 'R-Precision Imbalance Bias Coefficient',
        'precision_IBC': 'Precision Imbalance Bias Coefficient',
        'recall_IBC': 'Recall Imbalance Bias Coefficient',
        'fscore_IBC': 'F-score Imbalance Bias Coefficient',
        'R_precision_at_null_frequency': 'Asymptotic R-Precision At Null Frequency',
        'precision_at_null_frequency': 'Asymptotic Precision At Null Frequency',
        'recall_at_null_frequency': 'Asymptotic Recall At Null Frequency',
        'fscore_at_null_frequency': 'Asymptotic F-score At Null Frequency',
    }

    for key in metrics:
        full_name = display_names[key]
        rounded_value = round(metrics[key], 2)
        print(full_name, ':', rounded_value)