In [10]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from tqdm.notebook import tqdm
tqdm.pandas()

import torch
import torchmetrics as tm

import warnings
warnings.filterwarnings("ignore")

In [11]:
# --- Filesystem layout -------------------------------------------------------
# Directory holding one sub-directory per training session.
SESSION_DIR_PATH = Path("../../session")
# Directory holding the Jigsaw 2019 CSV datasets.
DATA_DIR_PATH = Path("../../data")
TRAIN_DATASET_PATH = DATA_DIR_PATH.joinpath("jigsaw2019-train.csv")
TEST_DATASET_PATH = DATA_DIR_PATH.joinpath("jigsaw2019-test.csv")

# --- Column groups -----------------------------------------------------------
# Label columns scored by the models.
LABEL_LIST = ['toxicity', 'obscene', 'sexual_explicit',
            'identity_attack', 'insult', 'threat']
# Every identity annotation column of the dataset.
IDENTITY_LIST = ['male', 'female', 'transgender', 'other_gender', 'heterosexual',
                'homosexual_gay_or_lesbian', 'bisexual','other_sexual_orientation',
                'christian', 'jewish', 'muslim', 'hindu','buddhist', 'atheist',
                'other_religion', 'black', 'white', 'asian', 'latino',
                'other_race_or_ethnicity', 'physical_disability',
                'intellectual_or_learning_disability',
                'psychiatric_or_mental_illness','other_disability']
# Subset of identities used as subgroups when computing the bias metrics.
SELECTED_IDENTITY_LIST = ['male', 'female', 'black', 'white', 'homosexual_gay_or_lesbian',
                    'christian', 'jewish', 'muslim', 'psychiatric_or_mental_illness']


# --- Sessions to evaluate ----------------------------------------------------
# Each entry is a session directory name of the form "<model-name>_<timestamp>".
# NOTE(review): the saved cell outputs further down list sessions that are not
# in this list (e.g. "albert-bce_..."), so they appear to come from an earlier
# selection - re-run the notebook to refresh them.
SELECTED_SESSION = ['glove-bilstm-bce_2022-03-31T18-54-19-824060',
                    'roberta-fl_2022-03-30T11-53-46-595839',
                    'roberta-cbntrfl_2022-03-30T09-07-31-830794',
                    'roberta-pwbce_2022-03-30T15-57-44-127079',
                    'roberta-pwfl_2022-03-30T16-53-02-178977',
                    'unfreeze-glove-resnet44-bce_2022-03-31T07-05-17-083127',
                    'roberta-dbfl_2022-03-30T10-03-00-862744',
                    'roberta-large-bce_2022-03-30T12-49-24-393100'
                    ]

In [12]:
validation_dataset_namespace = lambda session_name: f"{session_name}.jigsaw2019-validation.csv"
log_namespace = lambda session_name: f"{session_name}.loguru.log"
metric_namespace = lambda session_name: f"{session_name}.metric.json"
test_prediction_namespace = lambda session_name: f"{session_name}.test.csv"
validation_prediction_namespace = lambda session_name: f"{session_name}.validation.csv"

## Métriques générales

In [13]:
# Adapted from : https://gist.github.com/aditya-5842/b2a886f42bc9988dc11c1737a349830c

from sklearn.metrics import roc_auc_score

########################################################################################
#######################     function to calculate the AUC        #######################
########################################################################################

def cal_auc(y_true, y_pred):
    """Return the ROC AUC of the scores ``y_pred`` against the labels ``y_true``.

    Thin wrapper: both arguments are forwarded unchanged to
    ``roc_auc_score``.
    """
    auc_value = roc_auc_score(y_true, y_pred)
    return auc_value

########################################################################################
#######################  function to calculate the Subgroup AUC  #######################
########################################################################################

def cal_subgroup_auc(target_df, prediction_df, subgroups, label):
    """Return the AUC of ``label`` restricted to the rows of one subgroup.

    Args:
        target_df: frame holding the ground-truth ``label`` column and the
            subgroup indicator column.
        prediction_df: frame holding the predicted ``label`` column.
        subgroups: name of the subgroup column; rows where it equals 1 are kept.
        label: name of the column to score.

    The row mask is computed on ``target_df`` and applied to both frames.
    """
    in_subgroup = target_df[subgroups] == 1
    y_true = target_df.loc[in_subgroup, label]
    y_score = prediction_df.loc[in_subgroup, label]
    return cal_auc(y_true, y_score)

########################################################################################
#######################   function to calculate the BPSN AUC     #######################
########################################################################################

def cal_bpsn_auc(target_df, prediction_df, subgroups, label):
    """Return the BPSN (Background Positive, Subgroup Negative) AUC.

    The AUC is computed on the union of two row sets, both selected with
    masks built from ``target_df``:
      * rows inside the subgroup whose ``label`` is not 1, and
      * rows outside the subgroup whose ``label`` is 1.

    Args:
        target_df: frame with the ground-truth ``label`` and subgroup columns.
        prediction_df: frame with the predicted ``label`` column.
        subgroups: name of the subgroup column (membership means value 1).
        label: name of the column to score.
    """
    in_subgroup = target_df[subgroups] == 1
    is_positive = target_df[label] == 1

    # The two slices that make up the BPSN set.
    subgroup_negative = in_subgroup & ~is_positive
    background_positive = ~in_subgroup & is_positive

    # Stack the label column of both slices, subgroup negatives first.
    y_true = pd.concat([target_df.loc[subgroup_negative, label],
                        target_df.loc[background_positive, label]])
    y_score = pd.concat([prediction_df.loc[subgroup_negative, label],
                         prediction_df.loc[background_positive, label]])

    return cal_auc(y_true, y_score)


########################################################################################
#######################   function to calculate the BNSP AUC     #######################
########################################################################################
def cal_bnsp_auc(target_df, prediction_df, subgroups, label):
    """Return the BNSP (Background Negative, Subgroup Positive) AUC.

    The AUC is computed on the union of two row sets, both selected with
    masks built from ``target_df``:
      * rows inside the subgroup whose ``label`` is 1, and
      * rows outside the subgroup whose ``label`` is not 1.

    Args:
        target_df: frame with the ground-truth ``label`` and subgroup columns.
        prediction_df: frame with the predicted ``label`` column.
        subgroups: name of the subgroup column (membership means value 1).
        label: name of the column to score.
    """
    in_subgroup = target_df[subgroups] == 1
    is_positive = target_df[label] == 1

    # The two slices that make up the BNSP set.
    subgroup_positive = in_subgroup & is_positive
    background_negative = ~in_subgroup & ~is_positive

    # Stack the label column of both slices, subgroup positives first.
    y_true = pd.concat([target_df.loc[subgroup_positive, label],
                        target_df.loc[background_negative, label]])
    y_score = pd.concat([prediction_df.loc[subgroup_positive, label],
                         prediction_df.loc[background_negative, label]])

    return cal_auc(y_true, y_score)

########################################################################################
#######################    function to calculate Bias metric     #######################
########################################################################################
def cal_bias_metric(target_df, prediction_df, subgroups, label):
    """Build the per-subgroup bias table for one model.

    Args:
        target_df: frame with the ground-truth ``label`` and subgroup columns.
        prediction_df: frame with the predicted ``label`` column.
        subgroups: iterable of subgroup column names.
        label: name of the column to score.

    Returns:
        A DataFrame with one row per subgroup and the columns ``subgroup``,
        ``subgroup_size`` (number of target rows where the subgroup column
        equals 1), ``subgroup_auc``, ``bpsn_auc`` and ``bnsp_auc``, sorted by
        ascending ``subgroup_auc``.
    """
    rows = [
        {
            "subgroup": name,
            "subgroup_size": int((target_df[name] == 1).sum()),
            "subgroup_auc": cal_subgroup_auc(target_df, prediction_df, name, label),
            "bpsn_auc": cal_bpsn_auc(target_df, prediction_df, name, label),
            "bnsp_auc": cal_bnsp_auc(target_df, prediction_df, name, label),
        }
        for name in subgroups
    ]
    return pd.DataFrame(rows).sort_values(by="subgroup_auc", ascending=True)

########################################################################################
#######################   function to calculate Overall metric   #######################
########################################################################################
def cal_overall_auc(target_df, prediction_df, label):
    """Return the AUC of ``label`` computed over every row of both frames."""
    y_true = target_df[label]
    y_score = prediction_df[label]
    # cal_auc forwards straight to roc_auc_score.
    return cal_auc(y_true, y_score)

########################################################################################
#######################    function to calculate final metric    #######################
########################################################################################
def power_mean(series, p):
    """Return the generalized (power) mean of ``series`` with exponent ``p``.

    Computes ``(sum(x ** p) / n) ** (1 / p)``. ``p == 0`` is treated as the
    limiting case of the power mean, i.e. the geometric mean
    ``exp(mean(log(x)))``.

    Args:
        series: sequence of numbers (list, NumPy array or pandas Series).
        p: exponent of the power mean.

    Returns:
        The power mean as a NumPy float. A NaN entry in ``series`` yields NaN.
    """
    # Work on a plain float array: the sum then treats NaN the same way as the
    # element count does (a pandas Series sum would skip NaN while len() still
    # counts it), and negative integer exponents are valid on integer input.
    values = np.asarray(series, dtype=float)
    if p == 0:
        return np.exp(np.mean(np.log(values)))
    total_sum = np.sum(np.power(values, p))
    return np.power(total_sum / len(values), 1 / p)

def final_metric(submetric_df, overall_auc, p = -5, w = 0.25):
    """Combine the overall AUC with the three generalized bias AUCs.

    Args:
        submetric_df: per-subgroup table with the columns ``subgroup_auc``,
            ``bpsn_auc`` and ``bnsp_auc``.
        overall_auc: AUC computed on the full dataset.
        p: exponent passed to ``power_mean`` for each bias column.
        w: weight applied to the overall AUC and to the sum of the three
            generalized AUCs.

    Returns:
        ``w * overall_auc + w * (subgroup + bpsn + bnsp)`` where each of the
        three terms is the power mean of the corresponding column.
    """
    bias_columns = ("subgroup_auc", "bpsn_auc", "bnsp_auc")
    generalized = [power_mean(submetric_df[column], p) for column in bias_columns]
    bias_component = generalized[0] + generalized[1] + generalized[2]
    return w * overall_auc + w * bias_component

def generalized_auc_metric(target_df, prediction_df, subgroups, label, p = -5):
    """Return the three generalized bias AUCs as a dictionary.

    Builds the per-subgroup table with ``cal_bias_metric`` and reduces each
    of its AUC columns with ``power_mean`` using exponent ``p``.

    Returns:
        A dict with the keys ``generalized_subgroup_auc``,
        ``generalized_bpsn_auc`` and ``generalized_bnsp_auc``.
    """
    submetric_df = cal_bias_metric(target_df, prediction_df, subgroups, label)

    # Output key -> column of the per-subgroup table it summarises.
    key_to_column = {
        "generalized_subgroup_auc": "subgroup_auc",
        "generalized_bpsn_auc": "bpsn_auc",
        "generalized_bnsp_auc": "bnsp_auc",
    }
    return {
        key: power_mean(submetric_df[column], p)
        for key, column in key_to_column.items()
    }

########################################################################################
#######################   function to combine all of the above   #######################
########################################################################################

def return_final_metric(target_df, prediction_df, subgroups, label):
    """Compute the final bias-aware score together with its per-subgroup table.

    Args:
        target_df: frame with the ground-truth ``label`` and subgroup columns.
        prediction_df: frame with the predicted ``label`` column.
        subgroups: iterable of subgroup column names.
        label: name of the column to score.

    Returns:
        A tuple ``(overall_metric, submetric_df)``: the score produced by
        ``final_metric`` with ``p = -5`` and ``w = 0.25``, and the table
        produced by ``cal_bias_metric``.
    """
    per_subgroup_df = cal_bias_metric(target_df, prediction_df, subgroups, label)
    auc_all_rows = cal_overall_auc(target_df, prediction_df, label)
    score = final_metric(per_subgroup_df, auc_all_rows, p = -5, w = 0.25)
    return score, per_subgroup_df

## Préparer les données

In [14]:
# Load the test targets; the first CSV column becomes the index.
target_test_df = pd.read_csv(TEST_DATASET_PATH, index_col=0)
# Keep only the rows whose `white` identity column is populated.
# The original author marked this filter "TO REMOVE".
# `.copy()` makes the result an independent frame, so the column assignment
# below does not write into a filtered slice of the loaded frame.
target_test_df = target_test_df[target_test_df["white"].notna()].copy()
# Binarise the label columns at 0.5.
target_test_df[LABEL_LIST] = (target_test_df[LABEL_LIST] >= 0.5).astype(int)
# Move the CSV index back into a regular column and renumber the rows from 0.
target_test_df = target_test_df.reset_index()

In [15]:
# Targets restricted to the rows whose `white` identity column is populated,
# taken as an independent copy of the filtered frame.
identity_target_test_df = target_test_df.loc[target_test_df["white"].notna()].copy()
# Binarise the label and identity columns at 0.5.
identity_target_test_df[LABEL_LIST + IDENTITY_LIST] = (
    identity_target_test_df[LABEL_LIST + IDENTITY_LIST].ge(0.5).astype(int)
)
# Renumber the rows from 0; `drop` is left at its default, so the previous
# index is kept as a column.
identity_target_test_df = identity_target_test_df.reset_index()

## Fonctions à appliquer pour chaque modèle

### Générique

In [16]:
def get_best_f1_threshold(session_name):
    """Return the decision threshold with the highest micro F1 on validation data.

    Reads the session's validation targets and validation predictions from
    the session directory, binarises the targets at 0.5 and evaluates a
    micro-averaged ``tm.F1Score`` for every threshold in
    ``np.arange(0, 1, 0.001)``.

    Args:
        session_name: name of the session directory under ``SESSION_DIR_PATH``.

    Returns:
        The threshold whose F1 score is highest (first one in case of ties).
    """
    # Locate the two validation files of the session.
    session_dir = SESSION_DIR_PATH / session_name
    target_csv = session_dir / validation_dataset_namespace(session_name)
    prediction_csv = session_dir / validation_prediction_namespace(session_name)

    # Load them, keeping only the label columns; targets become 0/1.
    y_true_df = pd.read_csv(target_csv, index_col=0)[LABEL_LIST].ge(0.5).astype(int)
    y_score_df = pd.read_csv(prediction_csv, index_col=0)[LABEL_LIST]

    # Tensors in the form the metric is called with: integer targets, float scores.
    y_true = torch.Tensor(y_true_df.to_numpy()).to(int)
    y_score = torch.Tensor(y_score_df.to_numpy())

    # Score every candidate threshold.
    candidate_thresholds = np.arange(0, 1, 0.001)
    f1_scores = []
    for candidate in tqdm(candidate_thresholds, leave=False):
        f1_metric = tm.F1Score(threshold=candidate, average="micro")
        f1_scores.append(f1_metric(y_score, y_true))

    return candidate_thresholds[np.argmax(f1_scores)]

### Group By

In [None]:
def identity_auroc_per_class_per_model(prediction_test_df):
    """Return the per-subgroup bias table of one model for the ``toxicity`` label.

    The identity columns of ``identity_target_test_df`` are appended
    column-wise to the predictions before calling ``cal_bias_metric`` with
    ``SELECTED_IDENTITY_LIST`` as subgroups.

    NOTE(review): the body is identical to ``identity_auroc_per_sub_per_model``
    defined in this notebook - confirm whether both are needed.
    """
    identity_columns = identity_target_test_df[IDENTITY_LIST]
    prediction_with_identity_df = pd.concat([prediction_test_df, identity_columns], axis=1)
    return cal_bias_metric(
        identity_target_test_df,
        prediction_with_identity_df,
        SELECTED_IDENTITY_LIST,
        "toxicity",
    )

In [17]:
def identity_auroc_per_sub_per_model(prediction_test_df, label="toxicity"):
    """Return the per-subgroup bias table of one model.

    Args:
        prediction_test_df: predictions of one model; must contain the
            ``label`` column.
        label: column to score. Defaults to ``"toxicity"``, the value that was
            previously hard-coded, so existing single-argument calls behave
            as before.

    Returns:
        The DataFrame produced by ``cal_bias_metric`` for the subgroups in
        ``SELECTED_IDENTITY_LIST``.
    """
    # Append the identity columns of the targets column-wise to the predictions.
    prediction_with_identity_df = pd.concat(
        [prediction_test_df, identity_target_test_df[IDENTITY_LIST]], axis=1
    )
    return cal_bias_metric(
        identity_target_test_df,
        prediction_with_identity_df,
        SELECTED_IDENTITY_LIST,
        label,
    )

In [18]:
def perf_and_bias_metric_per_model(prediction_test_df : pd.DataFrame):
    """Compute the performance and bias metrics of one session.

    Args:
        prediction_test_df: test predictions of a single session; must hold
            the ``LABEL_LIST`` columns and a ``session_name`` column.

    Returns:
        A ``pd.Series`` holding the values of the torchmetrics collection
        (macro AUROC, macro/micro F1 at 0.5, micro F1 at the threshold tuned
        on validation data, micro recall and precision at 0.5) followed by
        the three generalized bias AUCs for the ``toxicity`` label.

    NOTE(review): predictions and ``target_test_df`` are converted to tensors
    independently, so they are paired by row position - confirm both frames
    share the same row order.
    """
    n_labels = len(LABEL_LIST)

    # Threshold tuned on the validation split of this session; the session
    # name is read from the first row of the group.
    session_name = prediction_test_df["session_name"].iloc[0]
    tuned_threshold = get_best_f1_threshold(session_name)

    # Performance metrics, evaluated together through one collection.
    perf_metrics = tm.MetricCollection({
        "auroc_macro": tm.AUROC(num_classes=n_labels, average="macro"),
        "f1_macro_50": tm.F1Score(num_classes=n_labels, average="macro", threshold=0.5),
        "f1_micro_50": tm.F1Score(num_classes=n_labels, average="micro", threshold=0.5),
        "best_f1_micro": tm.F1Score(num_classes=n_labels, average="micro", threshold=tuned_threshold),
        "recall_micro_50": tm.Recall(num_classes=n_labels, average="micro", threshold=0.5),
        "precision_micro_50": tm.Precision(num_classes=n_labels, average="micro", threshold=0.5),
    })

    scores = torch.Tensor(prediction_test_df[LABEL_LIST].to_numpy())
    targets = torch.Tensor(target_test_df[LABEL_LIST].to_numpy()).to(int)
    results = {name: value.item() for name, value in perf_metrics(scores, targets).items()}

    # Bias metrics on the toxicity label, over the selected identity subgroups.
    predictions_with_identity = pd.concat(
        [prediction_test_df, identity_target_test_df[IDENTITY_LIST]], axis=1
    )
    results.update(
        generalized_auc_metric(
            identity_target_test_df,
            predictions_with_identity,
            SELECTED_IDENTITY_LIST,
            "toxicity",
        )
    )

    return pd.Series(results)

## Calcul sur les sessions

In [19]:
# Read the test predictions of every selected session and tag each row with
# the session it came from. The frames are collected in a list and
# concatenated once, instead of re-concatenating inside the loop.
session_test_dfs = []
for session_name in tqdm(SELECTED_SESSION):
    session_path = SESSION_DIR_PATH.joinpath(session_name)
    test_file_path = session_path.joinpath(test_prediction_namespace(session_name))
    current_test_df = pd.read_csv(test_file_path, index_col=0)
    current_test_df["session_name"] = session_name
    session_test_dfs.append(current_test_df)

# Stays None when no session is selected, as before.
prediction_test_df = pd.concat(session_test_dfs) if session_test_dfs else None


  0%|          | 0/27 [00:00<?, ?it/s]

In [20]:
# One group of prediction rows per session.
groupby_test_df = prediction_test_df.groupby(by="session_name")

In [21]:
# Compute the performance and bias metrics once per session group.
# `progress_apply` is the progress-bar variant of `apply` that becomes
# available after the `tqdm.pandas()` call in the import cell.
perf_and_bias_metric_per_model_df = groupby_test_df.progress_apply(perf_and_bias_metric_per_model)
# Display the resulting table.
perf_and_bias_metric_per_model_df

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0_level_0,auroc_macro,best_f1_micro,f1_macro_50,f1_micro_50,precision_micro_50,recall_micro_50,generalized_subgroup_auc,generalized_bpsn_auc,generalized_bnsp_auc
session_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
albert-bce_2022-03-30T02-39-56-576796,0.866039,0.389564,0.042968,0.08247,0.773585,0.043557,0.771715,0.744676,0.846622
bert-bce_2022-03-30T03-42-55-786562,0.893154,0.414502,0.107522,0.237799,0.6785,0.144162,0.787255,0.744992,0.866263
bertweet-bce_2022-03-30T04-44-53-503295,0.865124,0.409626,0.157208,0.279955,0.559732,0.186657,0.77023,0.702815,0.841875
distilbert-bce_2022-03-30T05-39-09-629919,0.900924,0.443649,0.102599,0.178214,0.74337,0.101243,0.791111,0.721698,0.88075
fasttext-resnet44-bce_2022-03-31T01-31-40-016955,0.80974,0.385137,0.101022,0.275447,0.520366,0.187294,0.744949,0.675674,0.861909
glove-bigru-bce_2022-03-31T17-46-18-488961,0.959835,0.608612,0.420377,0.607073,0.660462,0.56167,0.867434,0.850335,0.943931
glove-bilstm-bce_2022-03-31T18-54-19-824060,0.935262,0.588833,0.25206,0.552191,0.723408,0.44651,0.858984,0.849617,0.938739
glove-cct-bce_2022-03-31T01-30-21-556938,0.789003,0.368783,0.091148,0.233652,0.618049,0.144056,0.741508,0.70536,0.824928
glove-resnet32-bce_2022-03-31T02-27-55-394312,0.841693,0.414083,0.091819,0.246135,0.657895,0.151386,0.769749,0.735319,0.835967
glove-resnet44-bce_2022-03-31T03-12-58-423737,0.849908,0.422544,0.100409,0.251289,0.707677,0.152767,0.779712,0.76649,0.834736


In [None]:
# Shorten the row labels: keep only the part of each session name that
# precedes the first underscore (e.g. "bert-bce_2022-..." -> "bert-bce").
perf_and_bias_metric_per_model_df.index = perf_and_bias_metric_per_model_df.index.map(
    lambda session_name: session_name.partition("_")[0]
)

In [43]:
# Styled view of the metric table: per-column colour gradient, three decimals,
# and a CSS tag on the column headers that the LaTeX export turns into
# \rotatebox{45}{...}.
s = (
    perf_and_bias_metric_per_model_df.style
    .background_gradient(axis=0)
    .format(precision=3)
    .applymap_index(lambda v: "rotatebox:{45}--rwrap--latex;", axis=1)
)
s

Unnamed: 0_level_0,auroc_macro,best_f1_micro,f1_macro_50,f1_micro_50,precision_micro_50,recall_micro_50,generalized_subgroup_auc,generalized_bpsn_auc,generalized_bnsp_auc
session_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
albert-bce,0.866,0.39,0.043,0.082,0.774,0.044,0.772,0.745,0.847
bert-bce,0.893,0.415,0.108,0.238,0.678,0.144,0.787,0.745,0.866
bertweet-bce,0.865,0.41,0.157,0.28,0.56,0.187,0.77,0.703,0.842
distilbert-bce,0.901,0.444,0.103,0.178,0.743,0.101,0.791,0.722,0.881
fasttext-resnet44-bce,0.81,0.385,0.101,0.275,0.52,0.187,0.745,0.676,0.862
glove-bigru-bce,0.96,0.609,0.42,0.607,0.66,0.562,0.867,0.85,0.944
glove-bilstm-bce,0.935,0.589,0.252,0.552,0.723,0.447,0.859,0.85,0.939
glove-cct-bce,0.789,0.369,0.091,0.234,0.618,0.144,0.742,0.705,0.825
glove-resnet32-bce,0.842,0.414,0.092,0.246,0.658,0.151,0.77,0.735,0.836
glove-resnet44-bce,0.85,0.423,0.1,0.251,0.708,0.153,0.78,0.766,0.835


In [44]:
# Print the styled table as LaTeX source; `convert_css=True` asks the Styler
# to translate its CSS styling (cell colours, header tag) into LaTeX commands.
print(s.to_latex(convert_css=True))

\begin{tabular}{lrrrrrrrrr}
 & \rotatebox{45}{auroc_macro} & \rotatebox{45}{best_f1_micro} & \rotatebox{45}{f1_macro_50} & \rotatebox{45}{f1_micro_50} & \rotatebox{45}{precision_micro_50} & \rotatebox{45}{recall_micro_50} & \rotatebox{45}{generalized_subgroup_auc} & \rotatebox{45}{generalized_bpsn_auc} & \rotatebox{45}{generalized_bnsp_auc} \\
session_name &  &  &  &  &  &  &  &  &  \\
albert-bce & {\cellcolor[HTML]{056DAB}} \color[HTML]{F1F1F1} 0.866 & {\cellcolor[HTML]{76AAD0}} \color[HTML]{F1F1F1} 0.390 & {\cellcolor[HTML]{F0EAF4}} \color[HTML]{000000} 0.043 & {\cellcolor[HTML]{EAE6F1}} \color[HTML]{000000} 0.082 & {\cellcolor[HTML]{034D79}} \color[HTML]{F1F1F1} 0.774 & {\cellcolor[HTML]{F7F0F7}} \color[HTML]{000000} 0.044 & {\cellcolor[HTML]{2685BB}} \color[HTML]{F1F1F1} 0.772 & {\cellcolor[HTML]{65A3CB}} \color[HTML]{F1F1F1} 0.745 & {\cellcolor[HTML]{0569A5}} \color[HTML]{F1F1F1} 0.847 \\
bert-bce & {\cellcolor[HTML]{046198}} \color[HTML]{F1F1F1} 0.893 & {\cellcolor[HTML]{5A9EC9}}