In [40]:
# preparation of the environment
%load_ext autoreload
%autoreload 2

import os
from os import path
# set the repository to the git repository
cwd = os.getcwd().split(os.path.sep)
while cwd[-1] != "stage_4_gm":
    os.chdir("..")
    cwd = os.getcwd().split(os.path.sep)
print(">> the git rep : ", end="")
print(os.getcwd())

# the folder where we will save our data
foler_name = "flow_study"
plots_folder = os.path.join(os.getcwd(), '.cache', 'plots')
graph_folder = path.join(plots_folder, foler_name)
if not path.exists(graph_folder):
    os.mkdir(graph_folder)

print(f">> the plots location : {graph_folder}")

import pandas as pd
import torch
import numpy as np
import pickle
from scipy.stats import entropy
from tqdm import tqdm
from tabulate import tabulate
from attention_algorithms.attention_metrics import attention_score

from training_bert import BertNliLight
from custom_data_set import SnliDataset
from custom_data_set import test_dir, dev_dir
from torch.utils.data import DataLoader

# --> from this environment
from attention_algorithms.raw_attention import RawAttention
from attention_algorithms.attention_metrics import normalize_attention

ckp = path.join(".cache", "logs", "igrida_trained", "0", "best.ckpt")
model = BertNliLight.load_from_checkpoint(ckp)
model = model.eval()  # make sure to have the model in eval mod before using it

# load the data >> without the neutral labels
data_set = SnliDataset(dir=test_dir,
                       nb_sentences=1000,
                       msg=False,
                       keep_neutral=False)
data_loader = DataLoader(data_set, batch_size=1000, shuffle=False)

sentences, masks, train_labels = next(iter(data_loader))

e_snli_data = pd.read_csv(os.path.join('.cache', 'raw_data', 'e_snli', 'cleaned_data', 'test.csv'), sep=",")\
    [["tok_sent", "hg_goal", "label"]]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
>> the git rep : C:\Users\loicf\Documents\IRISA\stage_4_gm\stage_4_gm
>> the plots location : C:\Users\loicf\Documents\IRISA\stage_4_gm\stage_4_gm\.cache\plots\flow_study


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, average_precision_score, auc

In [42]:
def jaccard(y, y_hat):
    num = np.dot(y,y_hat)
    den = np.sum(y) + np.sum(y_hat) - np.dot(y, y_hat)
    return num/den

In [43]:
from attention_algorithms.attention_flow import attention_flow_max
def flow_search(sentences=sentences, masks=masks, agr_type="avg", msg = False):
    y_hat = []
    y = []
    for i in tqdm(range(len(sentences))):
        sent = sentences[i, :].clone().detach()[None, :]
        mk = masks[i, :].clone().detach()[None, :]
        raw_attention_inst = RawAttention(model=model,
                                          input_ids=sent,
                                          attention_mask=mk,
                                          test_mod=False
                                          )
        raw_attention_inst.set_up_graph(agr_type=agr_type)

        # search for the right sentence in the snli dataset
        j = 0
        while j < e_snli_data.shape[0] and raw_attention_inst.tokens != eval(e_snli_data["tok_sent"][j]):
            j += 1

        try :
            # update the annotation list
            annot = eval(e_snli_data.hg_goal[j])
            y += annot

            # construction of the attention_score based on the SEP criterion
            buff = attention_flow_max(raw_attention_inst, out_layer=12)
            # transform to a list
            attention_score = list(normalize_attention(tokens=raw_attention_inst.tokens,
                                                       attention=torch.tensor(buff)).detach().numpy())
            y_hat += attention_score


        except Exception as e:
            if msg :
                print(e)
            else :
                pass
    return {"y" : y, "y_hat": y_hat}

In [44]:
def create_row(y, y_hat, metrics):
    row = []
    tr = np.linspace(0, 1, 50)
    for m in metrics :
        if m != "roc_auc_score" and m != "jaccard" and m != "average_precision_score":
            ar = [] # --> calculus of the metrics
            for t in tr:
                buff = 1 * (np.array(y_hat)>=t)
                ar.append(eval(m)(y, buff))
            row.append(auc(tr, ar))
        else:
            row.append(eval(m)(y, y_hat))

    return row

In [45]:
def create_eraser_row(y, y_hat, metrics):
    row = []
    tr = 0.5
    for m in metrics :
        if m != "roc_auc_score":
            buff = 1 * (np.array(y_hat)>=tr)
            row.append(eval(m)(y, buff))
        else:
            row.append(eval(m)(y, y_hat))

    return row

## AVG agregation of the heads

In [46]:
y, y_hat = None, None
rows = []
evaluation_metrics = ["f1_score", "precision_score", "recall_score", "roc_auc_score", "jaccard", "average_precision_score"]

# set the header for the table of statistics
h1 = ["AU_"+x+"_curve" for x in ["f1", "PR", "RC"]]

h2 = ["AUROC", "jaccard", "AUPRC"]
rows.append(h1 + h2)

dir = os.path.join(graph_folder, f"flow_criterion.pickle")

if os.path.exists(dir) and os.path.getsize(dir) != 0:
    print(">> the files already exist in the environment >> loading the files ...", end="")
    with open(dir, "rb") as f:
        d = pickle.load(f)
        y = d["y"]
        y_hat = d["y_hat"]
    print(" loading finished")
else :
    with open(dir, "wb") as f:
        print(">> the file doesn't exist >> downloading >> ", end=" ")
        d = flow_search(msg=False)
        print(" >> downloading finished !")
        y = d["y"]
        y_hat = d["y_hat"]
        pickle.dump(d, f)

rows.append(create_row(y, y_hat, evaluation_metrics))

>> the files already exist in the environment >> loading the files ... loading finished


In [47]:
print(tabulate(rows, headers="firstrow"))

  AU_f1_curve    AU_PR_curve    AU_RC_curve     AUROC    jaccard     AUPRC
-------------  -------------  -------------  --------  ---------  --------
     0.426394        0.29474       0.862332  0.667431   0.273523  0.335579


In [48]:
from openpyxl import load_workbook, Workbook

xls_dir = os.path.join(plots_folder, "dash_board.xlsx")
wb = None
if os.path.exists(xls_dir):
    wb = load_workbook(xls_dir)
else :
    wb = Workbook()

ws = None
if "flow_study" not in wb.sheetnames:
    ws = wb.create_sheet("flow_study")

ws = wb["flow_study"] # get the work sheet

ws.append(["metric values"])
for r in rows :
    ws.append(r)

wb.save(xls_dir)
wb.close()

In [49]:
y, y_hat = None, None
rows = []
evaluation_metrics = ["f1_score", "precision_score", "recall_score", "roc_auc_score", "jaccard", "average_precision_score"]

# set the header for the table of statistics
h1 = ["AU_"+x+"_curve" for x in ["f1", "PR", "RC"]]

h2 = ["AUROC", "jaccard", "AUPRC"]
rows.append(h1 + h2)

dir = os.path.join(graph_folder, f"flow_criterion.pickle")

if os.path.exists(dir) and os.path.getsize(dir) != 0:
    print(">> the files already exist in the environment >> loading the files ...", end="")
    with open(dir, "rb") as f:
        d = pickle.load(f)
        y = d["y"]
        y_hat = d["y_hat"]
    print(" loading finished")
else :
    with open(dir, "wb") as f:
        print(">> the file doesn't exist >> downloading >> ", end=" ")
        d = flow_search(msg=False)
        print(" >> downloading finished !")
        y = d["y"]
        y_hat = d["y_hat"]
        pickle.dump(d, f)

rows.append(create_eraser_row(y, y_hat, evaluation_metrics))

>> the files already exist in the environment >> loading the files ... loading finished


In [50]:
print(tabulate(rows, headers="firstrow"))

  AU_f1_curve    AU_PR_curve    AU_RC_curve     AUROC    jaccard     AUPRC
-------------  -------------  -------------  --------  ---------  --------
     0.427917        0.27237       0.997677  0.667431   0.272197  0.272299


## MAX agregation

In [51]:
y, y_hat = None, None
rows = []
evaluation_metrics = ["f1_score", "precision_score", "recall_score", "roc_auc_score", "jaccard", "average_precision_score"]

# set the header for the table of statistics
h1 = ["AU_"+x+"_curve" for x in ["f1", "PR", "RC"]]

h2 = ["AUROC", "jaccard", "AUPRC"]
rows.append(h1 + h2)

dir = os.path.join(graph_folder, f"flow_max_agreg_criterion.pickle")

if os.path.exists(dir) and os.path.getsize(dir) != 0:
    print(">> the files already exist in the environment >> loading the files ...", end="")
    with open(dir, "rb") as f:
        d = pickle.load(f)
        y = d["y"]
        y_hat = d["y_hat"]
    print(" loading finished")
else :
    with open(dir, "wb") as f:
        print(">> the file doesn't exist >> downloading >> ", end=" ")
        d = flow_search(msg=False, agr_type="max")
        print(" >> downloading finished !")
        y = d["y"]
        y_hat = d["y_hat"]
        pickle.dump(d, f)

rows.append(create_row(y, y_hat, evaluation_metrics))

>> the file doesn't exist >> downloading >>  

100%|██████████| 1000/1000 [52:39<00:00,  3.16s/it] 


 >> downloading finished !


In [53]:
print(tabulate(rows, headers="firstrow"))

  AU_f1_curve    AU_PR_curve    AU_RC_curve     AUROC    jaccard     AUPRC
-------------  -------------  -------------  --------  ---------  --------
     0.430206       0.275529       0.983398  0.618661   0.274199  0.296275


In [52]:
from openpyxl import load_workbook, Workbook

xls_dir = os.path.join(plots_folder, "dash_board.xlsx")
wb = None
if os.path.exists(xls_dir):
    wb = load_workbook(xls_dir)
else :
    wb = Workbook()

ws = None
if "flow_study" not in wb.sheetnames:
    ws = wb.create_sheet("flow_study")

ws = wb["flow_study"] # get the work sheet

ws.append(["metric values"])
for r in rows :
    ws.append(r)

wb.save(xls_dir)
wb.close()