In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path
import sys
cwd = os.getcwd().split(os.path.sep)

# point to the git repository
while cwd[-1] != "ExplanationPairSentencesTasks":
    os.chdir("..")
    cwd = os.getcwd().split(os.path.sep)
print(f">> current directory : {os.getcwd()}")

# add the root directory
sys.path.append(os.path.join(os.getcwd(), "src"))

# cache and data cache
cache_path = path.join(os.getcwd() ,'.cache')
dataset_path = path.join(cache_path, 'dataset')
log_path = path.join(cache_path, 'logs')
model_path = path.join(cache_path, 'models')
print(f">> cache path : {cache_path}")
print(f">> model path : {model_path}")
print(f">> dataset path : {dataset_path}")
print(f">> logs path : {log_path}")

# import the different modules
from src.data_module.hatexplain import CLSTokenHateXPlainDM
from src.data_module.esnli import CLSTokenESNLIDM
from src.data_module.yelp_hat import CLSTokenYelpHat50DM, CLSTokenYelpHatDM
from pur_attention_key_reg import AttitModel
from modules import metrics
from notebooks.attention_based.utils.ckp_config import *

# external librairies
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.notebook import tqdm

from modules.metrics.geometry import cosine_sim, effective_rank

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f">> device : {DEVICE}")

>> current directory : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks
>> cache path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache
>> model path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\models
>> dataset path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\dataset
>> logs path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\logs
>> device : cuda


# HateXplain

In [45]:
%%capture
sim_k_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_v_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_emb_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}


dm_kwargs = dict(cache_path=dataset_path,
                 batch_size=32,
                 num_workers=0,
                 n_data=999
                 )

dm = CLSTokenHateXPlainDM(**dm_kwargs)

dm.prepare_data()
dm.setup(stage="test")
test_dataloader = dm.test_dataloader() # load the test dataset

spec_ids = torch.tensor(dm.vocab(["<cls>", "<pad>", "<unk>"]), device=DEVICE)

model_args = dict(
        cache_path=model_path,
        mode="exp",
        vocab=dm.vocab,
        lambda_entropy=0,
        lambda_supervise=0,
        lambda_lagrange=0,
        pretrained_vectors="glove.840B.300d",
        num_layers=1,
        num_heads=1,
        d_embedding=300,
        data="hatexplain",
        num_class=dm.num_class,
        opt="adam"
)
cpt = torch.tensor([0, 0, 0, 0, 0], device=DEVICE)
for l in range(5) :

    # update the args for the model
    model_args["num_layers"] = l+1
    ckp = os.path.join(log_path, "PurAttention", f"run=0_hatexplain_l=0{l+1}_h=1_adam", "checkpoints", "best.ckpt")
    hparams = os.path.join(log_path, "PurAttention", f"run=0_hatexplain_l=0{l+1}_h=1_adam", "hparams.yaml")

    # the model
    model = AttitModel.load_from_checkpoint(ckp, hparams_file=hparams, **model_args)
    model = model.eval()

    with torch.no_grad():
        model = model.to(DEVICE)
        pbar = tqdm(enumerate(test_dataloader), total = int(999/32))
        for id_batch, batch in pbar:

            pbar.set_description("proceed the similarity metric")
            ids = batch["token_ids"].to(DEVICE)

            # PADDING
            padding_mask = batch["padding_mask"].bool().to(DEVICE)
            buff_mask = torch.isin(ids, spec_ids)
            embedding_padding = padding_mask.clone()
            embedding_padding[buff_mask] = 1.

            # OUTPUTS
            output = model(ids=ids, mask=padding_mask)
            cl = output["logits"].argmax(dim=-1)
            cpt[l] += (cl == batch["y_true"].to(DEVICE)).sum().item()
            k, v, emb = output["key_embeddings"], output["value_embeddings"], output["hidden_states"]

            for i in range(l+1):
                # calculus of the metrics
                sim_k = cosine_sim(k[i], padding_mask, normalize="")
                sim_v = cosine_sim(v[i], padding_mask, normalize="")
                sim_e = cosine_sim(emb[i], embedding_padding, normalize="")

                # update dictionnaries
                sim_k_dict[f"n_layer={l+1}"][i] += sim_k.sum().item()
                sim_v_dict[f"n_layer={l+1}"][i] += sim_v.sum().item()
                sim_emb_dict[f"n_layer={l+1}"][i] += sim_e.sum().item()

    model = model.cpu()
    del model
    torch.cuda.empty_cache()

for k in sim_k_dict:
    sim_k_dict[k] = sim_k_dict[k] / 999
    sim_v_dict[k] = sim_v_dict[k] / 999
    sim_emb_dict[k] = sim_emb_dict[k] / 999
;

In [46]:
# Accuracy (in %) of each model depth: correct predictions divided by the real
# size of the HateXplain test set instead of a hard-coded 999.
(cpt / len(test_dataloader.dataset)).cpu() * 100

tensor([62.4625, 62.4625, 63.6637, 63.9640, 62.7628])

In [47]:
# Key-embedding similarity on HateXplain: for each model depth, one value per
# layer (sum of `cosine_sim` over the test batches divided by the sample count).
sim_k_dict

{'n_layer=1': array([0.71326747]),
 'n_layer=2': array([0.69151901, 0.68288404]),
 'n_layer=3': array([0.69846342, 0.68784525, 0.84177555]),
 'n_layer=4': array([0.61357826, 0.61971813, 0.74860878, 0.83997919]),
 'n_layer=5': array([0.62424775, 0.64732869, 0.77722902, 0.91285675, 0.97317868])}

In [48]:
# Value-embedding similarity on HateXplain: for each model depth, one value per
# layer (sum of `cosine_sim` over the test batches divided by the sample count).
sim_v_dict

{'n_layer=1': array([0.54221579]),
 'n_layer=2': array([0.51039431, 0.7399584 ]),
 'n_layer=3': array([0.60546832, 0.68796405, 0.88236577]),
 'n_layer=4': array([0.59191521, 0.56141785, 0.78510146, 0.93110824]),
 'n_layer=5': array([0.60675967, 0.67302059, 0.85818866, 0.95838471, 0.9922627 ])}

In [49]:
# Hidden-state similarity on HateXplain: for each model depth, one value per
# layer, computed with the mask that also flags <cls>/<pad>/<unk> positions.
sim_emb_dict

{'n_layer=1': array([0.30715777]),
 'n_layer=2': array([0.30740158, 0.63975486]),
 'n_layer=3': array([0.30707839, 0.65768343, 0.80064176]),
 'n_layer=4': array([0.30735863, 0.54912228, 0.71639724, 0.8791464 ]),
 'n_layer=5': array([0.30678413, 0.60209061, 0.7905752 , 0.9227977 , 0.98004375])}

# Yelp Hat

In [56]:
%%capture
sim_k_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_v_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_emb_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}

dm_kwargs = dict(cache_path=dataset_path,
                 batch_size=32,
                 num_workers=0,
                 n_data=999
                 )

dm = CLSTokenYelpHatDM(**dm_kwargs)
dm.prepare_data()
dm.setup(stage="test")
test_dataloader_yh = dm.test_dataloader()

model_args = dict(
        cache_path=model_path,
        mode="exp",
        vocab=dm.vocab,
        lambda_entropy=0,
        lambda_supervise=0,
        lambda_lagrange=0,
        pretrained_vectors="glove.840B.300d",
        num_layers=1,
        num_heads=1,
        d_embedding=300,
        data="yelphat",
        num_class=dm.num_class,
        opt="adam"
)
cpt = torch.tensor([0, 0, 0, 0, 0], device=DEVICE)

for test_dataloader in test_dataloader_yh :

    for l in range(5) :

        # update the args for the model
        model_args["num_layers"] = l+1
        ckp = os.path.join(log_path, "PurAttention", f"run=0_yelphat50_l=0{l+1}_h=1_adam", "checkpoints", "best.ckpt")
        hparams = os.path.join(log_path, "PurAttention", f"run=0_yelphat50_l=0{l+1}_h=1_adam", "hparams.yaml")

        # the model
        model = AttitModel.load_from_checkpoint(ckp, hparams_file=hparams, **model_args)
        model = model.eval()

        with torch.no_grad():
            model = model.to(DEVICE)
            pbar = tqdm(enumerate(test_dataloader), total = int(999/32))
            for id_batch, batch in pbar:

                pbar.set_description("proceed the similarity metric")
                ids = batch["token_ids"].to(DEVICE)

                # PADDING
                padding_mask = batch["padding_mask"].bool().to(DEVICE)
                buff_mask = torch.isin(ids, spec_ids)
                embedding_padding = padding_mask.clone()
                embedding_padding[buff_mask] = 1.

                # OUTPUTS
                output = model(ids=ids, mask=padding_mask)
                cl = output["logits"].argmax(dim=-1)
                cpt[l] += (cl == batch["y_true"].to(DEVICE)).sum().item()
                k, v, emb = output["key_embeddings"], output["value_embeddings"], output["hidden_states"]

                assert len(v)+1 == len(emb), "errors : (1)"
                assert len(k)+1 == len(emb), "errors : (2)"

                for i in range(l+1):
                    # calculus of the metrics
                    sim_k = cosine_sim(k[i], padding_mask, normalize="")
                    sim_v = cosine_sim(v[i], padding_mask, normalize="")
                    sim_e = cosine_sim(emb[i], embedding_padding, normalize="")

                    # update dictionnaries
                    sim_k_dict[f"n_layer={l+1}"][i] += sim_k.sum().item()
                    sim_v_dict[f"n_layer={l+1}"][i] += sim_v.sum().item()
                    sim_emb_dict[f"n_layer={l+1}"][i] += sim_e.sum().item()

        model = model.cpu()
        del model
        torch.cuda.empty_cache()

nb_samples = sum([len(test_dataloader_yh[i].dataset) for i in range(3)])

for k in sim_k_dict:
    sim_k_dict[k] = sim_k_dict[k] / nb_samples
    sim_v_dict[k] = sim_v_dict[k] / nb_samples
    sim_emb_dict[k] = sim_emb_dict[k] / nb_samples

In [57]:
# Accuracy of each model depth on Yelp-Hat, as a fraction (not a percentage):
# correct predictions divided by the number of test samples.
(cpt/nb_samples).cpu()

tensor([0.8702, 0.8638, 0.8676, 0.8560, 0.8817])

In [58]:
# Key-embedding similarity on Yelp-Hat: for each model depth, one value per
# layer (sum of `cosine_sim` over the test batches divided by `nb_samples`).
sim_k_dict

{'n_layer=1': array([0.57841434]),
 'n_layer=2': array([0.64860994, 0.55575706]),
 'n_layer=3': array([0.59687076, 0.43072539, 0.5372692 ]),
 'n_layer=4': array([0.71363631, 0.71653453, 0.70200274, 0.85986021]),
 'n_layer=5': array([0.58451316, 0.54224598, 0.76112572, 0.81632189, 0.95989591])}

In [59]:
# Value-embedding similarity on Yelp-Hat: for each model depth, one value per
# layer (sum of `cosine_sim` over the test batches divided by `nb_samples`).
sim_v_dict

{'n_layer=1': array([0.3719648]),
 'n_layer=2': array([0.40870349, 0.51055599]),
 'n_layer=3': array([0.46146833, 0.37069772, 0.49411083]),
 'n_layer=4': array([0.42850584, 0.61289558, 0.80302721, 0.90383451]),
 'n_layer=5': array([0.41716203, 0.62437506, 0.77994665, 0.90231054, 0.97221654])}

In [60]:
# Hidden-state similarity on Yelp-Hat: for each model depth, one value per
# layer, computed with the mask that also flags the `spec_ids` positions.
sim_emb_dict

{'n_layer=1': array([0.38078402]),
 'n_layer=2': array([0.37996527, 0.50289467]),
 'n_layer=3': array([0.38148685, 0.38971275, 0.44309147]),
 'n_layer=4': array([0.3792807 , 0.5504015 , 0.74507838, 0.85944818]),
 'n_layer=5': array([0.37956235, 0.54823069, 0.72569706, 0.86386779, 0.95190411])}

# E-SNLI

In [6]:
# Settings forwarded to the e-SNLI data module.
dm_kwargs = {
    "cache_path": dataset_path,
    "batch_size": 32,
    "num_workers": 0,
    "n_data": 999,
}

# Build the data module and fetch its test dataloader.
dm = CLSTokenESNLIDM(**dm_kwargs)
dm.prepare_data()
dm.setup(stage="test")
test_dataloader = dm.test_dataloader()

# Vocabulary ids of the special tokens, stored on the compute device.
special_tokens = ["<cls>", "<pad>", "<unk>"]
spec_ids = torch.tensor(dm.vocab(special_tokens), device=DEVICE)

In [12]:
%%capture
sim_k_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_v_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}
sim_emb_dict = {
    f"n_layer={i+1}" : np.zeros((i+1,)) for i in range(5)
}

model_args = dict(
        cache_path=model_path,
        mode="exp",
        vocab=dm.vocab,
        lambda_entropy=0,
        lambda_supervise=0,
        lambda_lagrange=0,
        pretrained_vectors="glove.840B.300d",
        num_layers=1,
        num_heads=1,
        d_embedding=300,
        data="esnli",
        num_class=dm.num_class,
        opt="adam"
)
cpt = torch.tensor([0, 0, 0, 0, 0], device=DEVICE)
for l in range(2) :

    # update the args for the model
    model_args["num_layers"] = l+1
    ckp = os.path.join(log_path, "PurAttention", f"run=0_esnli_l=0{l+1}_h=1_adam", "checkpoints", "best.ckpt")
    hparams = os.path.join(log_path, "PurAttention", f"run=0_esnli_l=0{l+1}_h=1_adam", "hparams.yaml")

    # the model
    model = AttitModel.load_from_checkpoint(ckp, hparams_file=hparams, **model_args)
    model = model.eval()

    with torch.no_grad():
        model = model.to(DEVICE)
        pbar = tqdm(enumerate(test_dataloader), total = int(999/32))
        for id_batch, batch in pbar:

            pbar.set_description("proceed the similarity metric")
            ids = batch["token_ids"].to(DEVICE)

            # padding
            padding_mask = batch["padding_mask"].bool().to(DEVICE)
            buff_mask = torch.isin(ids, spec_ids)
            embedding_padding = padding_mask.clone()
            embedding_padding[buff_mask] = 1.

            output = model(ids=ids, mask=padding_mask)

            # check accuracy
            cl = output["logits"].argmax(dim=-1)
            cpt[l] += (cl == batch["y_true"].to(DEVICE)).sum().item()

            # get the embeddings
            k, v, emb = output["key_embeddings"], output["value_embeddings"], output["hidden_states"]

            for i in range(l+1):
                # calculus of the metrics
                sim_k = cosine_sim(k[i], padding_mask, normalize="")
                sim_v = cosine_sim(v[i], padding_mask, normalize="")
                sim_e = cosine_sim(emb[i], embedding_padding, normalize="")

                sim_k_dict[f"n_layer={l+1}"][i] += sim_k.sum().item()
                sim_v_dict[f"n_layer={l+1}"][i] += sim_v.sum().item()
                sim_emb_dict[f"n_layer={l+1}"][i] += sim_e.sum().item()

    model = model.cpu()
    del model
    torch.cuda.empty_cache()

nb_samples = len(test_dataloader.dataset)

for k in sim_k_dict:
    sim_k_dict[k] = sim_k_dict[k] / nb_samples
    sim_v_dict[k] = sim_v_dict[k] / nb_samples
    sim_emb_dict[k] = sim_emb_dict[k] / nb_samples

In [13]:
# Accuracy of each model depth on e-SNLI, as a fraction. Normalised by the
# same `nb_samples` as the similarity scores instead of a hard-coded 999.
# Only depths 1 and 2 were evaluated, so the remaining entries are 0.
(cpt.cpu() / nb_samples)

tensor([0.6276, 0.7297, 0.0000, 0.0000, 0.0000])

In [14]:
# Key-embedding similarity on e-SNLI, one value per layer for each model depth.
# Only depths 1 and 2 are evaluated above, so the other entries remain zero.
sim_k_dict

{'n_layer=1': array([0.65682015]),
 'n_layer=2': array([0.71875791, 0.48821352]),
 'n_layer=3': array([0., 0., 0.]),
 'n_layer=4': array([0., 0., 0., 0.]),
 'n_layer=5': array([0., 0., 0., 0., 0.])}

In [15]:
# Value-embedding similarity on e-SNLI, one value per layer for each model depth.
# Only depths 1 and 2 are evaluated above, so the other entries remain zero.
sim_v_dict

{'n_layer=1': array([0.52405099]),
 'n_layer=2': array([0.18200549, 0.74587911]),
 'n_layer=3': array([0., 0., 0.]),
 'n_layer=4': array([0., 0., 0., 0.]),
 'n_layer=5': array([0., 0., 0., 0., 0.])}

In [16]:
# Hidden-state similarity on e-SNLI, one value per layer for each model depth.
# Only depths 1 and 2 are evaluated above, so the other entries remain zero.
sim_emb_dict

{'n_layer=1': array([0.35232755]),
 'n_layer=2': array([0.34540131, 0.59773792]),
 'n_layer=3': array([0., 0., 0.]),
 'n_layer=4': array([0., 0., 0., 0.]),
 'n_layer=5': array([0., 0., 0., 0., 0.])}