In [5]:
%load_ext autoreload
%autoreload 2

import os
from os import path
import sys
# Remember where the kernel started so the working directory can be restored
# if the repository root is never found.
start_dir = os.getcwd()
cwd = start_dir.split(os.path.sep)

# Point to the git repository: climb one directory at a time until the working
# directory is the repository root folder.
while cwd[-1] != "ExplanationPairSentencesTasks":
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    if parent_dir == current_dir:
        # The filesystem root is its own parent, so climbing further would
        # leave the directory unchanged and this loop would never terminate.
        # Undo the directory changes and fail loudly instead.
        os.chdir(start_dir)
        raise FileNotFoundError(
            "Could not find the 'ExplanationPairSentencesTasks' repository in "
            f"any parent directory of {start_dir}"
        )
    os.chdir(parent_dir)
    cwd = os.getcwd().split(os.path.sep)
print(f">> current directory : {os.getcwd()}")

# Add the `src` directory to the import path. The membership check keeps
# re-runs of this cell from piling up duplicate entries in `sys.path`.
src_path = path.join(os.getcwd(), "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Cache folders, all located under `<repository root>/.cache`.
cache_path = path.join(os.getcwd(), '.cache')
dataset_path = path.join(cache_path, 'dataset')
log_path = path.join(cache_path, 'logs')
model_path = path.join(cache_path, 'models')
print(f">> cache path : {cache_path}")
print(f">> model path : {model_path}")
print(f">> dataset path : {dataset_path}")
print(f">> logs path : {log_path}")

# Data Modules
from src.data_module.hatexplain import CLSTokenHateXPlainDM
from src.data_module.esnli import CLSTokenESNLIDM
from src.data_module.yelp_hat import CLSTokenYelpHat50DM, CLSTokenYelpHatDM

# Model
from pur_attention_key_reg import AttitModel

# Utils
from notebooks.attention_based.utils.attention_rollout import rollout
from notebooks.attention_based.utils.attention_embeddings import dict_print

# External libraries
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.notebook import tqdm
import pandas as pd

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f">> device : {DEVICE}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
>> current directory : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks
>> cache path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache
>> model path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\models
>> dataset path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\dataset
>> logs path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\logs
>> device : cuda


# HateXplain

In [9]:
# Keyword arguments forwarded to the HateXplain data module.
dm_kwargs = {
    "cache_path": dataset_path,
    "batch_size": 32,
    "num_workers": 0,
    "n_data": 900,
}

dm = CLSTokenHateXPlainDM(**dm_kwargs)
dm.prepare_data()
dm.setup(stage="test")
test_dataloader = dm.test_dataloader()

# Arguments passed to `AttitModel.load_from_checkpoint`; `num_layers` is
# overwritten for every checkpoint in the loop below.
model_args = {
    "cache_path": model_path,
    "mode": "exp",
    "vocab": dm.vocab,
    "lambda_entropy": 0,
    "lambda_supervise": 0,
    "lambda_lagrange": 0,
    "pretrained_vectors": "glove.840B.300d",
    "num_layers": 1,
    "num_heads": 1,
    "d_embedding": 300,
    "data": "hatexplain",
    "num_class": dm.num_class,
    "opt": "adam",
}

# One slot per checkpoint: slot k receives the rollout result of the model
# loaded with num_layers = k + 1.
res_array = [None] * 5
for layer_idx in range(len(res_array)):
    n_layers = layer_idx + 1
    model_args["num_layers"] = n_layers

    # Both the checkpoint and its hyper-parameter file live in the same run folder.
    run_dir = os.path.join(log_path, "PurAttention", f"run=0_hatexplain_l=0{n_layers}_h=1_adam")
    ckpt_file = os.path.join(run_dir, "checkpoints", "best.ckpt")
    hparams_file = os.path.join(run_dir, "hparams.yaml")

    # Load the model and switch it to evaluation mode.
    model = AttitModel.load_from_checkpoint(ckpt_file, hparams_file=hparams_file, **model_args)
    model = model.eval()

    res_array[layer_idx] = rollout(model.to(DEVICE), dm)

proceed the cosine map: : 29it [00:00, 50.62it/s]                      


test passed : torch.Size([21056])



proceed the cosine map: : 29it [00:00, 69.74it/s]                      


test passed : torch.Size([21056])



proceed the cosine map: : 29it [00:00, 56.94it/s]                      


test passed : torch.Size([21056])



proceed the cosine map: : 29it [00:00, 48.66it/s]                      


test passed : torch.Size([21056])



proceed the cosine map: : 29it [00:00, 45.20it/s]                      

test passed : torch.Size([21056])






In [13]:
# Report the "AUC - cos" entry of each rollout result, one line per entry of
# `res_array`.
for idx, rollout_res in enumerate(res_array):
    label = f"AUC of the rollout for the layer {idx} : "
    print(label, rollout_res["AUC - cos"], sep="")

AUC of the rollout for the layer 0 : 0.584617025163806
AUC of the rollout for the layer 1 : 0.5926617317105185
AUC of the rollout for the layer 2 : 0.5951004462501168
AUC of the rollout for the layer 3 : 0.5732684009178745
AUC of the rollout for the layer 4 : 0.5866986012662915


In [12]:
# Compare every rollout result after the first one against the first entry of
# `res_array`, using the mean squared difference of their "rollout_values".
for idx, rollout_res in enumerate(res_array[1:], start=1):
    reference_values = res_array[0]["rollout_values"]
    delta = rollout_res["rollout_values"] - reference_values
    label = f"mse calculated between layer 0 and rollout layer {idx} : "
    print(label, np.mean(delta ** 2), sep="")

mse calculated between layer 0 and rollout layer 1 : 0.72984844
mse calculated between layer 0 and rollout layer 2 : 0.81446034
mse calculated between layer 0 and rollout layer 3 : 0.6375018
mse calculated between layer 0 and rollout layer 4 : 0.6733792


Il n'y a pas de réelle différence entre nos différents modèles. Ceci vient directement du comportement de l'attention dans ce genre de réseau quand on rajoute plusieurs couches.
Ici, le fait de rajouter des couches ne change absolument rien au comportement de notre modèle.