In [5]:
import torch
import numpy as np

import datasets
from tqdm.notebook import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding
)
from GlobEnc.src.modeling.modeling_roberta_saliency import RobertaForSequenceClassification

import time

2023-01-20 20:52:06.432151: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-20 20:52:06.607363: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-20 20:52:07.199494: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/modaresi/.conda/envs/globenc-venv/lib/
2023-01-20 20:52:07.199613: W tensorflow/

In [10]:
models_dir = "/home/modaresi/projects/globenc_analysis/outputs/models"
outputs_dir = "/home/modaresi/projects/globenc_analysis/outputs/saliencies_angle"
configs = {
    "mnli": {
        "model_path": lambda step: f"{models_dir}/output_mnli_bert-base-uncased_0001_SEED0042/checkpoint-{step}/",
        "output_file_path": lambda step: f"{outputs_dir}/mnli_bert-base-uncased_0001_SEED0042_checkpoint-{step}.npy",
        "hf_ds": "mnli",
    },
    "sst2": {
        "model_path": lambda step: f"{models_dir}/output_sst2_bert-base-uncased_0001_SEED0042/checkpoint-{step}/",
        "output_file_path": lambda step: f"{outputs_dir}/sst2_bert-base-uncased_0001_SEED0042_checkpoint-{step}.npy",
        "hf_ds": "sst2",
    }
}

CONFIG_NAME = "mnli"
CONFIG = configs[CONFIG_NAME]
MAX_LENGTH = 128
BATCH_SIZE = 32
SEED = 42
DATA_SECTION = "validation_matched"
# STEP = 10525
STEP = 61360

In [11]:
original_ds = datasets.load_dataset("glue", CONFIG["hf_ds"])[DATA_SECTION]
original_ds

Reusing dataset glue (/opt/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 9815
})

In [12]:
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=True,
)

def _get_preprocessing_function(
    sentence1_key: str, 
    sentence2_key: str = None, 
    label_to_id: dict = None):

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=False, max_length=MAX_LENGTH, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result
    
    return preprocess_function

preprocess_function = _get_preprocessing_function(sentence1_key="premise", sentence2_key="hypothesis")
# preprocess_function = _get_preprocessing_function(sentence1_key="sentence")
train_ds = original_ds.map(preprocess_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [13]:
train_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
collator = DataCollatorWithPadding(tokenizer, True, MAX_LENGTH, return_tensors="pt")
dataloader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, collate_fn=collator)
dataset_size = len(train_ds)
steps = int(np.ceil(dataset_size / BATCH_SIZE))
num_labels = len(set(original_ds['label']))

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(CONFIG["model_path"](STEP))
model.to(torch.device("cuda:0"))
model.eval()

all_sals = torch.zeros(size=(dataset_size, MAX_LENGTH)).cuda()
it = iter(dataloader)

for i in tqdm(range(steps)):
    batch = next(it)
    batch = {k: v.to(torch.device('cuda:0')) for k, v in batch.items()}
    inputs = {
        'input_ids': batch['input_ids'],
        'attention_mask': batch['attention_mask'],
        'token_type_ids': batch['token_type_ids'],
    }
    labels = batch['labels']
    output = model(**batch, output_hidden_states=True)
    
    output.hidden_states[0].retain_grad()
    logits = output.logits
    target_class_l_sum = torch.gather(logits, 1, labels.unsqueeze(-1)).sum()
    target_class_l_sum.backward()
    
    inputXgradient = output.hidden_states[0].grad * output.hidden_states[0]
    # saliencies = torch.norm(inputXgradient, dim=-1).detach()
    saliencies = torch.sum(inputXgradient, dim=-1).detach()
    
    length = saliencies.size()[1]
    model.zero_grad()
    all_sals[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :length] = saliencies

  0%|          | 0/307 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


In [15]:
all_sals = all_sals.cpu().numpy()
np.save(CONFIG["output_file_path"](STEP), all_sals)

In [1]:
models_dir = "/home/modaresi/projects/globenc_analysis/outputs/models"
outputs_dir = "/home/modaresi/projects/globenc_analysis/outputs/saliencies"

# configs = {
#     "mnli": {
#         "model_path": lambda step: f"{models_dir}/output_mnli_bert-base-uncased_0001_SEED0042/checkpoint-{step}/",
#         "output_file_path": lambda step, agg_t, bl_t: f"{outputs_dir}/mnli_bert-base-uncased_0001_SEED0042_checkpoint-{step}-{agg_t}-{bl_t}.npy",
#         "hf_ds": "mnli",
#     },
#     "sst2": {
#         "model_path": lambda step: f"{models_dir}/output_sst2_bert-base-uncased_0001_SEED0042/checkpoint-{step}/",
#         "output_file_path": lambda step, agg_t, bl_t: f"{outputs_dir}/sst2_bert-base-uncased_0001_SEED0042_checkpoint-{step}-{agg_t}-{bl_t}.npy",
#         "hf_ds": "sst2",
#     }
# }

MODEL_DATASET_SET_PARTS = [
#     (f"{models_dir}/output_cola_bert-base-uncased_0001_SEED0042/checkpoint-1340", "cola", "validation"),
#     (f"{models_dir}/output_sst2_bert-base-uncased_0001_SEED0042/checkpoint-10525", "sst2", "train", ["sentence"]),
#     (f"{models_dir}/output_sst2_bert-large-uncased_0001_SEED0042/checkpoint-10525", "sst2", "validation"),
#     (f"{models_dir}/output_mrpc_bert-base-uncased_0001_SEED0042/checkpoint-575", "mrpc", "validation"),
#     (f"{models_dir}/output_qnli_bert-base-uncased_0001_SEED0042/checkpoint-16370", "qnli", "train", ["question", "sentence"]),
#     (f"{models_dir}/output_mnli_bert-base-uncased_0001_SEED0042/checkpoint-61360", "mnli", "train", ["premise", "hypothesis"]),
#     (f"{models_dir}/output_hatexplain_bert-base-uncased_0001_SEED0042/checkpoint-2405", "hatexplain", "train", ["text"]),
    (f"WillHeld/roberta-base-sst2", "sst2", "validation", ["sentence"]),
    (f"WillHeld/roberta-base-mnli", "mnli", "validation_matched", ["premise", "hypothesis"]),
]

# CONFIG_NAME = "mnli"
# CONFIG = configs[CONFIG_NAME]
MAX_LENGTH = 128
BATCH_SIZE = 32
SEED = 42
# DATA_SECTION = "validation_matched"
AGGREGATION_TYPES = ["NORM", "SUM"]
# STEP = 10525
# STEP = 61360

In [2]:
def _get_preprocessing_function(
    sentence1_key: str, 
    sentence2_key: str = None, 
    label_to_id: dict = None):

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=False, max_length=MAX_LENGTH, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result
    
    return preprocess_function

def aggregate_hatexplain(example):
    def mode(lst):
        return max(set(lst), key=lst.count)
    example["label"] = mode(example["annotators"]["label"])
    example["text"] = " ".join(example["post_tokens"])
    return example

In [8]:
def get_saliencies(dataloader, steps, model, aggregation_type=None, prediction_based=True):
    all_sals_norm_aggregated = np.zeros(shape=(dataset_size, MAX_LENGTH))
    all_sals_sum_aggregated = np.zeros(shape=(dataset_size, MAX_LENGTH))
    it = iter(dataloader)
    
    for i in tqdm(range(steps)):
        batch = next(it)
        batch = {k: v.to(torch.device('cuda:0')) for k, v in batch.items()}
        inputs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
#             'token_type_ids': batch['token_type_ids'],
        }
        labels = batch['labels']
        output = model(**batch, output_hidden_states=True)

        output.hidden_states[0].retain_grad()
        logits = output.logits
        if prediction_based:
            target_class_l_sum = torch.gather(logits, 1, torch.argmax(logits, dim=-1).unsqueeze(-1)).sum()
        else:
            target_class_l_sum = torch.gather(logits, 1, labels.unsqueeze(-1)).sum()
        target_class_l_sum.backward()

        inputXgradient = output.hidden_states[0].grad * output.hidden_states[0]
        # saliencies = torch.norm(inputXgradient, dim=-1).detach()
#         saliencies = torch.sum(inputXgradient, dim=-1).detach()
#         if aggregation_type == "SUM":
#             saliencies = torch.sum(inputXgradient, dim=-1).detach().cpu()
#         elif aggregation_type == "NORM":
#             saliencies = torch.norm(inputXgradient, dim=-1).detach().cpu()
        saliencies_norm_aggregated = torch.norm(inputXgradient, dim=-1).detach().cpu()
        saliencies_sum_aggregated = torch.sum(inputXgradient, dim=-1).detach().cpu()

        length = saliencies_norm_aggregated.size()[1]
        model.zero_grad()
        all_sals_norm_aggregated[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :length] = saliencies_norm_aggregated
        all_sals_sum_aggregated[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :length] = saliencies_sum_aggregated
#         all_sals[i*BATCH_SIZE:(i+1)*BATCH_SIZE, :length] = saliencies
        
    return all_sals_norm_aggregated, all_sals_sum_aggregated

In [9]:
time_reports = dict()

tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    use_fast=True,
)

for model_dataset_set_parts in MODEL_DATASET_SET_PARTS:
    model_checkpoint, task_name, set_of_data, sample_parts = model_dataset_set_parts
#     model = BertForSequenceClassification.from_pretrained(model_checkpoint)
    model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
    model.to(torch.device("cuda:0"))
    model.eval()
    
    if task_name == "hatexplain":
        original_ds = datasets.load_dataset(task_name)[set_of_data].map(aggregate_hatexplain)
        preprocess_function = _get_preprocessing_function(sentence1_key=sample_parts[0])
        train_ds = original_ds.map(preprocess_function, batched=True)
    else:
        original_ds = datasets.load_dataset("glue", task_name)[set_of_data]
        if len(sample_parts) == 1:
            preprocess_function = _get_preprocessing_function(sentence1_key=sample_parts[0])
        elif len(sample_parts) == 2:
            preprocess_function = _get_preprocessing_function(sentence1_key=sample_parts[0], sentence2_key=sample_parts[1])
        train_ds = original_ds.map(preprocess_function, batched=True)
    
#     train_ds.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    collator = DataCollatorWithPadding(tokenizer, True, MAX_LENGTH, return_tensors="pt")
    dataloader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, collate_fn=collator)
    dataset_size = len(train_ds)
    steps = int(np.ceil(dataset_size / BATCH_SIZE))
    num_labels = len(set(original_ds['label']))
    
#     for agg_t in AGGREGATION_TYPES:
#         for gradient_base in ["prediction_based", "label_based"]:
    for gradient_base in ["prediction_based"]:
        t1 = time.time()
        saliencies_norm_aggregated, saliencies_sum_aggregated = get_saliencies(
            dataloader=dataloader,
            steps=steps,
            model=model,
#             aggregation_type=agg_t,
            prediction_based=(gradient_base=="prediction_based")
        )
        t2 = time.time()

        time_reports[f"{task_name}-{set_of_data}-{AGGREGATION_TYPES[0]}-{gradient_base}"] = t2 - t1

        file_name_norm_aggregated = f"[{task_name}]_[{set_of_data}]_[{model_checkpoint.split('/')[-1]}]_[IXG_{AGGREGATION_TYPES[0]}_{gradient_base}]"
        file_name_sum_aggregated = f"[{task_name}]_[{set_of_data}]_[{model_checkpoint.split('/')[-1]}]_[IXG_{AGGREGATION_TYPES[1]}_{gradient_base}]"
        
#         np.save(f"{outputs_dir}/{file_name}.npy", saliencies)
        np.save(f"{outputs_dir}/{file_name_norm_aggregated}.npy", saliencies_norm_aggregated)
        np.save(f"{outputs_dir}/{file_name_sum_aggregated}.npy", saliencies_sum_aggregated)

Using the latest cached version of the module from /opt/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Sat Sep  3 17:51:06 2022) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.
Reusing dataset glue (/opt/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Using the latest cached version of the module from /opt/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Sat Sep  3 17:51:06 2022) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.
Reusing dataset glue (/opt/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/307 [00:00<?, ?it/s]

In [None]:
print(time_reports)