In [89]:

import numpy as np
import torch
from datasets import load_dataset
from datasets.arrow_dataset import concatenate_datasets

data_set = load_dataset("nlu_evaluation_data",split='train')
labels = data_set.features["label"].names
scenarios = list(set(map(lambda x: x.split("_")[0], labels)))


Using custom data configuration default
Reusing dataset nlu_evaluation_data (/home/sid/.cache/huggingface/datasets/nlu_evaluation_data/default/1.1.0/0416a5876d8240bd571f2bc2ad421cf6e6e88d938f8dcb5fd87b5af6033d6282)


In [90]:
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("roberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def add_mask(example):
    example['text'] = example["text"] + "[MASK]"
    return example

def preprocess_function(examples):
    return tokenizer(examples["text"], add_special_tokens=True, truncation=True)

data_set = data_set.map(add_mask)
tokenized_data_set = data_set.map(preprocess_function, batched=True)

model = AutoModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda")
model.to(device)




Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

0ex [00:00, ?ex/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 6.00 GiB total capacity; 3.28 GiB already allocated; 0 bytes free; 3.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
label_embeddings = { lab: { } for lab in labels}
scen_embeddings = { scen: { } for scen in scenarios}

for label in labels:
    tokenized_label = tokenizer(label + '[MASK]', add_special_tokens=True)
    inp_tensor = torch.LongTensor(tokenized_label['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    label_embeddings[label]['cls'] = out[0]
    label_embeddings[label]['mask'] = out[-2]
    label_embeddings[label]['avg'] = np.mean(out[1:-2], 0)

for scen in scenarios:
    tokenized_label = tokenizer(scen + '[MASK]', add_special_tokens=True)
    inp_tensor = torch.LongTensor(tokenized_label['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    scen_embeddings[scen]['cls'] = out[0]
    scen_embeddings[scen]['mask'] = out[-2]
    scen_embeddings[scen]['avg'] = np.mean(out[1:-2], 0)
  

In [88]:

from tqdm import tqdm

label_inner_prods = {'cls': 0.0, 'mask': 0.0, 'avg': 0.0}
scen_inner_prods = {'cls': 0.0, 'mask': 0.0, 'avg': 0.0}

for ind in tqdm(range(len(tokenized_data_set))):
    inp = tokenized_data_set[ind]
    act_label = labels[inp['label']]
    act_scen = inp['scenario']
    inp_tensor = torch.LongTensor(inp['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    
    cls_emb = out[0]
    mask_emb = out[-2]
    avg_emb = np.mean(out[1:-2], 0)

    label_inner_prods['cls'] += cls_emb @ label_embeddings[act_label]['cls'] / sum([cls_emb @ label_embeddings[lab]['cls'] for lab in labels])
    label_inner_prods['mask'] += mask_emb @ label_embeddings[act_label]['mask'] / sum([mask_emb @ label_embeddings[lab]['mask'] for lab in labels])
    label_inner_prods['avg'] += avg_emb @ label_embeddings[act_label]['avg'] / sum([avg_emb @ label_embeddings[lab]['avg'] for lab in labels])

    scen_inner_prods['cls'] += cls_emb @ scen_embeddings[act_scen]['cls'] / sum([cls_emb @ scen_embeddings[scen]['cls'] for scen in scenarios])
    scen_inner_prods['mask'] += mask_emb @ scen_embeddings[act_scen]['mask'] / sum([mask_emb @ scen_embeddings[scen]['mask'] for scen in scenarios])
    scen_inner_prods['avg'] += avg_emb @ scen_embeddings[act_scen]['avg'] / sum([avg_emb @ scen_embeddings[scen]['avg'] for scen in scenarios])

for tok in label_inner_prods:
    label_inner_prods[tok] /= len(tokenized_data_set)

for tok in scen_inner_prods:
    scen_inner_prods[tok] /= len(tokenized_data_set)

  2%|▏         | 510/25715 [00:14<12:07, 34.63it/s]


KeyboardInterrupt: 

{'cls': 0.05716460349866881, 'mask': 0.05914497603459204, 'avg': 0.07149045651237348}


In [None]:
# Inner product analysis
