In [1]:

import numpy as np
import torch
from datasets import load_dataset
from datasets.arrow_dataset import concatenate_datasets

data_set = load_dataset("nlu_evaluation_data",split='train')
labels = data_set.features["label"].names
scenarios = list(set(map(lambda x: x.split("_")[0], labels)))


Using custom data configuration default
Reusing dataset nlu_evaluation_data (/home/siddharthami_umass_edu/.cache/huggingface/datasets/nlu_evaluation_data/default/1.1.0/0416a5876d8240bd571f2bc2ad421cf6e6e88d938f8dcb5fd87b5af6033d6282)


In [6]:
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("roberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def add_mask(example):
    example['text'] = example["text"] + "[MASK]"
    return example

def preprocess_function(examples):
    return tokenizer(examples["text"], add_special_tokens=True, truncation=True)

data_set = data_set.map(add_mask)
tokenized_data_set = data_set.map(preprocess_function, batched=True)

model = AutoModel.from_pretrained("roberta-large")
device = torch.device("cuda")
model.to(device)

# Model created

0ex [00:00, ?ex/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (d

In [3]:
label_embeddings = { lab: { } for lab in labels}
scen_embeddings = { scen: { } for scen in scenarios}

for label in labels:
    tokenized_label = tokenizer(label + '[MASK]', add_special_tokens=True)
    inp_tensor = torch.LongTensor(tokenized_label['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    label_embeddings[label]['cls'] = out[0]
    label_embeddings[label]['mask'] = out[-2]
    label_embeddings[label]['avg'] = np.mean(out[1:-2], 0)

for scen in scenarios:
    tokenized_label = tokenizer(scen + '[MASK]', add_special_tokens=True)
    inp_tensor = torch.LongTensor(tokenized_label['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    scen_embeddings[scen]['cls'] = out[0]
    scen_embeddings[scen]['mask'] = out[-2]
    scen_embeddings[scen]['avg'] = np.mean(out[1:-2], 0)
  

In [4]:

from tqdm import tqdm

label_inner_prods = {'cls': 0.0, 'mask': 0.0, 'avg': 0.0}
scen_inner_prods = {'cls': 0.0, 'mask': 0.0, 'avg': 0.0}

for ind in tqdm(range(len(tokenized_data_set))):
    inp = tokenized_data_set[ind]
    act_label = labels[inp['label']]
    act_scen = inp['scenario']
    inp_tensor = torch.LongTensor(inp['input_ids']).unsqueeze(0).to(device)
    out = model(inp_tensor)[0].squeeze(0).cpu().detach().numpy()
    
    cls_emb = out[0]
    mask_emb = out[-2]
    avg_emb = np.mean(out[1:-2], 0)

    label_inner_prods['cls'] += cls_emb @ label_embeddings[act_label]['cls'] / sum([cls_emb @ label_embeddings[lab]['cls'] for lab in labels])
    label_inner_prods['mask'] += mask_emb @ label_embeddings[act_label]['mask'] / sum([mask_emb @ label_embeddings[lab]['mask'] for lab in labels])
    label_inner_prods['avg'] += avg_emb @ label_embeddings[act_label]['avg'] / sum([avg_emb @ label_embeddings[lab]['avg'] for lab in labels])

    scen_inner_prods['cls'] += cls_emb @ scen_embeddings[act_scen]['cls'] / sum([cls_emb @ scen_embeddings[scen]['cls'] for scen in scenarios])
    scen_inner_prods['mask'] += mask_emb @ scen_embeddings[act_scen]['mask'] / sum([mask_emb @ scen_embeddings[scen]['mask'] for scen in scenarios])
    scen_inner_prods['avg'] += avg_emb @ scen_embeddings[act_scen]['avg'] / sum([avg_emb @ scen_embeddings[scen]['avg'] for scen in scenarios])

for tok in label_inner_prods:
    label_inner_prods[tok] /= len(tokenized_data_set)

for tok in scen_inner_prods:
    scen_inner_prods[tok] /= len(tokenized_data_set)

100%|██████████| 25715/25715 [09:57<00:00, 43.07it/s]


In [7]:
# Inner product analysis
print(label_inner_prods)
print(scen_inner_prods)

{'cls': 0.01470811004243378, 'mask': 0.01472479861622116, 'avg': 0.014722883776097019}
{'cls': 0.05556254027396381, 'mask': 0.05567325209404365, 'avg': 0.05560899654647954}
