In [None]:
import random

import datasets
import numpy as np
from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

from src.eval import eval_preds
from src.TreeTransformer import TreeBertForSequenceClassification

# Seed Python's `random`, NumPy and PyTorch in one call. `random.seed(0)` on its
# own leaves the NumPy and PyTorch generators unseeded, so a model built from a
# bare config further down would start from different weights on every run.
set_seed(0)

# Hub identifiers for the corpus and for the tokenizer vocabulary.
DATASET_ID = "michaelginn/latent-trees-agreement-ID"
TOKENIZER_ID = "bert-base-uncased"

# Upper bound on tokens per example; read by tokenize_function when truncating.
max_length = 100

dataset = datasets.load_dataset(DATASET_ID)
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_ID)
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Inputs longer than the module-level ``max_length`` are truncated; no
    padding is requested here. The tokenizer's output is returned unchanged.
    """
    texts = example['text']
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded
# Tokenize every split in batches; load_from_cache_file=False recomputes the
# mapping instead of reusing a previously cached result.
dataset = dataset.map(
    tokenize_function,
    batched=True,
    load_from_cache_file=False,
)

# Ten-example slice of the training split (rows 1 through 10).
toy_indices = range(1, 11)
toy_dataset = dataset['train'].select(toy_indices)

# Class name <-> class index lookups handed to the model config.
label2id = {"VIOLATION": 0, "GRAMMATICAL": 1}
id2label = {index: name for name, index in label2id.items()}

# Choose between the published bert-base-uncased configuration and a default
# BertConfig.
# NOTE(review): only the *configuration* differs between the two branches; the
# model below is always constructed from the config and no pretrained weights
# are loaded in this cell, so `pretrained = True` presumably still yields random
# weights — confirm whether a `from_pretrained` load was intended.
pretrained = False
if pretrained:
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)
else:
    # Default BertConfig hyper-parameters for a randomly initialised model
    config = BertConfig(num_labels=2, id2label=id2label, label2id=label2id)

# No hard-coded `.to('mps')`: that call fails on machines without the MPS
# backend, and Trainer already moves the model to TrainingArguments.device.
model = TreeBertForSequenceClassification(config=config)
# model = BertForSequenceClassification(config=config)

# Hyper-parameters for the Trainer run. Evaluation, logging and checkpointing
# are all configured to happen once per epoch.
args = TrainingArguments(
    output_dir="../training-checkpoints",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1000,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluation and logging cadence
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    # Checkpointing: save every epoch, cap the number of checkpoints kept at 3
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=False,
)

def compute_metrics(eval_pred):
    """Turn raw model outputs into metrics via ``eval_preds``.

    Takes the argmax over the last axis of ``eval_pred.predictions`` as the
    predicted class, prints the raw predictions array, and returns whatever
    ``eval_preds(preds, labels)`` returns.
    """
    raw_scores = eval_pred.predictions
    gold = eval_pred.label_ids
    predicted = np.argmax(raw_scores, axis=-1)
    # Raw scores are dumped on every evaluation pass.
    print(raw_scores)
    return eval_preds(predicted, gold)


# The same toy subset is used for both training and evaluation
# (presumably an overfitting sanity check — the full splits are left in the
# trailing comments).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=toy_dataset,  # dataset['train'],
    eval_dataset=toy_dataset,  # dataset['eval'], # dataset['test'].select(range(20)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# preds = trainer.predict(dataset['test'].select(range(20)))
# preds

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mmichael-ginn[0m. Use [1m`wandb login --relogin`[0m to force relogin


  combined_mask.append(row | b)


Epoch,Training Loss,Validation Loss


[[ 0.01094113 -0.12484915]
 [ 0.21315923 -0.12998465]
 [ 0.15693086 -0.07591655]
 [ 0.16490501 -0.23298131]
 [ 0.06891671 -0.04471554]
 [ 0.2751846  -0.17061032]
 [ 0.20609564 -0.2424886 ]
 [ 0.08487487 -0.03602662]
 [ 0.0283705  -0.17469569]
 [ 0.09366967 -0.19030924]]
PREDS [0 0 0 0 0 0 0 0 0 0]
LABELS [1 1 0 0 0 0 1 1 0 1]
[[ 0.0109398  -0.1248479 ]
 [ 0.21315767 -0.1299835 ]
 [ 0.15692893 -0.07591483]
 [ 0.16490185 -0.23297976]
 [ 0.0689157  -0.0447139 ]
 [ 0.2751847  -0.17060816]
 [ 0.20609389 -0.24248737]
 [ 0.08487335 -0.03602535]
 [ 0.02836936 -0.1746941 ]
 [ 0.09366787 -0.19030844]]
PREDS [0 0 0 0 0 0 0 0 0 0]
LABELS [1 1 0 0 0 0 1 1 0 1]
[[ 0.00975785 -0.12408213]
 [ 0.21203525 -0.12938878]
 [ 0.15614255 -0.07564038]
 [ 0.1632679  -0.23266731]
 [ 0.0673201  -0.04394155]
 [ 0.27376792 -0.17021252]
 [ 0.20517866 -0.24183692]
 [ 0.08329526 -0.03527214]
 [ 0.02729624 -0.17403658]
 [ 0.0917088  -0.18980551]]
PREDS [0 0 0 0 0 0 0 0 0 0]
LABELS [1 1 0 0 0 0 1 1 0 1]
[[ 0.00783419 -0

In [1]:
from transformers import BertConfig
from src.TreeTransformer import TreeBertForSequenceClassification
import torch


# Forward-pass smoke test on a hand-written batch of two 4-token sequences.
config = BertConfig(num_labels=2)
model = TreeBertForSequenceClassification(config=config)

# The attention mask marks 3 positions as attended in the first row and 2 in the second.
input_ids = torch.tensor([[1, 2, 3, 0], [2, 1, 0, 1]])
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

hidden torch.Size([2, 4, 768])
a tensor([[0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0]], dtype=torch.int32)
b tensor([[1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1]], dtype=torch.int32)
c tensor([[0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0]], dtype=torch.int32)
attention mask torch.Size([2, 4])
a+c torch.Size([4, 4])
mask torch.Size([2, 4, 4])
scores torch.Size([2, 4, 4])
scores masked torch.Size([2, 4, 4])
query torch.Size([2, 12, 4, 64])
combined tensor([[[1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]],

        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 0, 1]]])
combined unsqueezed tensor([[[[1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 1]],

         [[1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 0],
          [1, 1, 1, 1]],

         [[1, 1, 1, 0],
          [1, 1, 1, 

SequenceClassifierOutputWithConstituentAttention(loss=None, logits=tensor([[-0.0112, -0.4408],
        [-0.0646, -0.1347]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [3]:
# Experiment: AND each row of `a` (shape 1x4) against every row of `mask`
# (shape 3x4) and stack the per-row results into a single tensor.
a = torch.IntTensor([[1, 1, 1, 0]])
mask = torch.IntTensor([
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
])

# `row & mask` broadcasts the 4-element row across the rows of `mask`.
results = [row & mask for row in a]
result = torch.stack(results)
result.size()

torch.Size([1, 3, 4])

In [9]:
import numpy as np

# 4x4 int32 matrix with ones on the first superdiagonal (entry [i, i+1]) and
# zeros elsewhere, converted to a torch tensor.
superdiag = np.eye(4, k=1, dtype=np.int32)
a = torch.from_numpy(superdiag)
a

tensor([[0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0]], dtype=torch.int32)

In [9]:
# Display one tokenized example from the toy subset. `toy_dataset` is created
# in the training cell, so that cell must have been run first in this kernel.
toy_dataset[2]

{'text': 'these ducks and these wugs kiss the ducks that laughs',
 'labels': 0,
 'input_ids': [101,
  2122,
  14875,
  1998,
  2122,
  8814,
  5620,
  3610,
  1996,
  14875,
  2008,
  11680,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}