In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification
import os
import datasets

In [None]:
from src.models.components.bert_token_classifier import BertTokenClassifier
from src.datamodules.components.process import preprocess, tokenize_and_align_labels, postprocess
from src.datamodules.components.cora_label import label2id, LABEL_NAMES, id2label
from src.models.components.bert_tokenizer import bert_tokenizer

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [4]:
# Build the token classifier with its default constructor arguments.
model = BertTokenClassifier()

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Restore the trained weights on CPU and switch the model to inference mode.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint = torch.load("checkpoints/epoch_008.ckpt", map_location=torch.device('cpu'))

# A training checkpoint may nest the weights under "state_dict" (PyTorch Lightning
# saves them that way). Passing the outer dict with strict=False would match no
# parameter and silently leave the model exactly as it was constructed.
if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
    state_dict = checkpoint["state_dict"]
else:
    state_dict = checkpoint

# strict=False tolerates key mismatches, so surface the report instead of dropping it.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f"{len(load_result.missing_keys)} missing key(s), "
        f"e.g. {load_result.missing_keys[:5]}"
    )
    print(
        f"{len(load_result.unexpected_keys)} unexpected key(s), "
        f"e.g. {load_result.unexpected_keys[:5]}"
    )

# Kept as the last expression so the cell still displays the module summary.
model.eval()

BertTokenClassifier(
  (bert_embedder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [6]:
# Load every split of myvision/cora-dataset-final, caching files under cache/.
dataset = datasets.load_dataset(
    path="myvision/cora-dataset-final",
    cache_dir="cache/",
)

Using custom data configuration myvision--cora-dataset-final-bfbff7a8d722b9c7
Reusing dataset parquet (cache/myvision___parquet/myvision--cora-dataset-final-bfbff7a8d722b9c7/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Select the "test" split of the loaded dataset.
cora = dataset["test"]
# Bare last expression so the notebook displays the split's summary.
cora

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 500
})

In [8]:
def _align_batch(batch):
    """Run tokenize_and_align_labels on one batch using the label2id mapping."""
    return tokenize_and_align_labels(batch, label2id)


# batched=True hands the helper whole batches; the original columns are removed
# so only the columns returned by the helper remain in the mapped dataset.
tokenized_cora = cora.map(
    _align_batch,
    batched=True,
    load_from_cache_file=True,
    remove_columns=cora.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# Collator built from the project tokenizer; used to assemble each batch.
token_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

# Iterate the tokenized test split in batches of 8, in dataset order (no shuffling).
test_dataloader = DataLoader(
    tokenized_cora,
    batch_size=8,
    collate_fn=token_collator,
)