In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Put the parent directory first on the import search path.
# NOTE(review): presumably this is what makes the `core.*` imports below resolvable
# when the notebook is run from its own folder — confirm against the repo layout.
import sys; sys.path.insert(0, "..")

# Building model components

* In this section we shall look at and discuss various components of our pipeline

# Dataloader

* This bad boy is used to convert our data into inputs suitable for the model
* Details and comments can be found in the "core/dataset.py" file
* Feel free to play around and see what's up

In [2]:
import joblib

from core.config import config
from core.dataset import NCBIDataset
# Load the serialized training split of fold 1; the file unpacks into two objects.
# NOTE: joblib.load unpickles the file, so only point it at files you trust.
x, y = joblib.load("../data/k-fold/fold1/train.bin")

# Wrap both objects in the project's dataset class so samples can be fetched by index.
ds = NCBIDataset(x, y)

In [3]:
def foo(item):
    """Show sample `item` as raw tokens and as the dataset's encoded output.

    Reads the notebook-level `x`, `ds` and `config`. Prints the tokens of
    `x[item]` joined by spaces, a separator line, and then `ds[item]`.

    Args:
        item: index of the sample to inspect.

    Raises:
        AssertionError: if any value in `ds[item]` has a first dimension
            different from config["tokenizer"]["MAX_LEN"].
    """
    sample_x = x[item]
    out = ds[item]
    print(" ".join(sample_x))

    # Every encoded field must be padded/truncated to the configured length;
    # report the offending fields instead of failing with a bare assert.
    max_len = config["tokenizer"]["MAX_LEN"]
    wrong_len = {k: tuple(v.shape) for k, v in out.items() if v.shape[0] != max_len}
    assert not wrong_len, f"expected first dimension {max_len}, got {wrong_len}"

    print("----")
    print(out)


foo(420)

Normal feline beta - glucuronidase cDNA was cloned and characterized , and amplified from affected cat fibroblasts by reverse transcription coupled polymerase chain reaction .
----
{'input_ids': tensor([  101, 14508,   175, 24247,  1162, 11933,   118,   176,  7535, 10182,
         3484, 23358,  1162,   172,  2137, 11185,  1108, 22121,  1181,  1105,
         6858,   117,  1105,  1821, 18580,  1121,  4634,  5855, 20497, 12725,
        27184,  1116,  1118,  7936, 15416, 11646, 21176,  6530,  4129,  3943,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,    

# Building the model
* For our model we shall pick DistilBERT, a variant of BERT that was created using [distillation](https://en.wikipedia.org/wiki/Knowledge_distillation)
* DistilBERT is much faster to [train and has greater inference speed](https://arxiv.org/pdf/1910.01108.pdf), so it's a good option as our baseline transfer model, or if you plan to make this puppy work in production and you/your company does not have the hardware capabilities of Google, kek.
* We are going to attach a simple linear layer on top of bert and use it to get class logits for each token in our input sequence.
* Additional information can be found in "core/model.py" and "core/active_loss.py"

In [4]:
from core.model import NER 

In [12]:
# Smoke test: push one sample through a freshly built model and look at the
# output shape and the loss value.
# NOTE(review): the 3 matches the last dimension of the printed logits shape,
# so it is presumably the number of tag classes — confirm in core/model.py.
model = NER(3)

# Rebuild the dataset with loss_plus enabled.
# NOTE(review): presumably this makes each item carry what the model needs to
# compute its loss — see core/dataset.py and core/active_loss.py.
ds = NCBIDataset(x, y, loss_plus=True)
sample = ds[410]

# unsqueeze(0) adds a leading batch dimension of size 1 to every tensor.
# The first output gets a real name rather than `_`: IPython uses `_` for the
# last cell output, and binding it by hand shadows that.
logits, loss = model({k: v.unsqueeze(0) for k, v in sample.items()})
print(logits.shape)
print(loss.item())

torch.Size([1, 180, 3])
1.1588736772537231


# Building the training script

* The details and comments regarding the training procedure can be found in "core/train.py" and "core/train_helpers.py". Additional info can be found in config
* Let's load and train to verify reproducible behaviour
* <b>NOTE:</b> depending on your platform, the output shown here may differ, but the following cells should produce similar and ideally identical output to each other

In [9]:
from core.train import train_ner

In [10]:
# First training run on the k-fold data directory; keep the returned value
# so it can be inspected afterwards.
# NOTE(review): model_path is presumably where the trained weights are written —
# confirm in core/train.py.
best_loss = train_ner(
    "../data/k-fold",
    model_path="../data/model.bin",
)

100%|██████████| 409/409 [00:38<00:00, 10.76it/s]
100%|██████████| 23/23 [00:01<00:00, 17.60it/s]


EPOCH: 1 | TRAIN LOSS: 0.1487 | VAL LOSS: 0.0686


100%|██████████| 409/409 [00:38<00:00, 10.73it/s]
100%|██████████| 23/23 [00:01<00:00, 17.69it/s]
  0%|          | 0/409 [00:00<?, ?it/s]

EPOCH: 2 | TRAIN LOSS: 0.0465 | VAL LOSS: 0.0691


100%|██████████| 409/409 [00:38<00:00, 10.71it/s]
100%|██████████| 23/23 [00:01<00:00, 16.96it/s]
  0%|          | 0/409 [00:00<?, ?it/s]

EPOCH: 3 | TRAIN LOSS: 0.0222 | VAL LOSS: 0.0747


100%|██████████| 409/409 [00:38<00:00, 10.53it/s]
100%|██████████| 23/23 [00:01<00:00, 17.37it/s]

EPOCH: 4 | TRAIN LOSS: 0.0139 | VAL LOSS: 0.0761
stopping early, val loss didn't improve for 3 epochs





In [11]:
# Same call with the same arguments as the previous training cell, repeated on
# purpose so the per-epoch losses of the two runs can be compared.
best_loss = train_ner(
    "../data/k-fold",
    model_path="../data/model.bin",
)

100%|██████████| 409/409 [00:39<00:00, 10.24it/s]
100%|██████████| 23/23 [00:01<00:00, 16.71it/s]


EPOCH: 1 | TRAIN LOSS: 0.1487 | VAL LOSS: 0.0686


100%|██████████| 409/409 [00:40<00:00, 10.04it/s]
100%|██████████| 23/23 [00:01<00:00, 16.11it/s]
  0%|          | 0/409 [00:00<?, ?it/s]

EPOCH: 2 | TRAIN LOSS: 0.0465 | VAL LOSS: 0.0691


100%|██████████| 409/409 [00:39<00:00, 10.35it/s]
100%|██████████| 23/23 [00:01<00:00, 17.21it/s]
  0%|          | 0/409 [00:00<?, ?it/s]

EPOCH: 3 | TRAIN LOSS: 0.0222 | VAL LOSS: 0.0747


100%|██████████| 409/409 [00:38<00:00, 10.67it/s]
100%|██████████| 23/23 [00:01<00:00, 17.22it/s]

EPOCH: 4 | TRAIN LOSS: 0.0139 | VAL LOSS: 0.0761
stopping early, val loss didn't improve for 3 epochs





---