In [2]:
%run main.py
%load_ext autoreload
%autoreload 2

!mkdir -p {DATA_DIR} {RUBER_DIR} {MODEL_DIR}
s3 = S3()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# data

In [3]:
if not exists(NE5):
    s3.download(S3_NE5, NE5)
    s3.download(S3_BSNLP, BSNLP)
    s3.download(S3_FACTRU, FACTRU)

In [None]:
if not exists(RUBERT_VOCAB):
    s3.download(S3_RUBERT_VOCAB, RUBERT_VOCAB)
    s3.download(S3_RUBERT_EMB, RUBERT_EMB)
    s3.download(S3_RUBERT_ENCODER, RUBERT_ENCODER)
    s3.download(S3_RUBERT_NER, RUBERT_NER)

In [None]:
items = list(load_lines(RUBERT_VOCAB))
words_vocab = BERTVocab(items)
tags_vocab = BIOTagsVocab([PER, LOC, ORG])

In [None]:
device = CUDA0

In [None]:
config = BERTConfig(
    vocab_size=50106,
    seq_len=512,
    emb_dim=768,
    layers_num=12,
    heads_num=12,
    hidden_dim=3072,
    dropout=0.1,
    norm_eps=1e-12
)
emb = BERTEmbedding(
    config.vocab_size, config.seq_len, config.emb_dim,
    config.dropout, config.norm_eps
)
emb.position.requires_grad = False  # fix pos emb to train on short seqs
encoder = BERTEncoder(
    config.layers_num, config.emb_dim, config.heads_num, config.hidden_dim,
    config.dropout, config.norm_eps
)
ner = BERTNERHead(config.emb_dim, config.vocab_size)
model = BERTNER(emb, encoder, ner)

load_model(model.emb, RUBERT_EMB)
load_model(model.encoder, RUBERT_ENCODER)
load_model(model.ner, RUBERT_NER)
model = model.to(device)

criterion = CRF(len(tags_vocab))
criterion = criterion.to(device)

In [None]:
torch.manual_seed(1)
seed(1)

In [None]:
encode = BERTNEREncoder(
    words_vocab, tags_vocab,
    seq_len=128,
    batch_size=32,
    shuffle_size=10000
)

lines = (
    line
    for path in [NE5, BSNLP, FACTRU]
    for line in load_lines(path)
)
items = parse_jl(lines)
markups = from_jsons(items, SpanMarkup)
batches = list(encode(markups))

In [None]:
board = Board(BOARD_NAME, RUNS_DIR)
train_board = board.section(TRAIN_BOARD)
test_board = board.section(TEST_BOARD)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.999)

In [None]:
meters = {
    TRAIN: NERScoreMeter(),
    TEST: NERScoreMeter(),
    VALID: NERScoreMeter()
}

for epoch in log_progress(range(5)):
    model.train()
    for batch in log_progress(batches[TRAIN], leave=False):
        optimizer.zero_grad()
        batch = process_batch(model, criterion, batch)
        batch.loss.backward()
        optimizer.step()
    
        score = BatchScore(batch.loss)
        meters[TRAIN].add(score)

    meters[TRAIN].write(boards[TRAIN])
    meters[TRAIN].reset()

    model.eval()
    with torch.no_grad():
        for name in [TEST, VALID]:
            for batch in log_progress(batches[name], leave=False, desc=name):
                batch = process_batch(model, criterion, batch)
                batch.target = split_masked(batch.target.value, batch.target.mask)
                batch.pred = criterion.decode(batch.pred.value, batch.pred.mask)
                score = score_batch(batch, tags_vocab)
                meters[name].add(score)

            meters[name].write(boards[name])
            meters[name].reset()
    
    scheduler.step()
    board.step()