In [None]:
!pip install seqeval
!pip install --upgrade torchvision 

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [3]:
import torch
from torch import nn
import json
from flair.models import SequenceTagger
from flair.data import Sentence
import pickle
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
import numpy as np
from datasets import load_metric

In [4]:
tag2id = {
    'unk': 0,
    'O': 1,
    'B-OBJECT': 2,
    'I-OBJECT': 3,
    'B-PEOPLE': 4,
    'I-PEOPLE': 5,
    'B-LOCATION': 6,
    'B-BRAND': 7,
    'I-BRAND': 8,
    'I-LOCATION': 9,
    'START': 10,
    'STOP': 11,
}
id2tag = {id: tag for tag, id in tag2id.items()}
num_labels = len(tag2id.keys())

In [5]:
with open('roberta_tokenized_dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

In [6]:
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [5]:
inp2tokens = {}

In [6]:
for data in dataset['train']:
    inp2tokens[json.dumps(data['input_ids'])] = Sentence(' '.join(data['tokens']))

In [7]:
for data in dataset['test']:
    inp2tokens[json.dumps(data['input_ids'])] = Sentence(' '.join(data['tokens']))

In [8]:
for data in dataset['validation']:
    inp2tokens[json.dumps(data['input_ids'])] = Sentence(' '.join(data['tokens']))

In [9]:
with open('inp2tokens.pkl', 'wb') as f:
    pickle.dump(inp2tokens, f)

In [16]:
with open('inp2tokens.pkl', 'rb') as f:
    inp2tokens = pickle.load(f)

In [9]:
model = AutoModelForTokenClassification.from_pretrained('distilroberta-base', num_labels=num_labels)
model.config.id2label = id2tag
model.config.label2id = tag2id
model.train()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream tas

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [18]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p    
    predictions = np.argmax(predictions, axis=2)
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [17]:
teacher_model = SequenceTagger.load('prod-model.pt')
teacher_model.eval()

2021-07-19 09:26:15,322 loading file prod-model.pt


SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('crawl')
    (list_embedding_1): WordEmbeddings('twitter')
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (word_dropout): WordDropout(p=0.05)
  (embedding2nn): Linear(in_features=1424, out_features=1424, bias=True)
  (rnn): LSTM(1424, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=1024, out_features=12, bias=True)
)

In [20]:
alpha_tc = 1.0
alpha_ce = 2.0
temperature = 2.0

class DistilTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        s_logits = outputs.logits
        
        mask = (labels > -1).unsqueeze(-1).expand_as(s_logits)
        s_logits_slct = torch.masked_select(s_logits, mask)
        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))
        
        tc_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        loss_tc = tc_loss_fct(s_logits.view(-1, s_logits.size(-1)), labels.view(-1))
        loss = alpha_tc * loss_tc
        
        if alpha_ce > 0:
            try:
                input_lists = inputs['input_ids'].tolist()
                sentences = []
                for input_list in input_lists:
                    sentence = inp2tokens[json.dumps(input_list)]
                    if len(sentence) > 0:
                        sentences.append(sentence)
                
                with torch.no_grad():
                    t_logits = teacher_model.forward(sentences)
                    
                for sentence in sentences:
                    for token in sentence:
                        token.clear_embeddings()
                
                x_ind = torch.sum((labels > -1), dim=1)
                max_x = x_ind.max()
                t_logits = t_logits[:,:max_x,:]
                
                if t_logits.shape[1] < max_x:
                    t_logits_pad = torch.ones((t_logits.shape[0], max_x, t_logits.shape[2]))
                    t_logits_pad[:, :t_logits.shape[1], :] = t_logits
                    t_logits = t_logits_pad
                    t_logits = t_logits.to(torch.device(self.args.device))
                    
                mask_x = x_ind < max_x
                y_ind = torch.arange(x_ind.shape[0], device = torch.device(self.args.device))
                x_ind = torch.masked_select(x_ind, mask_x)
                y_ind = torch.masked_select(y_ind, mask_x)
                mask_t = torch.zeros(t_logits.shape[:-1])
                mask_t[( y_ind, x_ind )] = True
                mask_t = 1 - mask_t.cumsum(dim=-1)
                mask_t = (mask_t > 0).unsqueeze(-1).expand_as(t_logits)
                mask_t = mask_t.to(torch.device(self.args.device))
                
                t_logits_slct = torch.masked_select(t_logits, mask_t )
                t_logits_slct = t_logits_slct.view(-1, t_logits.size(-1))        
            
                ce_loss_fct = nn.KLDivLoss(reduction="batchmean")        
                loss_ce = (
                    ce_loss_fct(
                        nn.functional.log_softmax(s_logits_slct / temperature, dim=-1),
                        nn.functional.softmax(t_logits_slct / temperature, dim=-1),
                    )
                    * (temperature) ** 2
                )
                
                loss += alpha_ce * loss_ce
                torch.cuda.empty_cache()
            except:
                print("Error Calculating Distillation Loss!")
                pass
            
        return (loss, outputs) if return_outputs else loss

In [7]:
training_args = TrainingArguments(
    output_dir='./results_distill',          # output directory
    num_train_epochs=100,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-05, 
    weight_decay=0.01,               # strength of weight decay
    
    evaluation_strategy = "steps",
    logging_dir = './logs_distill',
    logging_steps = 1000,
    save_steps = 1000,
    
    do_train=True,
    do_eval=True,
)

trainer = DistilTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset['train'],         # training dataset
    eval_dataset=dataset['validation'].select(list(range(200))),             # evaluation dataset
    compute_metrics=compute_metrics,
)

NameError: name 'compute_metrics' is not defined

In [28]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, tags.
***** Running training *****
  Num examples = 184226
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1151500


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.5801,0.770834,0.501718,0.564797,0.531392,0.925833
2000,0.55,0.743665,0.485666,0.55706,0.518919,0.923454
3000,0.6576,0.742386,0.479233,0.580271,0.524934,0.921914
4000,0.6595,0.655835,0.513369,0.55706,0.534323,0.925133


Saving model checkpoint to ./results_distill/checkpoint-500
Configuration saved in ./results_distill/checkpoint-500/config.json
Model weights saved in ./results_distill/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, tags.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to ./results_distill/checkpoint-1000
Configuration saved in ./results_distill/checkpoint-1000/config.json
Model weights saved in ./results_distill/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results_distill/checkpoint-1500
Configuration saved in ./results_distill/checkpoint-1500/config.json
Model weights saved in ./results_distill/checkpoint-1500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignor

Error Calculating Distillation Loss!


Saving model checkpoint to ./results_distill/checkpoint-2500
Configuration saved in ./results_distill/checkpoint-2500/config.json
Model weights saved in ./results_distill/checkpoint-2500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags, id, tags.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32
Saving model checkpoint to ./results_distill/checkpoint-3000
Configuration saved in ./results_distill/checkpoint-3000/config.json
Model weights saved in ./results_distill/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ./results_distill/checkpoint-3500
Configuration saved in ./results_distill/checkpoint-3500/config.json
Model weights saved in ./results_distill/checkpoint-3500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ig

Error Calculating Distillation Loss!


KeyboardInterrupt: 

In [11]:
model = AutoModelForTokenClassification.from_pretrained('results_final_distill/checkpoint-366000', num_labels=num_labels)
model.config.id2label = id2tag
model.config.label2id = tag2id
model.eval()

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [22]:
training_args = TrainingArguments(
    output_dir='./results_distill',          # output directory
    num_train_epochs=100,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-05, 
    weight_decay=0.01,               # strength of weight decay
    
    evaluation_strategy = "steps",
    logging_dir = './logs_distill',
    logging_steps = 1000,
    save_steps = 1000,
    
    do_train=True,
    do_eval=True,
)

trainer = DistilTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset['train'],         # training dataset
    eval_dataset=dataset['validation'],             # evaluation dataset
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, tags, ner_tags, id.
***** Running Evaluation *****
  Num examples = 23053
  Batch size = 32


{'eval_loss': 0.44542521238327026,
 'eval_precision': 0.5129956657736384,
 'eval_recall': 0.6290830280200217,
 'eval_f1': 0.5651394576218807,
 'eval_accuracy': 0.9313081743762978,
 'eval_runtime': 421.9188,
 'eval_samples_per_second': 54.638,
 'eval_steps_per_second': 1.709}

In [24]:
trainer.evaluate(dataset['test'])

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, tags, ner_tags, id.
***** Running Evaluation *****
  Num examples = 23052
  Batch size = 32


{'eval_loss': 0.4468543231487274,
 'eval_precision': 0.5157666783094655,
 'eval_recall': 0.6304715386060492,
 'eval_f1': 0.5673798106479773,
 'eval_accuracy': 0.9312075248388202,
 'eval_runtime': 417.2404,
 'eval_samples_per_second': 55.249,
 'eval_steps_per_second': 1.728}

In [25]:
trainer.evaluate(dataset['train'])

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, tags, ner_tags, id.
***** Running Evaluation *****
  Num examples = 184226
  Batch size = 32


Error Calculating Distillation Loss!
Error Calculating Distillation Loss!


RuntimeError: CUDA out of memory. Tried to allocate 1.84 GiB (GPU 0; 23.65 GiB total capacity; 2.96 GiB already allocated; 12.88 GiB free; 4.07 GiB reserved in total by PyTorch)

In [10]:
dataset['test']['input_ids']

tensor([[    0,  4557, 11785,  ...,     1,     1,     1],
        [    0,    36, 18960,  ...,     1,     1,     1],
        [    0,    22,    85,  ...,     1,     1,     1],
        ...,
        [    0,   152,   186,  ...,    28,   303,     2],
        [    0,    83,  1307,  ...,     1,     1,     1],
        [    0,  4820,    40,  ...,     1,     1,     1]])

In [14]:
i = 200
test_data = dataset['test']
import time
start = time.time()
while i < test_data.num_rows:
    ids = test_data['input_ids'][i-200:i]
    am = test_data['attention_mask'][i-200:i]
    labs = test_data['labels'][i-200:i]
    model.forward(input_ids=ids, attention_mask=am, labels=labs)
    print("Computed ",i)
    print(time.time() - start)
    i+=200

Computed  200
17.579638242721558
Computed  400
34.85356044769287
Computed  600
52.62805390357971
Computed  800
70.55653929710388


KeyboardInterrupt: 

In [18]:
i = 200
test_data = dataset['test']
import time
start = time.time()
while i < test_data.num_rows:
    ids = test_data['input_ids'][i-200:i]
    am = test_data['attention_mask'][i-200:i]
    labs = test_data['labels'][i-200:i]
    
    input_lists = ids.tolist()
    
    sentences = []
    for input_list in input_lists:
        sentence = inp2tokens[json.dumps(input_list)]
        if len(sentence) > 0:
            sentences.append(sentence)
    
    with torch.no_grad():
        t_logits = teacher_model.forward(sentences)
    print("Computed ",i)
    print(time.time() - start)
    i+=200

Computed  200
1.0207455158233643
Computed  400
1.68013596534729
Computed  600
2.3346922397613525
Computed  800
3.1046125888824463
Computed  1000
3.8381712436676025
Computed  1200
4.429185152053833
Computed  1400
5.121250152587891
Computed  1600
5.821840047836304
Computed  1800
6.457396745681763
Computed  2000
7.074596405029297
Computed  2200
7.726131439208984
Computed  2400
8.423996925354004
Computed  2600
9.0887770652771
Computed  2800
9.808183670043945


KeyboardInterrupt: 