# MIT Movie Dataset - Transformer Models

This notebook contains the data preparation and model development code for fine-tuning Transformer models for our NER task.

https://huggingface.co/transformers/pretrained_models.html

In [None]:
import pandas as pd
import numpy as np

# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')
#os.chdir('/content/drive/My Drive/DAAN888/data')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/DAAN888/data'

In [None]:
model_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/models/'

In [None]:
data_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/data/'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Nov 21 12:51:34 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces



---



## Load Dataset

In [None]:
import pickle 

with open('mitmovie.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

In [None]:
# first row in train set
list(zip(dataset['train_tokens'][0], dataset['train_labels'][0]))

[('what', 'O'),
 ('movies', 'O'),
 ('star', 'O'),
 ('bruce', 'B-ACTOR'),
 ('willis', 'I-ACTOR')]

## DistilBert Model

Documentation for this model can be found: 
https://huggingface.co/transformers/model_doc/distilbert.html

More model details here: https://github.com/huggingface/transformers/tree/master/examples/distillation

Following HuggingFace distilbert tutorial: https://huggingface.co/transformers/custom_datasets.html#tok-ner


In [None]:
model_name = 'distilbert-base-uncased'

In [None]:
!pip install transformers==3.5.0

Collecting transformers==3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 19.7MB/s eta 0:00:01[K     |▌                               | 20kB 27.4MB/s eta 0:00:01[K     |▊                               | 30kB 23.5MB/s eta 0:00:01[K     |█                               | 40kB 18.5MB/s eta 0:00:01[K     |█▎                              | 51kB 15.6MB/s eta 0:00:01[K     |█▌                              | 61kB 17.8MB/s eta 0:00:01[K     |█▊                              | 71kB 16.7MB/s eta 0:00:01[K     |██                              | 81kB 13.6MB/s eta 0:00:01[K     |██▎                             | 92kB 14.4MB/s eta 0:00:01[K     |██▌                             | 102kB 14.2MB/s eta 0:00:01[K     |██▊                             | 112kB 14.2MB/s eta 0:00:01[K     |███                        

In [None]:
import transformers
transformers.__version__

'3.5.0'

In [None]:
import torch
torch.__version__

'1.7.0+cu101'

### Encode Labels

In [None]:
# Get the unique labels in the movie dataset.
# The labels are sorted so that the label -> id mapping built from this list is
# the same on every kernel run; iteration order of a set of strings is not
# stable across Python processes, which would make the class indices of a
# saved model impossible to decode reliably in a later session.
uniq_labels = sorted(set(label for doc in dataset['train_labels'] for label in doc))

In [None]:
# assign a number to each label
label_encoding = {label: id for id, label in enumerate(uniq_labels)}
label_encoding

{'B-ACTOR': 24,
 'B-CHARACTER': 20,
 'B-DIRECTOR': 16,
 'B-GENRE': 10,
 'B-PLOT': 18,
 'B-RATING': 7,
 'B-RATINGS_AVERAGE': 6,
 'B-REVIEW': 17,
 'B-SONG': 4,
 'B-TITLE': 2,
 'B-TRAILER': 1,
 'B-YEAR': 13,
 'I-ACTOR': 19,
 'I-CHARACTER': 15,
 'I-DIRECTOR': 5,
 'I-GENRE': 3,
 'I-PLOT': 14,
 'I-RATING': 12,
 'I-RATINGS_AVERAGE': 11,
 'I-REVIEW': 0,
 'I-SONG': 23,
 'I-TITLE': 9,
 'I-TRAILER': 21,
 'I-YEAR': 8,
 'O': 22}

### Encode Texts

To encode the texts, we have to use the same Tokenizer that DistilBert was trained on.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
print('There are %s words in this tokenizer object' % tokenizer.vocab_size)

There are 30522 words in this tokenizer object


In [None]:
# use the tokenizer to encode the texts 
train_encodings = tokenizer(dataset['train_tokens'], 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

test_encodings = tokenizer(dataset['test_tokens'], 
                           is_split_into_words=True, 
                           return_offsets_mapping=True, 
                           padding=True, 
                           truncation=True)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

In [None]:
# make sure same number of docs
len(train_encodings['input_ids']), len(dataset['train_tokens'])

(9775, 9775)

In [None]:
dataset['train_tokens'][0]

['what', 'movies', 'star', 'bruce', 'willis']

In [None]:
# preview what the encoded result looks like
list(zip(train_encodings['input_ids'][0][0:12], train_encodings['attention_mask'][0][0:12], train_encodings['offset_mapping'][0][0:12]))

[(101, 1, (0, 0)),
 (2054, 1, (0, 4)),
 (5691, 1, (0, 6)),
 (2732, 1, (0, 4)),
 (5503, 1, (0, 5)),
 (12688, 1, (0, 6)),
 (102, 1, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0))]

In [None]:
# the model expects all docs to be same length (51) 
# the attention mask will tell the model to ignore the padding with zeros
print('Length of sequences is %s ' % len(train_encodings['input_ids'][0]))

Length of sequences is 51 


In [None]:
doc = 1

# document at index `doc` of the train set (index 1, i.e. the second document)
print( dataset['train_tokens'][doc] )

print()

# the first 13 encoded positions converted back to token strings
print( tokenizer.convert_ids_to_tokens( train_encodings['input_ids'][doc][0:13]) )

# the same 13 positions as vocabulary ids
print( train_encodings['input_ids'][doc][0:13] )

# attention mask for the same 13 positions
print( train_encodings['attention_mask'][doc][0:13] )

# offset pairs for the same 13 positions
print( train_encodings['offset_mapping'][doc][0:13] )

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']

['[CLS]', 'show', 'me', 'films', 'with', 'drew', 'barry', '##more', 'from', 'the', '1980s', '[SEP]', '[PAD]']
[101, 2265, 2033, 3152, 2007, 3881, 6287, 5974, 2013, 1996, 3865, 102, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[(0, 0), (0, 4), (0, 2), (0, 5), (0, 4), (0, 4), (0, 5), (5, 9), (0, 4), (0, 3), (0, 5), (0, 0), (0, 0)]


In [None]:
# Collect the index of every document that contains at least one token whose
# offset does not start at 0. A document index is repeated once per such token.
find_subwords = [
    doc_idx
    for doc_idx, doc_offsets in enumerate(train_encodings['offset_mapping'])
    for pair in doc_offsets
    if pair[0] != 0
]

In [None]:
np.unique(find_subwords)

array([   1,    2,    3, ..., 9752, 9767, 9772])

In [None]:
dataset['train_tokens'][1]

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']

In [None]:
list(zip(train_encodings['input_ids'][1][0:12], train_encodings['attention_mask'][1][0:12], train_encodings['offset_mapping'][1][0:12]))

[(101, 1, (0, 0)),
 (2265, 1, (0, 4)),
 (2033, 1, (0, 2)),
 (3152, 1, (0, 5)),
 (2007, 1, (0, 4)),
 (3881, 1, (0, 4)),
 (6287, 1, (0, 5)),
 (5974, 1, (5, 9)),
 (2013, 1, (0, 4)),
 (1996, 1, (0, 3)),
 (3865, 1, (0, 5)),
 (102, 1, (0, 0))]

### Adjust Labels for Vocab Offset


In the printout above you can see that "barrymore" gets split into "barry" and "##more". This is because the full name is not in the vocabulary of the model, while the smaller subwords "barry" and "##more" are. The offset mapping records this split: a token whose offset does not start at 0 (here `(5, 9)`) is a continuation of the previous word. Because the dataset has one label per original word, we now have to adjust the labels to account for the extra subword tokens. 

Based on https://datascience.stackexchange.com/questions/69640/what-should-be-the-labels-for-subword-tokens-in-bert-for-ner-task we will not drag the label to the new subword feature because that would introduce more instances of that class and change the number of support instances thus making the models difficult to compare. 

Also some comments to consider for an alternative strategy: https://github.com/google-research/bert/issues/646


In [None]:
import numpy as np

def adjust_labels_for_offset(original_labels, label_dictionary, encodings):
    """Spread word-level labels over the tokenizer's sub-word positions.

    For every document, the numeric label of each word is written onto the
    position whose offset pair is ``(0, n)`` with ``n != 0``. Every other
    position keeps the filler value -100.

    Args:
        original_labels: list of documents, each a list of label strings.
        label_dictionary: mapping from label string to integer id.
        encodings: object exposing ``offset_mapping``, one sequence of
            ``(start, end)`` pairs per document.

    Returns:
        A list with one list of ints per document, each as long as that
        document's offset sequence.

    Raises:
        KeyError: if a label is missing from ``label_dictionary``.
        ValueError: if the number of labelled positions in a document differs
            from the number of labels supplied for it.
    """
    # convert to the numeric encoding of the label
    labels = [[label_dictionary[label] for label in doc] for doc in original_labels]

    encoded_labels = []
    for doc_idx, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):

        # reshape keeps the (n, 2) layout even when the offset list is empty
        arr_offset = np.asarray(doc_offset, dtype=int).reshape(-1, 2)

        # positions whose first offset is 0 and whose second offset is not 0
        is_word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

        # Check explicitly: NumPy would otherwise raise an opaque shape error,
        # or silently broadcast a single label onto every position.
        n_positions = int(is_word_start.sum())
        if n_positions != len(doc_labels):
            raise ValueError(
                'document %d has %d labels but %d labelled token positions'
                % (doc_idx, len(doc_labels), n_positions)
            )

        # start from an array of -100 and fill in the labelled positions
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)
        doc_enc_labels[is_word_start] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
train_labels = adjust_labels_for_offset(dataset['train_labels'], 
                                        label_encoding, 
                                        train_encodings)

test_labels = adjust_labels_for_offset(dataset['test_labels'],
                                      label_encoding, 
                                      test_encodings)

In [None]:
id_to_label = {id: label for (label,id) in label_encoding.items()}
id_to_label[-100] = 'X'

#print( tokenizer.convert_ids_to_tokens(encoding_example) )
#print([id_to_label[id] for id in train_labels[0][0:9]])
list(zip( tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0][0:15]), [id_to_label[id] for id in train_labels[0][0:15]], train_labels[0][0:15] )) 

[('[CLS]', 'X', -100),
 ('what', 'O', 22),
 ('movies', 'O', 22),
 ('star', 'O', 22),
 ('bruce', 'B-ACTOR', 24),
 ('willis', 'I-ACTOR', 19),
 ('[SEP]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100)]

In [None]:
len(np.unique(train_labels)), len(uniq_labels)

(26, 25)

### Prepare Pytorch Datasets

https://huggingface.co/transformers/custom_datasets.html#ft-trainer


In [None]:
import torch

# pytorch is expecting a certain type of dataset 

class pt_dataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves one encoded document plus its labels per index."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the per-document label lists."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of documents, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for document ``idx``: one per encoding key plus 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# remove the offset_mapping
train_encodings.pop("offset_mapping") 
test_encodings.pop("offset_mapping")

train_dataset = pt_dataset(train_encodings, train_labels)
test_dataset = pt_dataset(test_encodings, test_labels)

### Train Model

In [None]:
from transformers import DistilBertForTokenClassification, AutoModelForTokenClassification

# load the pretrained model from huggingface
#model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(uniq_labels))
model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                        num_labels = len(uniq_labels), 
                                                        output_attentions=False,
                                                        output_hidden_states=False)

# def model_init():
#   model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(uniq_labels))
#   #model.to(device) # push to gpu
#   return model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [None]:
folder_name = 'mitmovie_pt_' + model_name.replace('-', '_')
folder_name

'mitmovie_pt_distilbert_base_uncased'

In [None]:
from transformers import Trainer, TrainingArguments

# https://huggingface.co/transformers/main_classes/trainer.html
# Boilerplate from the Hugging Face docs to launch a Trainer instance.
# Sets the output/log directories and a baseline configuration for batch sizes,
# warmup and weight decay.
#
# NOTE(review): eval_dataset below is the test split, and load_best_model_at_end
# is enabled, so checkpoint selection is driven by the same data that is later
# used for the reported test metrics. Consider carving a validation split out of
# the training data for model selection.

training_args = TrainingArguments(
    output_dir = model_dir + folder_name + '/results',          # output directory for checkpoints
    overwrite_output_dir = True,       # allow writing into an existing output directory
    evaluation_strategy='epoch',       # evaluate at the end of every epoch
    num_train_epochs = 3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir = model_dir + folder_name + '/logs',            # directory for storing logs
    logging_steps=100,               # log the training loss every 100 steps
    load_best_model_at_end=True      # reload the best checkpoint once training finishes
)

trainer = Trainer(
    model = model,                         # the instantiated 🤗 Transformers model to be trained
    #model_init = model_init,
    args = training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset             # evaluation dataset (the test split, see note above)
)

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# only ask for the device name when CUDA is present; get_device_name(0) raises
# on a CPU-only runtime, which would defeat the fallback above
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'Tesla V100-SXM2-16GB'

In [None]:
#torch.cuda.empty_cache()

In [None]:
model.to(device);

In [None]:
from datetime import datetime

start = datetime.now()

trainer.train()

print('Time to train:', datetime.now() - start)

Epoch,Training Loss,Validation Loss
1,0.260431,0.282641
2,0.168793,0.242859
3,0.120766,0.243793


Time to train: 0:02:24.791234


In [None]:
#trainer.evaluate()

In [None]:
trainer.save_model(model_dir + folder_name + '/model')

In [None]:
import os

#os.makedirs(model_dir + 'distilbert_testing')

# torch.save(model.state_dict(), model_dir + 'distilbert_testing/model.pth')
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# model_to_save.save_pretrained(model_dir + 'distilbert_testing')
# tokenizer.save_pretrained(model_dir + 'distilbert_testing')
# #model.save_pretrained(model_dir + 'distilbert_testing')

### Evaluate Model

In [None]:
folder_name = 'mitmovie_pt_' + model_name.replace('-', '_')
folder_name

'mitmovie_pt_distilbert_base_uncased'

In [None]:
# from transformers import AutoModelForTokenClassification

# # retreive the saved model 
# model = AutoModelForTokenClassification.from_pretrained(model_dir + folder_name + '/model') # 

In [None]:
#model.load_state_dict(torch.load(model_dir + 'distilbert_testing/model.pth', map_location='cpu'))

In [None]:
# model.eval();

In [None]:
#input_ids = torch.tensor([test_encodings['input_ids'][1]]).to('cpu')

In [None]:
# with torch.no_grad():
#     output = model(input_ids)

# pred_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [None]:
# [id_to_label[label] for label in pred_indices.tolist()[0]][0:10], dataset['test_labels'][1]

In [None]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir = model_dir +  'mitmovie_pt_distilbert_uncased/results',          # output directory
#     #overwrite_output_dir = True,
#     evaluation_strategy='epoch',
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir = model_dir +  'mitmovie_pt_distilbert_uncased/logs',            # directory for storing logs
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,                         
#     args=training_args,                  # training arguments, defined above
#     train_dataset = train_dataset,         # training dataset
#     eval_dataset = test_dataset             # evaluation dataset
# )

In [None]:
# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
output, label_ids, metrics = trainer.predict(test_dataset)

In [None]:
metrics

{'eval_loss': 0.24285870790481567}

In [None]:
# convert output which is logits to index of max logit
preds = np.argmax(output, axis=2)
preds.shape

(2443, 43)

In [None]:
# https://medium.com/analytics-vidhya/named-entity-recognition-for-turkish-with-bert-f8ec04a31b0
# this function formats the predictions by removing the padding
# so that we can line it up directly with original data

batch_size, seq_len = preds.shape

# One list of predicted label strings per document. Positions whose true label
# id is -100 are skipped, so only the labelled positions remain.
preds_list = [
    [
        id_to_label[preds[row][col]]
        for col in range(seq_len)
        if label_ids[row, col] != -100  # torch.nn.CrossEntropyLoss().ignore_index
    ]
    for row in range(batch_size)
]

In [None]:
# you can see the number of predicted values and the actual values for a 
# given doc is the same, implying that the predictions line up
# to the actuals because we set labels to -100 for [cls], [sep], and ## subwords
len(preds_list[6]), len(dataset['test_tokens'][6])

(13, 13)

In [None]:
preds_stretched = [label for doc in preds_list for label in doc]
trues_stretched = [label for doc in dataset['test_labels'] for label in doc]

In [None]:
from sklearn.metrics import classification_report

# Build the report before touching the file, so a scoring failure cannot leave
# an empty report file behind.
class_report = classification_report(trues_stretched, preds_stretched)

# The context manager closes the file even if the write raises.
with open(model_dir + folder_name + '/model'  + '/class_report_test.txt', 'w') as f:
    print(class_report, file=f)

print(class_report)

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

          B-ACTOR       0.92      0.94      0.93       812
      B-CHARACTER       0.69      0.62      0.65        90
       B-DIRECTOR       0.93      0.88      0.90       456
          B-GENRE       0.95      0.95      0.95      1117
           B-PLOT       0.80      0.76      0.78       491
         B-RATING       0.98      0.97      0.97       500
B-RATINGS_AVERAGE       0.94      0.92      0.93       451
         B-REVIEW       0.45      0.18      0.26        56
           B-SONG       0.73      0.65      0.69        54
          B-TITLE       0.86      0.93      0.89       562
        B-TRAILER       0.82      0.90      0.86        30
           B-YEAR       0.96      0.95      0.95       720
          I-ACTOR       0.93      0.94      0.93       862
      I-CHARACTER       0.67      0.49      0.57        75
       I-DIRECTOR       0.92      0.88      0.90       496
          I-GENRE       0.90      0.74      0.81       

In [None]:
!pip install seqeval

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 20.8MB/s eta 0:00:01[K     |███████████████                 | 20kB 27.9MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 24.1MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 18.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.9MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=533c9d01e3de1a920c630db12ee61939e57ea85a72e01f1de6dd83b3f483fc3f
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# Build the entity-level report before touching the file, so a scoring failure
# cannot leave an empty report file behind.
seq_class_report = classification_report_seqeval(dataset['test_labels'], preds_list)

# The context manager closes the file even if the write raises.
with open(model_dir + folder_name + '/model'  + '/seq_class_report_test.txt', 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

                 precision    recall  f1-score   support

          ACTOR       0.89      0.92      0.91       812
      CHARACTER       0.60      0.58      0.59        90
       DIRECTOR       0.89      0.87      0.88       456
          GENRE       0.92      0.93      0.93      1117
           PLOT       0.71      0.74      0.72       491
         RATING       0.95      0.94      0.94       500
RATINGS_AVERAGE       0.84      0.87      0.85       451
         REVIEW       0.18      0.07      0.10        56
           SONG       0.59      0.54      0.56        54
          TITLE       0.80      0.89      0.84       562
        TRAILER       0.82      0.90      0.86        30
           YEAR       0.94      0.94      0.94       720

      micro avg       0.87      0.88      0.87      5339
      macro avg       0.76      0.77      0.76      5339
   weighted avg       0.86      0.88      0.87      5339



## Bert Model

Documentation for this model can be found: 
https://huggingface.co/transformers/model_doc/bert.html#overview

Following HuggingFace distilbert tutorial: https://huggingface.co/transformers/custom_datasets.html#tok-ner


In [None]:
!pip install transformers==3.5.1

Collecting transformers==3.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 16.9MB/s eta 0:00:01[K     |▌                               | 20kB 23.8MB/s eta 0:00:01[K     |▊                               | 30kB 28.3MB/s eta 0:00:01[K     |█                               | 40kB 19.8MB/s eta 0:00:01[K     |█▎                              | 51kB 18.0MB/s eta 0:00:01[K     |█▌                              | 61kB 19.7MB/s eta 0:00:01[K     |█▊                              | 71kB 12.3MB/s eta 0:00:01[K     |██                              | 81kB 12.9MB/s eta 0:00:01[K     |██▎                             | 92kB 12.7MB/s eta 0:00:01[K     |██▌                             | 102kB 12.3MB/s eta 0:00:01[K     |██▊                             | 112kB 12.3MB/s eta 0:00:01[K     |███                        

### Encode Labels

In [None]:
# Get the unique labels in the movie dataset.
# The labels are sorted so that the label -> id mapping built from them is the
# same on every kernel run; iteration order of a set of strings is not stable
# across Python processes.
uniq_labels = sorted(set(label for doc in dataset['train_labels'] for label in doc))

In [None]:
# assign a number to each label
label_encoding = {label: id for id, label in enumerate(uniq_labels)}
label_encoding

{'B-ACTOR': 13,
 'B-CHARACTER': 24,
 'B-DIRECTOR': 19,
 'B-GENRE': 14,
 'B-PLOT': 12,
 'B-RATING': 20,
 'B-RATINGS_AVERAGE': 1,
 'B-REVIEW': 2,
 'B-SONG': 3,
 'B-TITLE': 10,
 'B-TRAILER': 6,
 'B-YEAR': 15,
 'I-ACTOR': 5,
 'I-CHARACTER': 23,
 'I-DIRECTOR': 4,
 'I-GENRE': 7,
 'I-PLOT': 0,
 'I-RATING': 8,
 'I-RATINGS_AVERAGE': 21,
 'I-REVIEW': 9,
 'I-SONG': 16,
 'I-TITLE': 18,
 'I-TRAILER': 11,
 'I-YEAR': 22,
 'O': 17}

### Encode Texts

To encode the texts, we have to use the same tokenizer that BERT was trained with.

In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
print('There are %s words in this tokenizer object' % tokenizer.vocab_size)

There are 30522 words in this tokenizer object


In [None]:
# use the tokenizer to encode the texts 
train_encodings = tokenizer(dataset['train_tokens'], 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

test_encodings = tokenizer(dataset['test_tokens'], 
                           is_split_into_words=True, 
                           return_offsets_mapping=True, 
                           padding=True, 
                           truncation=True)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [None]:
# make sure same number of docs
len(train_encodings['input_ids']), len(dataset['train_tokens'])

(9775, 9775)

In [None]:
dataset['train_tokens'][0]

['what', 'movies', 'star', 'bruce', 'willis']

In [None]:
# preview what the encoded result looks like
list(zip(train_encodings['input_ids'][0][0:12], train_encodings['attention_mask'][0][0:12], train_encodings['offset_mapping'][0][0:12]))

[(101, 1, (0, 0)),
 (2054, 1, (0, 4)),
 (5691, 1, (0, 6)),
 (2732, 1, (0, 4)),
 (5503, 1, (0, 5)),
 (12688, 1, (0, 6)),
 (102, 1, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0))]

In [None]:
# the model expects all docs to be same length (51) 
# the attention mask will tell the model to ignore the padding with zeros
print('Length of sequences is %s ' % len(train_encodings['input_ids'][1]))

Length of sequences is 51 


In [None]:
# first document in dataset
print( dataset['train_tokens'][0] )

# check out new tokenization result as words
print( tokenizer.convert_ids_to_tokens( train_encodings['input_ids'][0][0:9]) )

# check out new tokenization result as ids
print( [val for val in train_encodings['input_ids'][0] if val != 0] )

['what', 'movies', 'star', 'bruce', 'willis']
['[CLS]', 'what', 'movies', 'star', 'bruce', 'willis', '[SEP]', '[PAD]', '[PAD]']
[101, 2054, 5691, 2732, 5503, 12688, 102]


In [None]:
# Index of every document holding a token whose offset does not start at 0.
# The index appears once for each such token.
find_subwords = [
    doc_idx
    for doc_idx, doc_offsets in enumerate(train_encodings['offset_mapping'])
    for pair in doc_offsets
    if pair[0] != 0
]

In [None]:
np.unique(find_subwords)

array([   1,    2,    3, ..., 9752, 9767, 9772])

In [None]:
dataset['train_tokens'][1]

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']

In [None]:
list(zip(train_encodings['input_ids'][1][0:12], train_encodings['attention_mask'][1][0:12], train_encodings['offset_mapping'][1][0:12]))

[(101, 1, (0, 0)),
 (2265, 1, (0, 4)),
 (2033, 1, (0, 2)),
 (3152, 1, (0, 5)),
 (2007, 1, (0, 4)),
 (3881, 1, (0, 4)),
 (6287, 1, (0, 5)),
 (5974, 1, (5, 9)),
 (2013, 1, (0, 4)),
 (1996, 1, (0, 3)),
 (3865, 1, (0, 5)),
 (102, 1, (0, 0))]

In [None]:
[tokenizer.convert_ids_to_tokens(val) for val in train_encodings['input_ids'][1][0:12]]

['[CLS]',
 'show',
 'me',
 'films',
 'with',
 'drew',
 'barry',
 '##more',
 'from',
 'the',
 '1980s',
 '[SEP]']

### Adjust Labels for Vocab Offset


In the printout above you can see that "barrymore" gets split into "barry" and "##more". This is because the full name is not in the vocabulary of the model, while the smaller subwords "barry" and "##more" are. The offset mapping records this split: a token whose offset does not start at 0 (here `(5, 9)`) is a continuation of the previous word. Because the dataset has one label per original word, we now have to adjust the labels to account for the extra subword tokens. 

Based on https://datascience.stackexchange.com/questions/69640/what-should-be-the-labels-for-subword-tokens-in-bert-for-ner-task we will not drag the label to the new subword feature because that would introduce more instances of that class and change the number of support instances thus making the models difficult to compare. 

Also some comments to consider for an alternative strategy: https://github.com/google-research/bert/issues/646


In [None]:
import numpy as np

def adjust_labels_for_offset(original_labels, label_dictionary, encodings):
    """Spread word-level labels over the tokenizer's sub-word positions.

    For every document, the numeric label of each word is written onto the
    position whose offset pair is ``(0, n)`` with ``n != 0``. Every other
    position keeps the filler value -100.

    Args:
        original_labels: list of documents, each a list of label strings.
        label_dictionary: mapping from label string to integer id.
        encodings: object exposing ``offset_mapping``, one sequence of
            ``(start, end)`` pairs per document.

    Returns:
        A list with one list of ints per document, each as long as that
        document's offset sequence.

    Raises:
        KeyError: if a label is missing from ``label_dictionary``.
        ValueError: if the number of labelled positions in a document differs
            from the number of labels supplied for it.
    """
    # convert to the numeric encoding of the label
    labels = [[label_dictionary[label] for label in doc] for doc in original_labels]

    encoded_labels = []
    for doc_idx, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):

        # reshape keeps the (n, 2) layout even when the offset list is empty
        arr_offset = np.asarray(doc_offset, dtype=int).reshape(-1, 2)

        # positions whose first offset is 0 and whose second offset is not 0
        is_word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

        # Check explicitly: NumPy would otherwise raise an opaque shape error,
        # or silently broadcast a single label onto every position.
        n_positions = int(is_word_start.sum())
        if n_positions != len(doc_labels):
            raise ValueError(
                'document %d has %d labels but %d labelled token positions'
                % (doc_idx, len(doc_labels), n_positions)
            )

        # start from an array of -100 and fill in the labelled positions
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)
        doc_enc_labels[is_word_start] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
train_labels = adjust_labels_for_offset(dataset['train_labels'], 
                                        label_encoding, 
                                        train_encodings)

test_labels = adjust_labels_for_offset(dataset['test_labels'],
                                      label_encoding, 
                                      test_encodings)

In [None]:
id_to_label = {id: label for (label,id) in label_encoding.items()}
id_to_label[-100] = 'X'

#print( tokenizer.convert_ids_to_tokens(encoding_example) )
#print([id_to_label[id] for id in train_labels[0][0:9]])
list(zip( tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0][0:15]), [id_to_label[id] for id in train_labels[0][0:15]], train_labels[0][0:15] )) 

[('[CLS]', 'X', -100),
 ('what', 'O', 17),
 ('movies', 'O', 17),
 ('star', 'O', 17),
 ('bruce', 'B-ACTOR', 13),
 ('willis', 'I-ACTOR', 5),
 ('[SEP]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100)]

In [None]:
np.unique(train_labels)

array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
         10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
         21,   22,   23,   24])

### Prepare Pytorch Datasets

https://huggingface.co/transformers/custom_datasets.html#ft-trainer


In [None]:
import torch

# pytorch is expecting a certain type of dataset 

class pt_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    ``encodings`` is a mapping from field name to a per-document sequence and
    ``labels`` is a per-document sequence of label ids. Indexing returns a
    dict with one tensor per encoding field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # one sample per labelled document
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, per_doc_values in self.encodings.items():
            sample[field] = torch.tensor(per_doc_values[idx])
        # added last, so 'labels' always ends up as the final key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Remove the offset_mapping before building the datasets: pt_dataset turns
# every remaining key of the encodings into a tensor for each sample.
# The default (None) makes this cell safe to re-run; without it a second
# execution raises KeyError because the key is already gone.
train_encodings.pop("offset_mapping", None)
test_encodings.pop("offset_mapping", None)

train_dataset = pt_dataset(train_encodings, train_labels)
test_dataset = pt_dataset(test_encodings, test_labels)

### Train Model

In [None]:
from transformers import BertForTokenClassification

# load the pretrained model from huggingface
#model = BertForTokenClassification.from_pretrained('bert-large-uncased', num_labels=len(uniq_labels), cache_dir= model_dir +  'mitmovie_pt_bert_uncased/cache')
def model_init():
  """Return a freshly loaded 'bert-large-uncased' token-classification model.

  The classification head has one output per entry of ``uniq_labels``.
  The model is not moved to a device here (the model.to(device) call was
  left disabled in the original).
  """
  return BertForTokenClassification.from_pretrained(
      'bert-large-uncased',
      num_labels=len(uniq_labels),
  )

In [None]:
!pip install seqeval

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 21.2MB/s eta 0:00:01[K     |███████████████                 | 20kB 28.7MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 29.6MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 20.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.8MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=c5f93fe9ee9c175e9802a6b7749d9b4fbe30b4387a2f897f7ba4b74bd667589f
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
#from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# function for computing monitoring metrics during training

def compute_metrics(p):
    """Token-level monitoring metrics computed from one evaluation pass.

    Parameters
    ----------
    p : pair ``(predictions, labels)``
        ``predictions`` holds per-position class scores (the argmax is taken
        over axis 2); ``labels`` holds the matching label ids, where -100
        marks positions that must be ignored.

    Returns
    -------
    dict
        ``accuracy_score`` plus macro-averaged ``precision``, ``recall`` and
        ``f1``, computed over the non-ignored positions of all documents
        pooled into one flat list.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    preds_stretched = []
    trues_stretched = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        for pred_id, gold_id in zip(pred_row, gold_row):
            # skip ignored positions (label id -100)
            if gold_id == -100:
                continue
            preds_stretched.append(id_to_label[pred_id])
            trues_stretched.append(id_to_label[gold_id])

    results = {}
    results["accuracy_score"] = accuracy_score(trues_stretched, preds_stretched)
    results["precision"] = precision_score(trues_stretched, preds_stretched, average='macro')
    results["recall"] = recall_score(trues_stretched, preds_stretched, average='macro')
    results["f1"] = f1_score(trues_stretched, preds_stretched, average='macro')
    return results



In [None]:
from transformers import Trainer, TrainingArguments

# boilerplate code from huggingface to launch a trainer instance
# sets directories and baseline configuration for batch sizes and weight decay
# NOTE: the test split is passed as eval_dataset, so the per-epoch metrics
# reported during training are measured on the test data.

training_args = TrainingArguments(
    output_dir = model_dir +  'mitmovie_pt_bert_uncased/results',          # output directory
    overwrite_output_dir = True,
    num_train_epochs = 10,              # total number of training epochs
    evaluation_strategy = 'epoch',      # evaluate once per epoch
    per_device_train_batch_size = 32,  # batch size per device during training
    per_device_eval_batch_size = 64,   # batch size for evaluation
    warmup_steps= 500,                # number of warmup steps for learning rate scheduler
    weight_decay= 0.00,               # strength of weight decay
    learning_rate = 3e-5,
    logging_dir = model_dir +  'mitmovie_pt_bert_uncased/logs',            # directory for storing logs
    logging_steps=10,
    #load_best_model_at_end = True,
    #metric_for_best_model = 'eval_f1'
)

trainer = Trainer(
    #model=model,                         # the instantiated 🤗 Transformers model to be trained
    model_init = model_init,             # callable that builds the model (defined above)
    args=training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset,             # evaluation dataset
    compute_metrics = compute_metrics      # accuracy / macro precision, recall, f1
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when CUDA is in use: get_device_name(0) raises on a
# CPU-only runtime, which would break this cell despite the CPU fallback above.
torch.cuda.get_device_name(0) if device.type == "cuda" else 'cpu'

'Tesla V100-SXM2-16GB'

In [None]:
from datetime import datetime

# wall-clock start, used to report the total training time below
start = datetime.now()

# run the fine-tuning and keep what trainer.train() returns for this run
training_results = trainer.train()

print('Time to train:', datetime.now() - start)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large

Epoch,Training Loss,Validation Loss,Accuracy Score,Precision,Recall,F1
1,0.283661,0.289162,0.934052,0.772672,0.714371,0.73169
2,0.254776,0.251477,0.941789,0.78242,0.783112,0.777286
3,0.161359,0.253897,0.943166,0.797175,0.772685,0.780664
4,0.0952,0.269191,0.944017,0.814949,0.776235,0.785371
5,0.073114,0.283807,0.945759,0.801928,0.787206,0.792692
6,0.049426,0.303408,0.944543,0.798888,0.79423,0.795184
7,0.023676,0.332908,0.945192,0.799014,0.791555,0.792475
8,0.022968,0.362411,0.944098,0.838828,0.799285,0.802444
9,0.022095,0.391702,0.945192,0.807565,0.791872,0.797737
10,0.007721,0.401736,0.945313,0.844477,0.793222,0.8028


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Time to train: 0:27:52.543799


In [None]:
trainer.save_model(model_dir + 'mitmovie_pt_bert_uncased/model')
#model.save_pretrained(model_dir + 'mitmovie_pt_bert_uncased/model')

### Evaluate Model

In [None]:
# model = None
# trainer = None

In [None]:
# from transformers import BertForTokenClassification

# # retreive the saved model 
# model = BertForTokenClassification.from_pretrained(model_dir + 'mitmovie_pt_bert_uncased/model', num_labels=len(uniq_labels))

In [None]:
# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset = train_dataset,         # training dataset
#     eval_dataset = test_dataset             # evaluation dataset
# )

In [None]:
# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
output, label_ids, metrics = trainer.predict(test_dataset)

In [None]:
metrics
# 0.24249209463596344

{'eval_accuracy_score': 0.9453131329498501,
 'eval_f1': 0.802799642924628,
 'eval_loss': 0.40173640847206116,
 'eval_precision': 0.8444770060074938,
 'eval_recall': 0.7932217764158566}

In [None]:
# convert output which is logits to index of max logit
preds = np.argmax(output, axis=2)
preds.shape

(2443, 43)

In [None]:
# https://medium.com/analytics-vidhya/named-entity-recognition-for-turkish-with-bert-f8ec04a31b0
# this cell turns the predicted ids into label strings and drops every position
# whose gold label id is -100, so that the result lines up with the original data

batch_size, seq_len = preds.shape

# one list of predicted label strings per document; the inner lists are ragged
# because only the non-ignored positions are kept
preds_list = [[] for _ in range(batch_size)]
for i in range(batch_size):
  for j in range(seq_len):
    # keep only positions with a real label (-100 marks ignored positions)
    if label_ids[i, j] != -100: # torch.nn.CrossEntropyLoss().ignore_index:
      preds_list[i].append(id_to_label[preds[i][j]])

In [None]:
# you can see the number of predicted values and the actual values for a 
# given doc is the same, implying that the predictions line up
# to the actuals because we set labels to -100 for [cls], [sep], and ## subwords
len(preds_list[6]), len(dataset['test_tokens'][6])

(13, 13)

In [None]:
preds_stretched = [label for doc in preds_list for label in doc]
trues_stretched = [label for doc in dataset['test_labels'] for label in doc]

In [None]:
from sklearn.metrics import classification_report

# Token-level (per B-/I- tag) report on the flattened test predictions.
# It is computed before the file is opened so a failure here cannot leave an
# empty, truncated report file behind.
class_report = classification_report(trues_stretched, preds_stretched)

# The with-block closes the file even if the write raises.
with open(model_dir + 'mitmovie_pt_bert_uncased/model'  + '/class_report_test.txt', 'w') as f:
    print(class_report, file=f)

print(class_report)

                   precision    recall  f1-score   support

          B-ACTOR       0.94      0.96      0.95       812
      B-CHARACTER       0.66      0.74      0.70        90
       B-DIRECTOR       0.94      0.91      0.92       456
          B-GENRE       0.94      0.96      0.95      1117
           B-PLOT       0.76      0.76      0.76       491
         B-RATING       0.96      0.96      0.96       500
B-RATINGS_AVERAGE       0.94      0.91      0.92       451
         B-REVIEW       0.40      0.45      0.42        56
           B-SONG       0.73      0.76      0.75        54
          B-TITLE       0.90      0.91      0.90       562
        B-TRAILER       0.84      0.90      0.87        30
           B-YEAR       0.93      0.96      0.94       720
          I-ACTOR       0.94      0.95      0.94       862
      I-CHARACTER       0.66      0.60      0.63        75
       I-DIRECTOR       0.95      0.89      0.92       496
          I-GENRE       0.84      0.76      0.80       

In [None]:
!pip install seqeval



In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# seqeval report on the per-document label sequences (the rows of the report
# are entity types rather than individual B-/I- tags).
# It is computed before the file is opened so a failure here cannot leave an
# empty, truncated report file behind.
seq_class_report = classification_report_seqeval(dataset['test_labels'], preds_list)

# The with-block closes the file even if the write raises.
with open(model_dir + 'mitmovie_pt_bert_uncased/model'  + '/seq_class_report_test.txt', 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

                 precision    recall  f1-score   support

          ACTOR       0.92      0.95      0.94       812
      CHARACTER       0.62      0.72      0.67        90
       DIRECTOR       0.93      0.90      0.92       456
          GENRE       0.90      0.93      0.91      1117
           PLOT       0.69      0.74      0.71       491
         RATING       0.93      0.94      0.93       500
RATINGS_AVERAGE       0.87      0.86      0.86       451
         REVIEW       0.34      0.39      0.37        56
           SONG       0.61      0.67      0.64        54
          TITLE       0.86      0.89      0.88       562
        TRAILER       0.82      0.90      0.86        30
           YEAR       0.92      0.95      0.93       720

      micro avg       0.87      0.89      0.88      5339
      macro avg       0.79      0.82      0.80      5339
   weighted avg       0.87      0.89      0.88      5339



In [None]:
# Persist the flattened token-level predictions and gold labels of the BERT run.
# NOTE(review): no `import pickle` is visible in this part of the notebook —
# confirm that an earlier cell imports it.
with open(model_dir + 'mitmovie_pt_bert_uncased/' + 'preds_stretched.pickle', 'wb') as handle:
    pickle.dump(preds_stretched, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(model_dir + 'mitmovie_pt_bert_uncased/' + 'trues_stretched.pickle', 'wb') as handle:
    pickle.dump(trues_stretched, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Roberta Model

Documentation for this model can be found at:
https://huggingface.co/transformers/model_doc/roberta.html

Following HuggingFace distilbert tutorial: https://huggingface.co/transformers/custom_datasets.html#tok-ner


In [None]:
!pip install transformers



### Encode Labels

In [None]:
# collect every distinct tag that occurs in the training labels
uniq_labels = {label for doc in dataset['train_labels'] for label in doc}

In [None]:
# assign a number to each label
# Sort first: the iteration order of a set of strings is not stable across
# Python sessions, so enumerating the raw set gives every run a different
# label -> id mapping. A model saved in one session would then be decoded with
# the wrong labels after a kernel restart.
label_encoding = {label: id for id, label in enumerate(sorted(uniq_labels))}
label_encoding

{'B-ACTOR': 13,
 'B-CHARACTER': 12,
 'B-DIRECTOR': 8,
 'B-GENRE': 19,
 'B-PLOT': 23,
 'B-RATING': 16,
 'B-RATINGS_AVERAGE': 14,
 'B-REVIEW': 4,
 'B-SONG': 7,
 'B-TITLE': 17,
 'B-TRAILER': 22,
 'B-YEAR': 18,
 'I-ACTOR': 6,
 'I-CHARACTER': 5,
 'I-DIRECTOR': 24,
 'I-GENRE': 11,
 'I-PLOT': 0,
 'I-RATING': 3,
 'I-RATINGS_AVERAGE': 2,
 'I-REVIEW': 9,
 'I-SONG': 10,
 'I-TITLE': 20,
 'I-TRAILER': 1,
 'I-YEAR': 15,
 'O': 21}

### Encode Texts

To encode the texts, we have to use the same Tokenizer that Roberta was trained on.

In [None]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large', add_prefix_space=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
print('There are %s words in this tokenizer object' % tokenizer.vocab_size)

There are 50265 words in this tokenizer object


In [None]:
toked = tokenizer.tokenize('what movies start bruce willis')
tokenizer('what movies start bruce willis')

{'input_ids': [0, 99, 4133, 386, 29435, 1755, 40, 354, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# use the tokenizer to encode the texts 
# - is_split_into_words: each document is already a list of words
# - return_offsets_mapping: also return an 'offset_mapping' entry per token
# - padding / truncation: pad (and, if needed, truncate) the documents of each
#   call to a common length; train and test are encoded in separate calls
train_encodings = tokenizer(dataset['train_tokens'], 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

test_encodings = tokenizer(dataset['test_tokens'], 
                           is_split_into_words=True, 
                           return_offsets_mapping=True, 
                           padding=True, 
                           truncation=True)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

In [None]:
# make sure same number of docs
len(train_encodings['input_ids']), len(dataset['train_tokens'])

(9775, 9775)

In [None]:
dataset['train_tokens'][0]

['what', 'movies', 'star', 'bruce', 'willis']

In [None]:
# preview what the encoded result looks like
list(zip(train_encodings['input_ids'][0][0:12], train_encodings['attention_mask'][0][0:12], train_encodings['offset_mapping'][0][0:12]))

[(0, 1, (0, 0)),
 (99, 1, (1, 4)),
 (4133, 1, (1, 6)),
 (999, 1, (1, 4)),
 (29435, 1, (1, 3)),
 (1755, 1, (3, 5)),
 (40, 1, (1, 4)),
 (354, 1, (4, 6)),
 (2, 1, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0))]

In [None]:
# the model expects all docs to be same length (56) 
# the attention mask will tell the model to ignore the padding with zeros
print('Length of sequences is %s ' % len(train_encodings['input_ids'][1]))

Length of sequences is 56 


In [None]:
# first document in dataset
print( dataset['train_tokens'][0] )

# check out new tokenization result as words
print( tokenizer.convert_ids_to_tokens( train_encodings['input_ids'][0][0:9]) )

# check out new tokenization result as ids, without the padding.
# For this tokenizer id 0 is '<s>' and '<pad>' is id 1, so comparing against 0
# would drop the start token and keep all the padding; use the tokenizer's own
# pad id instead.
print( [val for val in train_encodings['input_ids'][0] if val != tokenizer.pad_token_id] )

['what', 'movies', 'star', 'bruce', 'willis']
['<s>', 'Ġwhat', 'Ġmovies', 'Ġstar', 'Ġbru', 'ce', 'Ġwill', 'is', '</s>']
[99, 4133, 999, 29435, 1755, 40, 354, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
dataset['train_tokens'][1]

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']

In [None]:
list(zip(train_encodings['input_ids'][1][0:15], train_encodings['attention_mask'][1][0:15], train_encodings['offset_mapping'][1][0:15]))

[(0, 1, (0, 0)),
 (311, 1, (1, 4)),
 (162, 1, (1, 2)),
 (3541, 1, (1, 5)),
 (19, 1, (1, 4)),
 (4855, 1, (1, 4)),
 (2003, 1, (1, 3)),
 (1506, 1, (3, 5)),
 (4321, 1, (5, 9)),
 (31, 1, (1, 4)),
 (5, 1, (1, 3)),
 (5114, 1, (1, 4)),
 (29, 1, (4, 5)),
 (2, 1, (0, 0)),
 (1, 0, (0, 0))]

In [None]:
[(val,tokenizer.convert_ids_to_tokens(val)) for val in train_encodings['input_ids'][1][0:15]]

[(0, '<s>'),
 (311, 'Ġshow'),
 (162, 'Ġme'),
 (3541, 'Ġfilms'),
 (19, 'Ġwith'),
 (4855, 'Ġdrew'),
 (2003, 'Ġbar'),
 (1506, 'ry'),
 (4321, 'more'),
 (31, 'Ġfrom'),
 (5, 'Ġthe'),
 (5114, 'Ġ1980'),
 (29, 's'),
 (2, '</s>'),
 (1, '<pad>')]

In [None]:
#set([offsets for doc in train_encodings['offset_mapping'] for offsets in doc])
# For every training doc record the pair
#   (number of word-level labels, number of tokens whose offset starts at 1)
# to check whether "offset start == 1" identifies exactly one token per word.
collect = []
for labels,offsets in zip(dataset['train_labels'], train_encodings['offset_mapping']):
  collect.append((len(labels), len([offset for offset in offsets if offset[0] == 1 ])))

In [None]:
# docs (index, counts) where the number of labels and the number of offset-start-1 tokens disagree
problems = [ (i,col) for (i, col) in enumerate(collect) if col[0] != col[1]]

In [None]:
problems[0:10]

[(10, (5, 6)),
 (16, (7, 8)),
 (17, (8, 9)),
 (19, (11, 12)),
 (20, (8, 9)),
 (26, (12, 14)),
 (29, (6, 7)),
 (30, (7, 8)),
 (34, (11, 12)),
 (40, (6, 7))]

In [None]:
dataset['train_tokens'][10]

['what', 'movie', 'is', 'references', 'zydrate']

In [None]:
[(val,tokenizer.convert_ids_to_tokens(val)) for val in train_encodings['input_ids'][10][0:15]]

[(0, '<s>'),
 (99, 'Ġwhat'),
 (1569, 'Ġmovie'),
 (16, 'Ġis'),
 (13115, 'Ġreferences'),
 (992, 'Ġz'),
 (9611, 'yd'),
 (7954, 'rate'),
 (2, '</s>'),
 (1, '<pad>'),
 (1, '<pad>'),
 (1, '<pad>'),
 (1, '<pad>'),
 (1, '<pad>'),
 (1, '<pad>')]

In [None]:
list(zip(train_encodings['input_ids'][10][0:15], train_encodings['attention_mask'][10][0:15], train_encodings['offset_mapping'][10][0:15]))

[(0, 1, (0, 0)),
 (99, 1, (1, 4)),
 (1569, 1, (1, 5)),
 (16, 1, (1, 2)),
 (13115, 1, (1, 10)),
 (992, 1, (1, 1)),
 (9611, 1, (1, 3)),
 (7954, 1, (3, 7)),
 (2, 1, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0)),
 (1, 0, (0, 0))]

In [None]:
tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][10][0:15])

['<s>',
 'Ġwhat',
 'Ġmovie',
 'Ġis',
 'Ġreferences',
 'Ġz',
 'yd',
 'rate',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [None]:
# print the positions of tokens in doc 10 that contain neither 'Ġ' nor '<',
# i.e. tokens that are neither word-initial pieces nor <s>/</s>/<pad>
for i, token in enumerate(tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][10][0:15])):
  if ('Ġ' not in token) and ('<' not in token):
    print(i)

6
7


### Adjust Labels for Vocab Offset


 

Following https://datascience.stackexchange.com/questions/69640/what-should-be-the-labels-for-subword-tokens-in-bert-for-ner-task, we do not propagate a word's label to its additional subword tokens: doing so would introduce more instances of that class and change the support counts, which would make the models difficult to compare.


In [None]:
import numpy as np

def adjust_labels_for_offset(original_labels, label_dictionary, encodings):
    """Spread word-level labels over the token positions of each encoded doc.

    Parameters
    ----------
    original_labels : per-document lists of label strings (one per word)
    label_dictionary : mapping from label string to numeric id
    encodings : tokenizer output; only ``encodings.input_ids`` is read

    Returns
    -------
    list of lists of int
        One list per document, as long as that document's ``input_ids``.
        Tokens containing 'Ġ' (the "primary" token of a word) receive the
        word's label id, in order; every other position is -100.

    Relies on the notebook-level ``tokenizer`` to turn ids back into tokens.
    """
    # label strings -> numeric ids, for all documents up front
    numeric_labels = []
    for doc in original_labels:
        numeric_labels.append([label_dictionary[label] for label in doc])

    encoded_labels = []
    for doc_label_ids, token_ids in zip(numeric_labels, encodings.input_ids):
        # start with -100 everywhere
        aligned = np.full(len(token_ids), -100, dtype=int)

        # positions of the "primary" tokens, i.e. those containing 'Ġ'
        primary_positions = []
        for position, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
            if 'Ġ' in token:
                primary_positions.append(position)

        aligned[primary_positions] = doc_label_ids
        encoded_labels.append(aligned.tolist())

    return encoded_labels

In [None]:
# Align the word-level labels with the Roberta tokens: the token of each word
# that contains 'Ġ' gets the word's label id, every other position gets -100.
train_labels = adjust_labels_for_offset(dataset['train_labels'], 
                                        label_encoding, 
                                        train_encodings)

test_labels = adjust_labels_for_offset(dataset['test_labels'],
                                      label_encoding, 
                                      test_encodings)

In [None]:
# reverse lookup: numeric id -> label string
id_to_label = {id: label for (label,id) in label_encoding.items()}
# -100 is the "no label" filler value; show it as 'X' when decoding
id_to_label[-100] = 'X'

#print( tokenizer.convert_ids_to_tokens(encoding_example) )
#print([id_to_label[id] for id in train_labels[0][0:9]])
# preview the first 15 positions of the first training doc as (token, label name, label id)
list(zip( tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0][0:15]), [id_to_label[id] for id in train_labels[0][0:15]], train_labels[0][0:15] )) 

[('<s>', 'X', -100),
 ('Ġwhat', 'O', 21),
 ('Ġmovies', 'O', 21),
 ('Ġstar', 'O', 21),
 ('Ġbru', 'B-ACTOR', 13),
 ('ce', 'X', -100),
 ('Ġwill', 'I-ACTOR', 6),
 ('is', 'X', -100),
 ('</s>', 'X', -100),
 ('<pad>', 'X', -100),
 ('<pad>', 'X', -100),
 ('<pad>', 'X', -100),
 ('<pad>', 'X', -100),
 ('<pad>', 'X', -100),
 ('<pad>', 'X', -100)]

In [None]:
np.unique(train_labels)

array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
         10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
         21,   22,   23,   24])

### Prepare Pytorch Datasets

https://huggingface.co/transformers/custom_datasets.html#ft-trainer


In [None]:
import torch

# pytorch is expecting a certain type of dataset 

class pt_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    ``encodings`` is a mapping from field name to a per-document sequence and
    ``labels`` is a per-document sequence of label ids. Indexing returns a
    dict with one tensor per encoding field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # one sample per labelled document
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, per_doc_values in self.encodings.items():
            sample[field] = torch.tensor(per_doc_values[idx])
        # added last, so 'labels' always ends up as the final key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Remove the offset_mapping before building the datasets: pt_dataset turns
# every remaining key of the encodings into a tensor for each sample.
# The default (None) makes this cell safe to re-run; without it a second
# execution raises KeyError because the key is already gone.
train_encodings.pop("offset_mapping", None)
test_encodings.pop("offset_mapping", None)

train_dataset = pt_dataset(train_encodings, train_labels)
test_dataset = pt_dataset(test_encodings, test_labels)

### Train Model

In [None]:
from transformers import RobertaForTokenClassification

# load the pretrained model from huggingface
model = RobertaForTokenClassification.from_pretrained('roberta-large', num_labels=len(uniq_labels))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be ab

In [None]:
from transformers import Trainer, TrainingArguments

# boilerplate code from huggingface to launch a trainer instance
# sets directories and baseline configuration for batch sizes and weight decay
# NOTE: unlike the BERT run above, no compute_metrics / evaluation_strategy is
# passed here, so only the training loss is reported while training.

training_args = TrainingArguments(
    output_dir = model_dir +  'mitmovie_pt_roberta_lg/results',          # output directory
    overwrite_output_dir = True,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir = model_dir +  'mitmovie_pt_roberta_lg/logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset             # evaluation dataset (the test split)
)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when CUDA is in use: get_device_name(0) raises on a
# CPU-only runtime, which would break this cell despite the CPU fallback above.
torch.cuda.get_device_name(0) if device.type == "cuda" else 'cpu'

'Tesla V100-SXM2-16GB'

In [None]:
from datetime import datetime

# wall-clock start, used to report the total training time below
start = datetime.now()

# run the fine-tuning (the return value is not kept for this model)
trainer.train()

print('Time to train:', datetime.now() - start)

Step,Training Loss
10,3.443513
20,3.305709
30,2.939076
40,2.147003
50,1.741473
60,1.45231
70,1.234856
80,1.008473
90,0.865648
100,0.706747


Time to train: 0:09:47.493211


In [None]:
trainer.save_model(model_dir + 'mitmovie_pt_roberta_lg/model')
#model.save_pretrained(model_dir + 'mitmovie_pt_roberta_lg/model')

### Evaluate Model

In [None]:
model = None
trainer = None

In [None]:
# retrieve the fine-tuned model from the directory it was saved to above;
# num_labels is again the size of the label set
model = RobertaForTokenClassification.from_pretrained(model_dir + 'mitmovie_pt_roberta_lg/model', num_labels=len(uniq_labels))

In [None]:
model.eval();

In [None]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset             # evaluation dataset
)

In [None]:
# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
output, label_ids, metrics = trainer.predict(test_dataset)

In [None]:
metrics
# 'eval_loss': 0.2402782440185547

{'eval_loss': 0.24496054649353027}

In [None]:
# convert output which is logits to index of max logit
preds = np.argmax(output, axis=2)
preds.shape

(2443, 46)

In [None]:
# https://medium.com/analytics-vidhya/named-entity-recognition-for-turkish-with-bert-f8ec04a31b0
# this cell turns the predicted ids into label strings and drops every position
# whose gold label id is -100, so that the result lines up with the original data

batch_size, seq_len = preds.shape

# one list of predicted label strings per document; the inner lists are ragged
# because only the non-ignored positions are kept
preds_list = [[] for _ in range(batch_size)]
for i in range(batch_size):
  for j in range(seq_len):
    # keep only positions with a real label (-100 marks ignored positions)
    if label_ids[i, j] != -100: # torch.nn.CrossEntropyLoss().ignore_index:
      preds_list[i].append(id_to_label[preds[i][j]])

In [None]:
# you can see the number of predicted values and the actual values for a 
# given doc is the same, implying that the predictions line up
# to the actuals because we set labels to -100 for <s>, </s>, <pad> and
# every subword token that does not contain 'Ġ'
len(preds_list[6]), len(dataset['test_tokens'][6])

(13, 13)

In [None]:
preds_stretched = [label for doc in preds_list for label in doc]
trues_stretched = [label for doc in dataset['test_labels'] for label in doc]

In [None]:
from sklearn.metrics import classification_report

# Token-level (per B-/I- tag) report on the flattened test predictions.
# It is computed before the file is opened so a failure here cannot leave an
# empty, truncated report file behind.
class_report = classification_report(trues_stretched, preds_stretched)

# The with-block closes the file even if the write raises.
with open(model_dir + 'mitmovie_pt_roberta_lg/model'  + '/class_report_test.txt', 'w') as f:
    print(class_report, file=f)

print(class_report)

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

          B-ACTOR       0.93      0.94      0.93       812
      B-CHARACTER       0.70      0.77      0.73        90
       B-DIRECTOR       0.92      0.88      0.90       456
          B-GENRE       0.94      0.97      0.95      1117
           B-PLOT       0.80      0.77      0.78       491
         B-RATING       0.98      0.97      0.97       500
B-RATINGS_AVERAGE       0.94      0.92      0.93       451
         B-REVIEW       0.36      0.23      0.28        56
           B-SONG       0.73      0.74      0.73        54
          B-TITLE       0.90      0.90      0.90       562
        B-TRAILER       0.82      0.90      0.86        30
           B-YEAR       0.96      0.95      0.96       720
          I-ACTOR       0.93      0.93      0.93       862
      I-CHARACTER       0.67      0.60      0.63        75
       I-DIRECTOR       0.91      0.88      0.89       496
          I-GENRE       0.89      0.74      0.81       

In [None]:
!pip install seqeval



In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# seqeval report on the per-document label sequences (the rows of the report
# are entity types rather than individual B-/I- tags).
# It is computed before the file is opened so a failure here cannot leave an
# empty, truncated report file behind.
seq_class_report = classification_report_seqeval(dataset['test_labels'], preds_list)

# The with-block closes the file even if the write raises.
with open(model_dir + 'mitmovie_pt_roberta_lg/model'  + '/seq_class_report_test.txt', 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

                 precision    recall  f1-score   support

          ACTOR       0.91      0.93      0.92       812
      CHARACTER       0.63      0.71      0.67        90
       DIRECTOR       0.91      0.88      0.89       456
          GENRE       0.91      0.94      0.93      1117
           PLOT       0.73      0.75      0.74       491
         RATING       0.94      0.93      0.93       500
RATINGS_AVERAGE       0.89      0.89      0.89       451
         REVIEW       0.30      0.20      0.24        56
           SONG       0.49      0.57      0.53        54
          TITLE       0.87      0.88      0.88       562
        TRAILER       0.82      0.90      0.86        30
           YEAR       0.96      0.95      0.95       720

      micro avg       0.88      0.89      0.89      5339
      macro avg       0.78      0.79      0.79      5339
   weighted avg       0.88      0.89      0.89      5339



## XLNet Model

In [None]:
model_version = 'xlnet-base-cased'

In [None]:
!pip install transformers



In [None]:
import torch

In [None]:
# use the GPU when one is attached to the runtime, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when no CUDA device exists, so only query it when
# at least one GPU is present
torch.cuda.get_device_name(0) if n_gpu > 0 else 'cpu'

'Tesla P100-PCIE-16GB'

In [None]:
torch.cuda.empty_cache()

### Encode Labels

In [None]:
# get the set of unique labels in the movie dataset
uniq_labels = set([label for doc in dataset['train_labels'] for label in doc])

In [None]:
# assign a number to each label.
# Sort first: the iteration order of a set of strings differs between Python
# sessions (hash randomization), which would silently change the label ids a
# previously saved model was trained with.
label_encoding = {label: idx for idx, label in enumerate(sorted(uniq_labels))}
label_encoding

{'B-ACTOR': 4,
 'B-CHARACTER': 16,
 'B-DIRECTOR': 0,
 'B-GENRE': 24,
 'B-PLOT': 1,
 'B-RATING': 15,
 'B-RATINGS_AVERAGE': 5,
 'B-REVIEW': 2,
 'B-SONG': 7,
 'B-TITLE': 6,
 'B-TRAILER': 14,
 'B-YEAR': 21,
 'I-ACTOR': 22,
 'I-CHARACTER': 23,
 'I-DIRECTOR': 20,
 'I-GENRE': 13,
 'I-PLOT': 17,
 'I-RATING': 18,
 'I-RATINGS_AVERAGE': 3,
 'I-REVIEW': 8,
 'I-SONG': 11,
 'I-TITLE': 10,
 'I-TRAILER': 19,
 'I-YEAR': 12,
 'O': 9}

### Encode Texts

To encode the texts, we have to use the same tokenizer that XLNet was pretrained with.

In [None]:
from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained(model_version, do_lower_case=True)

In [None]:
print('There are %s words in this tokenizer object' % tokenizer.vocab_size)

There are 32000 words in this tokenizer object


In [None]:
# join the tokenize before passing to tokenizer object

print ("Tokenize the first sentence:")
toked = tokenizer.encode_plus(' '.join(dataset['train_tokens'][0]))
print(toked)
print(tokenizer.convert_ids_to_tokens(toked['input_ids']))

Tokenize the first sentence:
{'input_ids': [113, 3547, 1795, 17, 10997, 1138, 53, 590, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁what', '▁movies', '▁star', '▁', 'bru', 'ce', '▁will', 'is', '<sep>', '<cls>']


In [None]:
#tokenizer.get_vocab()['oph']
tokenizer.pad_token_id

5

In [None]:
#from keras.preprocessing.sequence import pad_sequences
# https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85

def make_xlnet_encodings(text, tokenizer, MAX_LEN):
  """Tokenize pre-split documents into fixed-length model inputs.

  Args:
    text: iterable of documents, each a list of word strings. The words of a
      document are joined with single spaces before tokenization.
    tokenizer: object exposing ``encode_plus`` (an ``XLNetTokenizer`` in this
      notebook).
    MAX_LEN: length every sequence is padded or truncated to.

  Returns:
    dict with keys ``'input_ids'`` and ``'attention_mask'``; each value is a
    list containing one list per document.
  """
  input_ids, attention_mask = [], []
  for doc in text:
    encoding = tokenizer.encode_plus(
            ' '.join(doc),
            add_special_tokens=True,
            max_length=MAX_LEN,       # previously hard-coded to 65, ignoring the argument
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',     # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True
            )

    input_ids.append(encoding['input_ids'])
    attention_mask.append(encoding['attention_mask'])

  return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [None]:
# 65 is the sequence length the recorded outputs below were produced with
MAX_LEN = 65
train_encodings = make_xlnet_encodings(dataset['train_tokens'], tokenizer, MAX_LEN)
test_encodings = make_xlnet_encodings(dataset['test_tokens'], tokenizer, MAX_LEN)



In [None]:
train_encodings['input_ids'][0]

[5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 113,
 3547,
 1795,
 17,
 10997,
 1138,
 53,
 590,
 4,
 3]

In [None]:
# variation of https://mccormickml.com/2019/09/19/XLNet-fine-tuning/
# from keras.preprocessing.sequence import pad_sequences

# def get_ids_masks(text, tokenizer, MAX_LEN):
#   ''' this function will take a list-of-lists and a tokenizer
#   and return input_ids and attention_masks
#   '''

#   # tokenize according to the tokenizer
#   tokenized_text = [tokenizer.tokenize(' '.join(doc) ) for doc in text]

#   # get ids
#   input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

#   # pad input tokens
#   input_ids = pad_sequences(input_ids, 
#                             maxlen=MAX_LEN, 
#                             dtype="long", 
#                             truncating="post", 
#                             padding="post", 
#                             value = tokenizer.pad_token_id #xlnet padding id is 5
#                             )

#   # Create attention masks
#   attention_masks = []
#   # Create a mask of 1s for each token followed by 0s for padding
#   for seq in input_ids:
#     seq_mask = [float(i != 5) for i in seq]
#     attention_masks.append(seq_mask)

#   return input_ids, attention_masks

In [None]:
# train_ids, train_masks = get_ids_masks(dataset['train_tokens'], tokenizer, 65)
# test_ids, test_masks = get_ids_masks(dataset['test_tokens'], tokenizer, 65)

In [None]:
# second document in the training set (index 1), as the original word list
print(dataset['train_tokens'][1])

# the same document after tokenization, shown as sub-word pieces
print(tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][1]))

# ... and as the vocabulary ids stored in the encodings
print(list(train_encodings['input_ids'][1]))

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']
['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '▁show', '▁me', '▁films', '▁with', '▁drew', '▁bar', 'ry', 'more', '▁from', '▁the', '▁1980', 's', '<sep>', '<cls>']
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 351, 110, 2701, 33, 3767, 1808, 844, 3067, 40, 18, 1910, 23, 4, 3]


In [None]:
# Sanity check for label alignment: for every training document record the pair
# (number of gold labels, number of tokens containing the '▁' marker).
collect = []
for labels,ids in zip(dataset['train_labels'], train_encodings['input_ids']):
  tokens = tokenizer.convert_ids_to_tokens(ids)
  # in the tokenizer output printed above, '▁' prefixes the first piece of each
  # whitespace-separated word, so this counts words rather than sub-word pieces
  collect.append((len(labels), len([token for token in tokens if '▁' in token])))

len(collect)

9775

In [None]:
[i for (i, col) in enumerate(collect) if col[0] != col[1]]

[]

### Adjust Labels for Vocab Offset




Following https://datascience.stackexchange.com/questions/69640/what-should-be-the-labels-for-subword-tokens-in-bert-for-ner-task, we label only the first sub-word token of each word and do not propagate the label to the remaining sub-word pieces. Propagating it would introduce more instances of that class and change the support counts, making the models difficult to compare.


In [None]:
import numpy as np

def adjust_labels_for_offset(original_labels, label_dictionary, input_ids):
    """Spread word-level labels over padded sub-word id sequences.

    Only tokens containing the word-start marker '▁' receive a label; every
    other position (padding, ``<sep>``/``<cls>``, continuation pieces) is set
    to -100.

    NOTE: ids are mapped back to tokens with the notebook-level ``tokenizer``
    variable, which must be defined before this function is called.

    Args:
        original_labels: list of documents, each a list of label strings.
        label_dictionary: mapping from label string to integer id.
        input_ids: list of token-id sequences, parallel to ``original_labels``.

    Returns:
        List with one list of ints per document, each as long as the matching
        ``input_ids`` entry.

    Raises:
        KeyError: if a label is missing from ``label_dictionary``.
        ValueError: if a document's label count differs from its number of
            word-start tokens (for example because the text was truncated).
    """
    encoded_labels = []
    for doc_idx, (doc, doc_ids) in enumerate(zip(original_labels, input_ids)):
        # convert to the numeric encoding of the label
        doc_labels = [label_dictionary[label] for label in doc]

        arr_tokens = tokenizer.convert_ids_to_tokens(doc_ids)
        labeled_tokens = [i for (i, token) in enumerate(arr_tokens) if '▁' in token]

        # Check explicitly: without this, NumPy silently broadcasts a single
        # label over several positions and gives a cryptic error otherwise.
        if len(labeled_tokens) != len(doc_labels):
            raise ValueError(
                'document %d has %d labels but %d word-start tokens'
                % (doc_idx, len(doc_labels), len(labeled_tokens))
            )

        # start from all -100, then fill in the "primary" token positions
        doc_enc_labels = np.full(len(doc_ids), -100, dtype=int)
        doc_enc_labels[np.asarray(labeled_tokens, dtype=int)] = np.asarray(doc_labels, dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
train_labels = adjust_labels_for_offset(dataset['train_labels'], 
                                        label_encoding, 
                                        train_encodings['input_ids'])

test_labels = adjust_labels_for_offset(dataset['test_labels'],
                                      label_encoding, 
                                      test_encodings['input_ids'])

In [None]:
# invert the encoding so numeric ids can be turned back into label strings;
# -100 (the value given to unlabeled positions) is displayed as 'X'
id_to_label = {idx: name for name, idx in label_encoding.items()}
id_to_label[-100] = 'X'

# first training example as (token, label, label id, attention mask) tuples
list(zip(
    tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0]),
    (id_to_label[idx] for idx in train_labels[0]),
    train_labels[0],
    train_encodings['attention_mask'][0],
))

In [None]:
np.unique(train_labels)

array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
         10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
         21,   22,   23,   24])

### Prepare Pytorch Datasets

https://huggingface.co/transformers/custom_datasets.html#ft-trainer


In [None]:
import torch

# pytorch is expecting a certain type of dataset 

class pt_dataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-token label ids.

    ``encodings`` is a dict mapping a field name to a list with one entry per
    example; ``labels`` is a list with one label sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # the number of examples is defined by the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # one tensor per encoding field, plus the labels under 'labels'
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# remove the offset_mapping
#train_encodings.pop("offset_mapping") 
#test_encodings.pop("offset_mapping")

train_dataset = pt_dataset(train_encodings, train_labels)
test_dataset = pt_dataset(test_encodings, test_labels)

In [None]:
len(test_dataset.encodings['input_ids']), len(test_dataset.labels)

(2443, 2443)

### Train Model

In [None]:
from transformers import XLNetForTokenClassification

# load the pretrained model from huggingface, with one output class per label
#model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(uniq_labels))
# NOTE(review): mem_len=1024 turns on XLNet's cached hidden states ("mems").
# These may be returned alongside the logits and could be what makes
# trainer.evaluate() / trainer.predict() fail with the RuntimeError recorded
# below — TODO confirm, and drop mem_len if so.
model = XLNetForTokenClassification.from_pretrained(model_version, num_labels=len(uniq_labels), mem_len=1024)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForTokenClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
folder_name = 'mitmovie_' + model_version.replace('-', '_')
folder_name

'mitmovie_xlnet_base_cased'

In [None]:
from transformers import Trainer, TrainingArguments

# boiler plate code from huggingface to launch a trainer instance
# sets directories and baseline configuration for batch sizes and weight decay
trainer = None

training_args = TrainingArguments(
    output_dir = model_dir +   folder_name + '/results',          # output directory
    overwrite_output_dir = True,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir = model_dir +  folder_name + '/logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset             # evaluation dataset
)

In [None]:
from datetime import datetime

start = datetime.now()

trainer.train()

print('Time to train:', datetime.now() - start)

Step,Training Loss
10,6.925253
20,4.946571
30,2.682056
40,2.303178
50,2.120535
60,1.936699
70,1.825957
80,1.793875
90,1.766829
100,1.650247


Time to train: 0:05:10.212920


In [None]:
trainer.evaluate()

RuntimeError: ignored

In [None]:
trainer.save_model(model_dir + folder_name + '/model')

### Evaluate Model

In [None]:
from transformers import XLNetForTokenClassification

# retreive the saved model 
model = XLNetForTokenClassification.from_pretrained(model_dir + folder_name + '/model', num_labels=len(uniq_labels))

In [None]:
# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
# NOTE: `trainer` was constructed before `model` was re-bound to the reloaded
# checkpoint, so predict() runs on the model object that was passed to
# Trainer(...), not on the freshly loaded one.
output, label_ids, metrics = trainer.predict(test_dataset)

RuntimeError: ignored

In [None]:
metrics

In [None]:
# convert output which is logits to index of max logit
preds = np.argmax(output, axis=2)
preds.shape

In [None]:
# https://medium.com/analytics-vidhya/named-entity-recognition-for-turkish-with-bert-f8ec04a31b0
# this cell formats the predictions by dropping every position whose gold
# label id is -100, so that we can line them up directly with the original data

batch_size, seq_len = preds.shape

# one list of predicted label strings per document
preds_list = [[] for _ in range(batch_size)]
for i in range(batch_size):
  for j in range(seq_len):
    # keep only positions that carry a real label (label id != -100)
    if label_ids[i, j] != -100: # torch.nn.CrossEntropyLoss().ignore_index:
      preds_list[i].append(id_to_label[preds[i][j]])

In [None]:
# you can see the number of predicted values and the actual values for a
# given doc is the same, implying that the predictions line up
# to the actuals because we set labels to -100 for <pad>, <sep>, <cls> and
# sub-word pieces that do not start a word (no '▁' marker)
len(preds_list[6]), len(dataset['test_tokens'][6])

In [None]:
preds_stretched = [label for doc in preds_list for label in doc]
trues_stretched = [label for doc in dataset['test_labels'] for label in doc]

In [None]:
from sklearn.metrics import classification_report

# token-level report over the flattened gold and predicted label lists;
# computed before the file is opened so a failure leaves no empty file behind
class_report = classification_report(trues_stretched, preds_stretched)

# `with` closes the report file even if the write raises
with open(model_dir + folder_name + '/model' + '/class_report_test.txt', 'w') as f:
    print(class_report, file=f)

print(class_report)

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# seqeval report on the per-document label sequences;
# computed before the file is opened so a failure leaves no empty file behind
seq_class_report = classification_report_seqeval(dataset['test_labels'], preds_list)

# `with` closes the report file even if the write raises
with open(model_dir + folder_name + '/model' + '/seq_class_report_test.txt', 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

## Unused code

---



---



Don't use the TF versions below.


In [None]:
import tensorflow as tf

# remove the offset_mapping if present; encodings built by make_xlnet_encodings
# only hold 'input_ids' and 'attention_mask', so a bare pop() raises KeyError
train_encodings.pop("offset_mapping", None)
test_encodings.pop("offset_mapping", None)


# pair each example's encoding fields with its label sequence
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))


test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

KeyError: ignored

In [None]:
from transformers import TFDistilBertForTokenClassification, TFTrainer, TFTrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# retrieve the distilbert model
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased', 
                                                           num_labels=len(uniq_labels))

In [None]:
model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss) #sparsecategorical crossentropy is default
# the dataset is already batched with .batch(16); Keras does not accept a
# batch_size argument together with a tf.data.Dataset input
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=3)

In [None]:
model.save_pretrained(model_dir + "mitmovie_distilbert")

In [None]:
#model.from_pretrained(model_dir + "mitmovie_distilbert")

In [None]:
logits = model.predict(test_dataset)[0]

In [None]:
logits.shape

In [None]:
tf.nn.softmax(logits[0], axis= -1)

In [None]:
preds = []
for row in range(logits.shape[0]):
  pred = np.argmax(tf.nn.softmax(logits[row], axis = -1))
  preds.append(pred)

In [None]:
pred_decode = [id_to_label[pred] for pred in preds]

In [None]:
trues = [label for doc in test_labels for label in doc]
len(trues)

In [None]:
for true, pred in zip(trues[0:100], preds[0:100]):
  if true != -100:
    print(id_to_label[true], '  ', id_to_label[pred])

In [None]:
i = 35
p = model.predict(np.array(X_te[i:i+batch_size]))[0]
p = np.argmax(p, axis=-1)
print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))

In [None]:
id_to_label[20]

In [None]:
test_true = [label for doc in test_labels for label in doc]

In [None]:
[id_to_label[val] for val in test_true[0:10]]

In [None]:
dataset['test_labels'][0]

In [None]:
pred_vec = tf.nn.softmax(logits, axis=1).numpy()

In [None]:
pred_vec

In [None]:
# number of encoded test documents (the original line had unbalanced brackets)
len(test_encodings['input_ids'])

In [None]:
len( pred_vec[0] )

In [None]:
preds = np.argmax(pred_vec, axis=2)[0]

In [None]:
preds = np.argmax(output, axis=2)
batch_size, seq_len = preds.shape

# list of token-level predictions shape = (batch_size, seq_len)
preds_list = [[] for _ in range(batch_size)]
for i in range(batch_size):
  for j in range(seq_len):
    # ignore pad_tokens
    if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
      preds_list[i].append(label_map[preds[i][j]])