In [None]:
!pip install datasets
!pip3 install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [21]:
# Select the GPU for this kernel. These variables are set before any CUDA-using
# library is imported in the cells below.
# Order devices by PCI bus id (presumably so indices match `nvidia-smi` -- confirm).
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# Expose only the device with index 1 to this process.
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [22]:
# Label vocabulary. The integer id of each tag is its position in this tuple,
# so the order below is significant and must not be rearranged.
tag2id = {
    label: index
    for index, label in enumerate((
        'unk',         # 0
        'O',           # 1
        'B-OBJECT',    # 2
        'I-OBJECT',    # 3
        'B-PEOPLE',    # 4
        'I-PEOPLE',    # 5
        'B-LOCATION',  # 6
        'B-BRAND',     # 7
        'I-BRAND',     # 8
        'I-LOCATION',  # 9
        'START',       # 10
        'STOP',        # 11
    ))
}
# Reverse lookup: integer id -> tag string.
id2tag = dict((index, label) for label, index in tag2id.items())
# Number of distinct labels (including 'unk', 'START' and 'STOP').
num_labels = len(tag2id)

In [23]:
from pathlib import Path
import re

def read_data(file_path):
    """Parse a file of ``token tag`` lines into parallel per-document lists.

    Documents are separated by an empty line (which may contain a single tab).
    Inside a document, every line that does not split into exactly two
    single-space-separated fields is ignored. Documents that end up with no
    usable line are reported with ``FAIL 1`` and skipped.

    Args:
        file_path (str or Path): Path of the UTF-8 encoded text file.

    Returns:
        tuple: ``(id_docs, token_docs, tag_docs, ner_docs, maxlen)`` where
            - id_docs: index of each kept document within the raw file
              (skipped documents leave gaps),
            - token_docs: list of token lists,
            - tag_docs: list of tag-string lists,
            - ner_docs: list of tag-id lists looked up in ``tag2id``; a tag
              missing from ``tag2id`` is mapped to the ``'unk'`` id instead of
              raising ``KeyError``,
            - maxlen: length of the longest kept document, in tokens.
    """
    raw_text = Path(file_path).read_text(encoding='utf-8').strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)

    # Fallback id for tags that are not part of the label vocabulary.
    unk_id = tag2id['unk']

    maxlen = 0
    id_docs, token_docs, tag_docs, ner_docs = [], [], [], []
    for i, doc in enumerate(raw_docs):
        tokens, tags, ners = [], [], []
        for line in doc.split('\n'):
            line_split = line.split(' ')
            if len(line_split) != 2:
                # Not a "token tag" pair (blank, malformed, extra spaces).
                continue
            token, tag = line_split
            tokens.append(token)
            tags.append(tag)
            ners.append(tag2id.get(tag, unk_id))
        if not tokens:
            print("FAIL 1")
            continue
        # tokens, tags and ners are always appended together, so their lengths
        # cannot diverge; no separate consistency check is needed.
        maxlen = max(maxlen, len(tokens))
        token_docs.append(tokens)
        tag_docs.append(tags)
        ner_docs.append(ners)
        id_docs.append(i)

    return id_docs, token_docs, tag_docs, ner_docs, maxlen

In [24]:
# Load the three splits through one code path instead of three copy-pasted
# stanzas. The load order (train, test, dev) and the printed values are the
# same as before.
split_data = {}
for split_name, split_path in (
    ('train', 'train.txt'),
    ('test', 'test.txt'),
    ('validation', 'dev.txt'),
):
    # ids/texts/tags/ners/mlen remain module-level names and, after the loop,
    # hold the values of the last split read (dev.txt), exactly as previously.
    ids, texts, tags, ners, mlen = read_data(split_path)
    split_data[split_name] = {
        'id' : ids,
        'tokens' : texts,
        'ner_tags' : ners,
        'tags' : tags,
    }
    # Longest document (in tokens) of this split.
    print(mlen)

train_data = split_data['train']
test_data = split_data['test']
val_data = split_data['validation']

FAIL 1
300
300
300


In [25]:
from datasets import Dataset, DatasetDict
# Wrap each split's column dictionary in a `datasets.Dataset`
# (evaluated in the order train, test, validation).
train_dataset, test_dataset, val_dataset = (
    Dataset.from_dict(columns)
    for columns in (train_data, test_data, val_data)
)

In [26]:
# Bundle the three splits under their split names.
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset,
})

In [27]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'tags'],
        num_rows: 184226
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'tags'],
        num_rows: 23052
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'tags'],
        num_rows: 23053
    })
})

In [28]:
# huggingface's transformers library
from transformers import RobertaForTokenClassification, RobertaTokenizer
# Name of the pretrained checkpoint whose vocabulary is used for tokenization.
roberta_version = 'roberta-base'
# Module-level tokenizer; encode_labels() and add_encodings() below read this
# global directly rather than taking it as an argument.
tokenizer = RobertaTokenizer.from_pretrained(roberta_version)

In [29]:
def encode_labels(input_ids, tags, max_length=300):
    """Align word-level tag ids with the sub-token ids of one example.

    Every special token (as reported by the module-level ``tokenizer``) and
    every sub-token that continues a word receives ``-100`` (presumably the
    ignore index of the training loss -- confirm against the training code).
    A sub-token whose decoded text starts with a space is treated as the first
    piece of a new word and receives the next entry of ``tags``.

    Args:
        input_ids (list[int]): Sub-token ids of one tokenized example.
        tags (list[int]): One tag id per word, in word order.
        max_length (int): Minimum length of the result; shorter label lists
            are right-padded with ``-100``.

    Returns:
        list[int]: One label per entry of ``input_ids``, padded to
        ``max_length``.

    Raises:
        IndexError: If more word-initial sub-tokens are found than there are
            entries in ``tags``.
    """
    # Looked up once per call so special tokens (notably the many padding
    # tokens) are rejected without decoding each of them individually.
    special_ids = set(tokenizer.all_special_ids)

    labels = []
    word_index = 0
    for input_id in input_ids:
        if input_id in special_ids:
            labels.append(-100)
            continue
        piece = tokenizer.decode([input_id], clean_up_tokenization_spaces=False)
        # startswith() is also safe if a token decodes to an empty string.
        if piece.startswith(' '):
            labels.append(tags[word_index])
            word_index += 1
        else:
            labels.append(-100)
    # A non-positive difference yields an empty list, i.e. no padding.
    labels.extend([-100] * (max_length - len(labels)))
    return labels

In [30]:
def add_encodings(example, max_length=300):
    """Tokenize one dataset example and attach sub-token-aligned labels.

    Args:
        example (dict): The dataset example; its ``'tokens'`` (list of words)
            and ``'ner_tags'`` (one tag id per word) entries are read.
        max_length (int): Length that the tokenizer truncates/pads to and that
            the label list is padded to. Defaults to 300.

    Returns:
        dict: Every field produced by the tokenizer (``input_ids`` is the one
        used here) plus:
            - labels: One label per sub-token, as built by ``encode_labels``.
        The example's ``ner_tags`` are not modified.
    """
    # The tokens are already split into words, hence is_split_into_words=True.
    encodings = tokenizer(
        example['tokens'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        is_split_into_words=True,
    )
    # Use the same max_length for the labels so both sequences line up.
    labels = encode_labels(encodings['input_ids'], example['ner_tags'], max_length=max_length)

    return { **encodings, 'labels': labels }

In [31]:
# Apply add_encodings to every example of every split. NOTE: this rebinds
# `dataset` to the mapped result, so re-running the cell maps the
# already-encoded dataset again rather than starting from the raw splits.
dataset = dataset.map(add_encodings)

  0%|          | 0/184226 [00:00<?, ?ex/s]

  0%|          | 0/23052 [00:00<?, ?ex/s]

  0%|          | 0/23053 [00:00<?, ?ex/s]

In [32]:
import pickle
# Persist the tokenized DatasetDict so the mapping step need not be repeated.
# NOTE(review): pickle files execute code on load -- only unpickle this file
# from a trusted location.
with open('roberta_tokenized_dataset.pkl', 'wb') as f:
    pickle.dump(dataset,f)

In [33]:
dataset['train'][0]

{'id': 0,
 'tokens': ['Have',
  'you',
  'seen',
  'it',
  'Pitbull',
  'In',
  'October',
  'We',
  'Wear',
  'Pink',
  'shirt',
  '.',
  'This',
  'tshirts',
  'with',
  'cats',
  'collection',
  'is',
  'truly',
  'worth',
  'barking',
  'about',
  '.',
  'Black',
  'kittens',
  ',',
  'Main',
  'Coon',
  ',',
  'Bengali..you',
  'name',
  'it',
  '.',
  'The',
  'point',
  'is',
  'that',
  'cats',
  'are',
  'forever',
  '.'],
 'ner_tags': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  3,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-OBJECT',
  'I-OBJECT',
  'O',
  'O',
  'B-OBJECT',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'input_ids': [0,
  6319,
  4