# Tag Analysis

## 0. Set up

In [None]:
!pip install transformers

In [None]:
import re
import os

import numpy as np
import pandas as pd

import torch
from tqdm import tqdm

from transformers import RobertaTokenizer, RobertaConfig, RobertaForMaskedLM, AdamW
from tokenizers import ByteLevelBPETokenizer

## 1. Read data

In [None]:
colnames = pd.read_csv("tags.txt")
colnames = [cn for cn in colnames if cn != 'rec_epoch' and cn != '']
colnames = colnames[2:]
colnames

def split_words(sentence):
    pattern = '([A-Z]{0,1}[a-z]+|[0-9]+|[A-Z]+)'
    return list(map(lambda x: x.lower(), re.findall(pattern, sentence)))

col_parts = [split_words(cn) for cn in colnames]
max_len = max(map(len, col_parts)) * 3
vocab_size = len(np.unique([c for sub in col_parts for c in sub]))

col_sentences = [' '.join(part) for part in col_parts]
col_sentences

In [18]:
f = open("tag_names.txt", "w")
for col in col_sentences:
    f.write(col + "\n")
f.close()

## 2. Train the model

Byte level tokenizer for tags

In [19]:
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['tag_names.txt'],
                vocab_size=vocab_size,
                min_frequency=1,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

path = './tagbert'
if not os.path.exists(path):
    os.mkdir(path)
tokenizer.save_model('tagbert')






['tagbert/vocab.json', 'tagbert/merges.txt']

Prepare text function

In [20]:
def prepare_text(original_text, tokenizer, mask_per=0):
    tokenized = tokenizer(original_text, max_length=max_len+2, padding='max_length', truncation=True)
    input_ids = np.array(tokenized['input_ids'])
    labels = np.array(tokenized['input_ids'])
    attn_mask = np.array(tokenized['attention_mask'])

    if mask_per > 0:
        rand_mask = np.random.rand(len(input_ids))
        mask_bool = (rand_mask < mask_per) * (input_ids > tokenizer.mask_token_id)
        
        input_ids[mask_bool] = tokenizer.mask_token_id
    
    return input_ids, labels, attn_mask

Roberta tokenizer and Torch inputs, labels, and masks

In [21]:
tokenizer = RobertaTokenizer.from_pretrained('tagbert', max_len=max_len)

input_ids = []
labels = []
masks = []

for sentence in col_sentences:
    inp, lab, msk = prepare_text(sentence, tokenizer, 0.15)

    input_ids.append(inp)
    labels.append(lab)
    masks.append(msk)

input_ids = torch.tensor(input_ids)
labels = torch.tensor(labels)
masks = torch.tensor(masks)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
  input_ids = torch.tensor(input_ids)


Torch data loader

In [22]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

encodings = {'input_ids': input_ids, 'attention_mask': masks, 'labels': labels}
dataset = Dataset(encodings)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

Model setup

In [23]:
config = RobertaConfig(
    vocab_size=vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=100,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

model = RobertaForMaskedLM(config)

Pytorch model

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.save_pretrained('./tagbert')

Epoch 0: 100%|████████████████████████████████████████████████████████████| 185/185 [00:12<00:00, 14.27it/s, loss=0.148]
Epoch 1: 100%|███████████████████████████████████████████████████████████| 185/185 [00:07<00:00, 25.44it/s, loss=0.0671]
