In [None]:
!pip install transformers
!pip install datasets

In [26]:
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import AdamW, get_cosine_schedule_with_warmup
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from functools import reduce
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
dataset = load_dataset('conll2003') # https://huggingface.co/datasets/conll2003



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [4]:
def add_encodings(example):
    encodings = tokenizer(example['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    labels = example['ner_tags'] + [0] * (tokenizer.model_max_length - len(example['ner_tags']))
    return { **encodings, 'labels': labels }

num_labels = dataset['train'].features['ner_tags'].feature.num_classes
dataset = dataset.map(add_encodings)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [5]:
labels = dataset['train'].features['ner_tags'].feature
label2id = { k: labels.str2int(k) for k in labels.names }
id2label = { v: k for k, v in label2id.items() }

In [6]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [7]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

# RoBERTa

In [8]:
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [9]:
len(dataset['train']), len(dataset['validation']), len(dataset['test'])

(14041, 3250, 3453)

In [11]:
train_data = dataset['train'].select(range(1000))
val_data = dataset['validation'].select(range(200))
test_data = dataset['test'].select(range(500))

In [12]:
len(train_data), len(val_data), len(test_data)

(1000, 200, 500)

In [13]:
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=4, shuffle=True)

In [14]:
total_epochs = 5

optimizer = AdamW(model.parameters(), lr=1e-5)

num_train_steps = len(train_dataloader) * total_epochs
num_warmup_steps = int(num_train_steps * 0.1) 

lr_scheduler = get_cosine_schedule_with_warmup(optimizer,
                                               num_warmup_steps=num_warmup_steps,
                                               num_training_steps = num_train_steps)



In [15]:
def train(dataloader, optimizer, scheduler, device):
    global model
    model.train()
    model.to(device)
    
    prediction_labels = []
    true_labels = []
    
    total_loss = []
    
    for batch in tqdm(dataloader):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss, logits = outputs[:2]
        logits = logits.detach().cpu().numpy()
        total_loss.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # prevent exploding gradient

        optimizer.step()
        scheduler.step()
        
        prediction_labels += logits.argmax(axis=-1).flatten().tolist()
    
    return true_labels, prediction_labels, total_loss

def validation(dataloader, device):
    global model
    model.eval()
    model.to(device)
    
    prediction_labels = []
    true_labels = []
    
    total_loss = []
    
    for batch in tqdm(dataloader):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            total_loss.append(loss.item())

            prediction_labels += logits.argmax(axis=-1).flatten().tolist()
        
    return true_labels, prediction_labels, total_loss

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

for epoch in range(total_epochs):
    y, y_pred, train_loss = train(train_dataloader, optimizer, lr_scheduler, device)
    train_acc = accuracy_score(y, y_pred)
    
    y, y_pred, val_loss = validation(val_dataloader, device)
    val_acc = accuracy_score(y, y_pred)
    
    all_loss['train_loss'] += train_loss
    all_loss['val_loss'] += val_loss
    
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)
    
    print(f'Epoch: {epoch}, train_loss: {torch.tensor(train_loss).mean():.3f}, train_acc: {train_acc:.3f}, val_loss: {torch.tensor(val_loss).mean():.3f}, val_acc: {val_acc:.3f}') 

100%|██████████| 250/250 [01:42<00:00,  2.44it/s]
100%|██████████| 50/50 [00:06<00:00,  7.99it/s]


Epoch: 0, train_loss: 0.428, train_acc: 0.901, val_loss: 0.024, val_acc: 0.994


100%|██████████| 250/250 [01:44<00:00,  2.39it/s]
100%|██████████| 50/50 [00:06<00:00,  7.99it/s]


Epoch: 1, train_loss: 0.021, train_acc: 0.994, val_loss: 0.021, val_acc: 0.994


100%|██████████| 250/250 [01:44<00:00,  2.39it/s]
100%|██████████| 50/50 [00:06<00:00,  8.03it/s]


Epoch: 2, train_loss: 0.018, train_acc: 0.995, val_loss: 0.019, val_acc: 0.994


100%|██████████| 250/250 [01:45<00:00,  2.37it/s]
100%|██████████| 50/50 [00:06<00:00,  8.05it/s]


Epoch: 3, train_loss: 0.017, train_acc: 0.995, val_loss: 0.019, val_acc: 0.994


100%|██████████| 250/250 [01:44<00:00,  2.38it/s]
100%|██████████| 50/50 [00:06<00:00,  8.05it/s]

Epoch: 4, train_loss: 0.016, train_acc: 0.995, val_loss: 0.019, val_acc: 0.994





In [40]:
_, y_pred, _ = validation(test_dataloader, device)
gt = []
for _each in test_data:
  x = list(_each['labels'].numpy())
  gt.append(x)
out = reduce(lambda x, y: x + y, gt)
print("Accuracy: {}".format(accuracy_score(out, y_pred)))

100%|██████████| 125/125 [00:16<00:00,  7.67it/s]


Accuracy: 0.9928828125


In [56]:
model.eval()
input_tokens = [["France", "London"]]
inputs = tokenizer(input_tokens, truncation=True, padding='max_length', is_split_into_words=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = {k: torch.tensor(v).type(torch.long).to(device) for k, v in inputs.items()}
prediction_labels = []
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    prediction_labels += logits.argmax(axis=-1).flatten().tolist()
print(prediction_labels)

[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 