The network used here is BERT, a transformer model, fine-tuned for token classification (named-entity recognition)
- The model is a type of transformer neural network
- PyTorch is used to easily build and train the neural network

In [1]:
# Install Hugging Face transformers and seqeval (seqeval is used later for the classification report).
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c09978f27db869a8736bec9940a1299c7566aeb15e6745671674dd1f2ab2929c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [3]:
# Select the compute device: prefer the GPU when CUDA is available, otherwise fall back to the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


- Process data
- IOB tagging used in NER
- Inside Outside Beginning
- "Barack Obama was born in Hawaï"- [B-PERS, I-PERS, O, O, O, B-GEO]

In [4]:
# Load the token-level NER dataset: one row per word with its sentence marker, POS tag and IOB tag.
# The "Sentence #" cell is only filled on the first word of each sentence.
data = pd.read_csv("ner_datasetreference.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# Non-null entries per column: "Sentence #" is set once per sentence, so its count is the
# number of sentences; the Word/POS/Tag counts are the number of tokens.
data.count()

Sentence #      47959
Word          1048565
POS           1048575
Tag           1048575
dtype: int64

Print the unique tags and their frequency

In [6]:
# Count how often every tag occurs, report how many distinct tags there are, then show the counts.
frequencies = data.Tag.value_counts()
n_unique_tags = len(data.Tag.unique())
print("Number of tags: {}".format(n_unique_tags))
frequencies

Number of tags: 17


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

- [2:5] means PERS in B-PERS
- If already in list of tags increase count
- else add to tags and add current count

In [7]:
# Aggregate the tag frequencies per entity type, merging the B- and I- variants
# (e.g. "B-geo" and "I-geo" both count towards "geo").
tags = {}
for tag, count in frequencies.items():  # (tag name, number of occurrences)
    if tag == "O":
        continue  # "O" marks tokens outside any entity
    entity = tag[2:5]  # strip the "B-"/"I-" prefix, e.g. "B-geo" -> "geo"
    tags[entity] = tags.get(entity, 0) + count

# Show the entity types from most to least frequent; the sort key x[1] is the count
# in each (entity, count) pair.
print(sorted(tags.items(), key=lambda x: x[1], reverse=True))


[('geo', 45058), ('org', 36927), ('per', 34241), ('tim', 26861), ('gpe', 16068), ('art', 699), ('eve', 561), ('nat', 252)]


We don't need the art, eve and nat tags, so the rows carrying them are removed

In [8]:
# Drop every row tagged with one of the unwanted entity types, in both B- and I- form.
entities_to_remove = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]
is_unwanted_tag = data.Tag.isin(entities_to_remove)
data = data[~is_unwanted_tag]
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [9]:
# list with tags and indices
labels_to_ids = {k: v for v, k in enumerate(data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(data.Tag.unique())}
labels_to_ids

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'I-per': 8,
 'I-gpe': 9,
 'I-tim': 10}

- Training for NER: Sentence with tags

In [10]:
# pandas has a very handy "forward fill" function to fill missing values based on the last upper non-nan value
# Forward-fill missing values with the last non-null value above them, so every row
# carries the id of the sentence it belongs to. DataFrame.ffill() replaces
# fillna(method='ffill'), which is deprecated in recent pandas releases.
# NOTE(review): this also fills the few missing "Word" cells with the previous word — confirm that is intended.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


Sentence column: same sentence number concatenate
Word_labels column: same sentence number concatenate

In [11]:
# let's create a new column called "sentence" which groups the words by sentence
# Attach to every row the full text and the full tag sequence of the sentence it belongs to.
# "sentence": the words of the sentence joined with single spaces
data['sentence'] = data.groupby('Sentence #')['Word'].transform(lambda words: ' '.join(words))
# "word_labels": the tags of the sentence joined with commas, in the same order as the words
data['word_labels'] = data.groupby('Sentence #')['Tag'].transform(lambda sentence_tags: ','.join(sentence_tags))
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Thousands,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Sentence: 1,of,IN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
2,Sentence: 1,demonstrators,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
3,Sentence: 1,have,VBP,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
4,Sentence: 1,marched,VBN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."



Drop the other columns



In [12]:
# Keep one row per sentence: only the two sentence-level columns, de-duplicated and re-indexed from 0.
sentence_level_columns = ["sentence", "word_labels"]
data = data[sentence_level_columns].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Families of soldiers killed in the conflict jo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-per,O,O,..."
2,They marched from the Houses of Parliament to ...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,I-geo,O"
3,"Police put the number of marchers at 10,000 wh...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,The protest comes on the eve of the annual con...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,O,O,B-org,I-org,O,..."


In [13]:
len(data) # number of unique (sentence, word_labels) rows left after de-duplication

47571

In [14]:
# verify it with example, sentence and its tags
# Spot-check one example: a sentence and its tag sequence.
# Only the last expression of a cell is displayed, so just word_labels shows up in the output.
data.iloc[41].sentence
data.iloc[41].word_labels

'B-gpe,O,O,B-tim,O,B-per,I-per,O,O,O,B-geo,O,O,O,O,O,O,O,O'

DATASET PREP

Define the key variable used for training and evaluation process

In [15]:
MAX_LEN = 128  # token sequence length used for padding/truncation
TRAIN_BATCH_SIZE = 4  # batch size of the training DataLoader
VALID_BATCH_SIZE = 2  # batch size of the evaluation DataLoader
EPOCHS = 1  # number of passes over the training set
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer
MAX_GRAD_NORM = 10  # max_norm passed to clip_grad_norm_ during training
# tokenizer loaded from the same 'bert-base-uncased' checkpoint as the model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BERT uses WordPiece tokenization, not word-level tokenization
- e.g. the word "Washington" is labeled "B-gpe", but it is split into the tokens "Wash", "##ing", "##ton"

Convert to PyTorch tensors - similar to lists/arrays, but with added capabilities such as GPU acceleration



In [19]:
class dataset(Dataset):
    """Token-classification dataset aligning word-level NER tags with word pieces.

    Each item is one sentence, tokenized to a fixed length, with one label id per
    token: the tag id of the word on the first word piece of every word, and -100
    (the value used to mark positions to ignore) on special tokens, padding and
    the remaining word pieces.

    Args:
        dataframe: frame with a "sentence" column (words separated by whitespace)
            and a "word_labels" column (comma-separated tags, one per word).
        tokenizer: a fast Hugging Face tokenizer; the encoding it returns must
            provide ``word_ids()``.
        max_len: length every sequence is padded/truncated to.
        label_map: optional tag -> id mapping. When omitted, the notebook-level
            ``labels_to_ids`` dictionary is used.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map

    def __getitem__(self, index):
        """Return the tensors for the sentence at position ``index``, plus a "labels" tensor."""
        # step 1: get the words and their tags; .iloc is positional, so this does not
        # depend on the frame having a 0..n-1 index
        words = self.data["sentence"].iloc[index].strip().split()
        word_labels = self.data["word_labels"].iloc[index].split(",")

        # step 2: encode the pre-split sentence (adds [CLS]/[SEP], pads/truncates to max_len)
        encoding = self.tokenizer(
            words,
            add_special_tokens=True,
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )

        # step 3: label only the first word piece of each word
        label_map = self.label_map if self.label_map is not None else labels_to_ids
        label_ids = [label_map[label] for label in word_labels]

        # word_ids() gives, per token, the index of the word it came from (None for
        # special/padding tokens). Indexing the labels by word index keeps the alignment
        # correct even when a word produces no token at all, which a running counter
        # over the tokens cannot guarantee.
        word_ids = encoding.word_ids()
        encoded_labels = np.full(len(word_ids), -100, dtype=int)
        previous_word = None
        for position, word_idx in enumerate(word_ids):
            starts_new_word = word_idx is not None and word_idx != previous_word
            # words without a matching tag (more words than tags) stay at -100
            if starts_new_word and word_idx < len(label_ids):
                encoded_labels[position] = label_ids[word_idx]
            previous_word = word_idx

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of sentences in the dataset."""
        return self.len

Split the dataset 80/20 into train and test sets

In [20]:
# Hold out 20% of the sentences for evaluation; the fixed random_state makes the split repeatable.
train_size = 0.8
sampled_train_rows = data.sample(frac=train_size, random_state=200)
# everything that was not sampled for training becomes the test split
test_dataset = data.drop(sampled_train_rows.index).reset_index(drop=True)
train_dataset = sampled_train_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# wrap both splits so that indexing them yields tokenized, label-aligned tensors
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
training_set = dataset(train_dataset, tokenizer, MAX_LEN)

FULL Dataset: (47571, 2)
TRAIN Dataset: (38057, 2)
TEST Dataset: (9514, 2)


Example for first training

In [21]:
# Inspect the first training item: the dict of tensors returned by dataset.__getitem__.
training_set[0]

{'input_ids': tensor([  101, 23564, 21030,  2099,  4967,  2001,  9388,  1011,  6109,  2005,
          2634,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [22]:
# print token with their corresponding label
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label)) # format: left-aligned, 10 characters atleast

[CLS]       -100
za          3
##hee       -100
##r         -100
khan        8
was         0
mar         0
-           -100
93          -100
for         0
india       1
.           0
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[

Pytorch dataloader
- Efficiently handle the data loading in batching, shuffling, and other tasks
- Define parameters and use it on sets with the parameters defined

In [23]:
# DataLoader settings. Training batches are reshuffled on every pass over the data.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order: shuffling brings no benefit there and would make the
# validation logs and the order of the collected predictions differ from run to run.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
# Wrap the two datasets in loaders using the settings above
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

Model define
- BertForTokenClassification using pre-trained "bert-base-uncased"
- also define the number of labels with the model definition (how many categories a token can be classified into)
- classification head -> convert bert encoded messages to labels

In [24]:
# Pre-trained BERT encoder with a newly initialised token-classification head (one output per tag).
# Passing id2label/label2id stores the tag names in the model configuration, so they are
# written out by save_pretrained() and available again when the model is re-loaded.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels_to_ids),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)
model.to(device) # move the model to the selected device (GPU when available)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

Train the model

- cross entropy: In the context of classification tasks, especially with neural networks, the cross entropy loss measures the difference between two probability distributions:
The predicted probability distribution (output of the model).
The true probability distribution (one-hot encoded labels).

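- Implement a sanity check: the initial loss of an untrained classifier should be close to -ln(1/number of classes)
- With the 11 tags kept above, the expected initial loss is -ln(1/11) ≈ 2.40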

In [25]:
# get an example
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

# give model as labels as bas for classification
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
# calculte loss
initial_loss = outputs[0]
initial_loss

tensor(2.5324, device='cuda:0', grad_fn=<NllLossBackward0>)

In [26]:
# verify that the logits of the neural network
# shape of (batch_size, sequence_length, num_labels)
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 11])

Define optimizer
- it updates weights and biases during the training process
- learning rate: how much the model change each time the weight is updated


In [27]:
# Adam optimizer over all model parameters, using the learning rate from the configuration cell
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Pytorch training function

Forward Propagation:
- Pass input through network to obtain output
- Input Layer (get input) -> Hidden Layer/s (input * layers weight + bias)activation function - > Output Layer (final predictions of data )

Backward Propagation:
- Calculate the gradient of the loss with respect to each weight in the network (gradients are used to update the weights to minimize the loss)
- Calculate loss -> Calculate gradients (partial derivative of the loss with respect to each weight) -> Update weights

Gradient clipping: a technique that prevents gradients from becoming so large that they update the model parameters drastically, resulting in poor performance


In [30]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    """Fine-tune the global ``model`` for one pass over the global ``training_loader``.

    Prints the running mean loss every 100 steps and, at the end, the mean loss and
    the mean per-batch token accuracy. Positions labelled -100 are excluded from the
    accuracy.

    Args:
        epoch: index of the current epoch; accepted for the caller's convenience and
            not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0  # summed loss / summed per-batch accuracy
    nb_tr_steps = 0  # number of batches processed

    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        # extract values from the batch and move them to the model's device
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # forward pass; passing the labels makes the model compute the loss as well
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        # print the running mean loss every 100 steps
        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # training accuracy, computed on detached logits so it does not touch the graph
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.detach().view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

        # only positions with a real label (not -100) count towards the accuracy
        active_positions = flattened_targets != -100  # shape (batch_size * seq_len,)
        active_labels = torch.masked_select(flattened_targets, active_positions)
        active_predictions = torch.masked_select(flattened_predictions, active_positions)
        tr_accuracy += accuracy_score(active_labels.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()  # clear the gradients of the previous step
        loss.backward()  # compute the gradients of this step
        # Clip AFTER backward(): clipping before it would rescale the previous step's
        # gradients (which zero_grad() then discards) and leave the current ones unclipped.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()  # update the parameters with the clipped gradients

    # max(..., 1) avoids a division by zero when the loader yields no batch
    epoch_loss = tr_loss / max(nb_tr_steps, 1)  # average loss for the epoch
    tr_accuracy = tr_accuracy / max(nb_tr_steps, 1)  # average per-batch accuracy for the epoch
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

# # Defining the training function on the 80% of the dataset for tuning the bert model
# def train(epoch):
#     tr_loss, tr_accuracy = 0, 0 # total loss and accuracy of epoch
#     nb_tr_examples, nb_tr_steps = 0, 0 # number of examples and steps processed
#     tr_preds, tr_labels = [], [] # list of predictions and labels
#     # put model in training mode
#     model.train()

#     # iterate over batches of data from the loader
#     for idx, batch in enumerate(training_loader):

#         # extract values from the batch
#         ids = batch['input_ids'].to(device, dtype = torch.long)
#         mask = batch['attention_mask'].to(device, dtype = torch.long)
#         labels = batch['labels'].to(device, dtype = torch.long)

#         # forward pass through model
#         # tr_logits is model's raw predictions
#         loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
#         tr_loss += loss.item()

#         # increment the step count and example count
#         nb_tr_steps += 1
#         nb_tr_examples += labels.size(0)

#         # print loss after each 100 steps
#         if idx % 100==0:
#             loss_step = tr_loss/nb_tr_steps
#             print(f"Training loss per 100 training steps: {loss_step}")

#         # compute training accuracy
#         flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
#         active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
#         flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

#         # only compute accuracy at active labels
#         active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
#         #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

#         labels = torch.masked_select(flattened_targets, active_accuracy)
#         predictions = torch.masked_select(flattened_predictions, active_accuracy)

#         # collect labels and predictions for entire epoch
#         tr_labels.extend(labels)
#         tr_preds.extend(predictions)

#         # calculate accuracy for entire batch and accumulate it
#         tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
#         tr_accuracy += tmp_tr_accuracy

#         # Control the size of gradient to prevent poor performance
#         # gradient clipping
#         torch.nn.utils.clip_grad_norm_(
#             parameters=model.parameters(), max_norm=MAX_GRAD_NORM
#         )


#         # backward pass
#         optimizer.zero_grad() # clear old ggradients
#         loss.backward() # backpropagation to compute gradients
#         optimizer.step() # update model parameters based on gradients

#     epoch_loss = tr_loss / nb_tr_steps # average loss for the epoch
#     tr_accuracy = tr_accuracy / nb_tr_steps # average accuracy for epoch
#     print(f"Training loss epoch: {epoch_loss}")
#     print(f"Training accuracy epoch: {tr_accuracy}")

In [31]:
# train the model!
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.5412466526031494
Training loss per 100 training steps: 0.8577026732192181
Training loss per 100 training steps: 0.6567317520578703
Training loss per 100 training steps: 0.540824792660352
Training loss per 100 training steps: 0.46240623352905164
Training loss per 100 training steps: 0.4125860333219617
Training loss per 100 training steps: 0.375992380640471
Training loss per 100 training steps: 0.3481288150898825
Training loss per 100 training steps: 0.32443754980515005
Training loss per 100 training steps: 0.30958876062957746
Training loss per 100 training steps: 0.29638276910493067
Training loss per 100 training steps: 0.28430894918090155
Training loss per 100 training steps: 0.27380528405376964
Training loss per 100 training steps: 0.2638135518618128
Training loss per 100 training steps: 0.25561071696925874
Training loss per 100 training steps: 0.24817940137156103
Training loss per 100 training steps: 0.24112398963337817
Traini

KeyboardInterrupt: 

MODEL EVALUATION:
Evaluate the performance on the 20% of the data held out as the test set

Logits: raw output from the network before applying any activation function




In [34]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return the gold and predicted tags.

    Prints the running mean loss every 100 steps and, at the end, the mean loss and
    the mean per-batch token accuracy. Positions labelled -100 are excluded.

    Args:
        model: token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels`` and expected to expose ``.loss`` and
            ``.logits`` on its output, plus a ``num_labels`` attribute.
        testing_loader: iterable of batches (dicts with "input_ids",
            "attention_mask" and "labels" tensors).

    Returns:
        (labels, predictions): two equally long flat lists of tag names, covering
        every labelled position of every batch in iteration order.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []  # label ids as plain Python ints

    with torch.no_grad():  # no gradients are needed for evaluation
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # forward pass returns both the loss and the logits
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            eval_logits = outputs.logits

            eval_loss += outputs.loss.item()
            nb_eval_steps += 1

            # log the running mean validation loss every 100 steps
            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # keep only the positions that carry a real label (not -100)
            active_positions = flattened_targets != -100  # shape (batch_size * seq_len,)
            # move the selected values to the CPU once per batch instead of
            # iterating over device tensors element by element
            active_labels = torch.masked_select(flattened_targets, active_positions).cpu()
            active_predictions = torch.masked_select(flattened_predictions, active_positions).cpu()

            eval_labels.extend(active_labels.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_labels.numpy(), active_predictions.numpy())

    # translate the class ids back to tag names
    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    # max(..., 1) avoids a division by zero when the loader yields no batch
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy / max(nb_eval_steps, 1)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [35]:
# Evaluate on the held-out test set; valid() returns the flat lists of gold and predicted tags.
labels, predictions = valid(model, testing_loader) # >93% (presumably the validation accuracy the author observed)

Validation loss per 100 evaluation steps: 0.11762971431016922
Validation loss per 100 evaluation steps: 0.08840983291732904
Validation loss per 100 evaluation steps: 0.09957708449007953
Validation loss per 100 evaluation steps: 0.10018578654563556
Validation loss per 100 evaluation steps: 0.0959752924877342
Validation loss per 100 evaluation steps: 0.09904468712000039
Validation loss per 100 evaluation steps: 0.10280370103627653
Validation loss per 100 evaluation steps: 0.10357978712576714
Validation loss per 100 evaluation steps: 0.10249135626849722
Validation loss per 100 evaluation steps: 0.1012562908865067
Validation loss per 100 evaluation steps: 0.10328356151122016
Validation loss per 100 evaluation steps: 0.10374571586985286
Validation loss per 100 evaluation steps: 0.10370088355655671
Validation loss per 100 evaluation steps: 0.10384471752669384
Validation loss per 100 evaluation steps: 0.10380743689019749
Validation loss per 100 evaluation steps: 0.10386109699822123
Validation

Print f1_score and precision using classification report

In [46]:
# seqeval's classification_report prints precision/recall/F1 per entity type.
from seqeval.metrics import classification_report

# classification_report is given nested lists (a list of tag sequences); valid() returns one
# flat list covering every sentence, so each flat list is wrapped as a single sequence.
# NOTE(review): treating all sentences as one sequence may join entities across sentence
# boundaries — confirm this is acceptable for the reported scores.
labels_nested = [labels]
predictions_nested = [predictions]

# Build and print the report from the nested lists
report = classification_report(labels_nested, predictions_nested)
print(report)

              precision    recall  f1-score   support

         geo       0.82      0.90      0.86      7378
         gpe       0.95      0.93      0.94      3021
         org       0.69      0.55      0.61      3964
         per       0.73      0.80      0.76      3367
         tim       0.86      0.85      0.85      4070

   micro avg       0.81      0.82      0.81     21800
   macro avg       0.81      0.81      0.80     21800
weighted avg       0.81      0.82      0.81     21800



In [51]:
# example
# Run the fine-tuned model on a new sentence and print one predicted tag per word.
sentence = "@HuggingFace is a company based in New York, but is also has employees working in Paris"

inputs = tokenizer(sentence.split(),
                    add_special_tokens=True,  # Add [CLS] and [SEP]
                    is_split_into_words=True,  # the input is already split into words
                    return_offsets_mapping=True,  # Return offsets
                    padding='max_length',
                    truncation=True,
                    max_length=MAX_LEN,
                    return_tensors="pt")


# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# forward pass for inference only: evaluation mode switches off dropout, and no_grad
# avoids tracking gradients that are never used
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]  # without labels, the logits are the first output

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().tolist()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

# keep only the prediction made on the first word piece of each word: those are the
# tokens whose character offset starts at 0 and is not the empty (0, 0) span
prediction = [
    token_pred[1]
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist())
    if mapping[0] == 0 and mapping[1] != 0
]

print(sentence.split())
print(prediction)

['@HuggingFace', 'is', 'a', 'company', 'based', 'in', 'New', 'York,', 'but', 'is', 'also', 'has', 'employees', 'working', 'in', 'Paris']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo']


Save model for future use:

save the vocabulary (.txt) file, model weights (.bin) and the model's configuration (.json) to a directory, so that both the tokenizer and model can be re-loaded using the from_pretrained() class method.

In [53]:
import os

directory = "./model"

# create the target directory if needed; exist_ok avoids a separate existence check
os.makedirs(directory, exist_ok=True)

# save the tokenizer (vocabulary and tokenizer configuration) so it can be
# re-loaded from this directory with from_pretrained()
tokenizer.save_pretrained(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('The model is now trained and ready for use')
print('The model is now trained and ready for use')

All files saved
The model is now trained and ready for use
