In [1]:
# Install tqdm into the environment used by this kernel.
# NOTE(review): tqdm is not imported in any cell shown here — confirm it is still needed,
# and pin a version if it is.
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
import os

from torch.utils.data import DataLoader
from torch.optim import SGD
from transformers import BertTokenizerFast, BertForTokenClassification

In [3]:
# Sanity check: report the CUDA version string exposed by the installed PyTorch build.
print(torch.version.cuda)

11.8


In [4]:
# Load the NER dataset. Later cells read two columns from it: 'text' (the sentence)
# and 'labels' (whitespace-separated tags that are split per word further down).
df = pd.read_csv('/kaggle/input/standard-ner-dataset/standard_NER.csv')
print(f"df.shape: {df.shape}")
# Last expression of the cell, so the preview is rendered as the cell output.
df.head()

df.shape: (47959, 2)


Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [5]:
# Split each whitespace-separated label string into a list of tags
labels = [i.split() for i in df['labels'].values.tolist()]

# Collect every distinct tag that appears anywhere in the dataset
unique_labels = set()
for lb in labels:
  unique_labels.update(lb)

# Sort once so both mappings are built from the same deterministic tag order
sorted_labels = sorted(unique_labels)

# tag -> integer id, and the inverse integer id -> tag
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
ids_to_labels = {idx: label for idx, label in enumerate(sorted_labels)}

print(labels_to_ids)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


In [6]:
# Work on a copy so the original multi-class `df` is left untouched
two_label_df = df.copy()

# Collapse every entity tag (anything other than 'O') into the single tag 'N'
two_label_df['labels'] = two_label_df['labels'].apply(
    lambda label: ' '.join('O' if tag == 'O' else 'N' for tag in label.split())
)

# Binary tag vocabulary; the inverse is derived so the two mappings cannot drift apart
two_labels_to_ids = {'N': 0, 'O': 1}
two_ids_to_labels = {idx: tag for tag, idx in two_labels_to_ids.items()}

print(two_labels_to_ids)

# Must be the last expression of the cell: a bare expression in the middle of a cell
# is evaluated and silently discarded, so the preview would never be shown.
two_label_df.head()

{'N': 0, 'O': 1}


In [7]:
# Module-level tokenizer shared by align_label() and DataSequence below (both read this global).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def align_label(texts, labels, labels_to_ids_mapping, max_length=512, tok=None):
  """Build one label id per token position for a single sentence.

  The sentence is tokenized (padded/truncated to `max_length`) and every token
  position is mapped through `word_ids()` to a word index, which is then used to
  look up that word's tag in `labels`.

  Args:
    texts: The sentence to tokenize (a single string).
    labels: Sequence of tag strings, indexed by word position.
    labels_to_ids_mapping: Dict mapping a tag string to its integer id.
    max_length: Padded/truncated sequence length passed to the tokenizer.
    tok: Optional tokenizer callable; defaults to the module-level `tokenizer`.
      It must return an object exposing `word_ids()`.

  Returns:
    A list of ints, one per token position. Positions without a word (special
    and padding tokens), words with no entry in `labels`, and tags missing from
    the mapping all get -100. Every sub-token of a word receives the word's label.

  NOTE(review): callers build `labels` from a whitespace split, while `word_ids()`
  indexes the tokenizer's own notion of a word. If the tokenizer also splits on
  punctuation (e.g. "hour-long"), the two indexings can drift apart — confirm
  against the tokenizer documentation before relying on per-token labels.
  """
  encode = tokenizer if tok is None else tok
  tokenized_inputs = encode(texts, padding='max_length', max_length=max_length, truncation=True)

  label_ids = []
  for word_idx in tokenized_inputs.word_ids():
    if word_idx is None:
      # Special / padding token: excluded from scoring via the -100 sentinel.
      label_ids.append(-100)
      continue
    try:
      label_ids.append(labels_to_ids_mapping[labels[word_idx]])
    except (KeyError, IndexError):
      # Unknown tag, or fewer labels than words: treat the position as unlabeled.
      label_ids.append(-100)
  return label_ids


class DataSequence(torch.utils.data.Dataset):
  """Dataset of pre-tokenized sentences paired with per-token label ids.

  Everything is tokenized eagerly in the constructor using the module-level
  `tokenizer` and `align_label`; indexing only looks the results up.
  """

  def __init__(self, df, labels_to_ids_mapping):
    """Tokenize every row of `df`.

    Args:
      df: DataFrame with a 'text' column and a 'labels' column of
        whitespace-separated tags.
      labels_to_ids_mapping: Dict mapping a tag string to its integer id.
    """
    # Coerce once so the model inputs and the label alignment see exactly the
    # same string, even when a cell is not a str.
    txt = [str(t) for t in df['text'].values.tolist()]
    lb = [tags.split() for tags in df['labels'].values.tolist()]
    self.texts = [tokenizer(t, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for t in txt]
    self.labels = [align_label(t, tags, labels_to_ids_mapping) for t, tags in zip(txt, lb)]

  def __len__(self):
    """Return the number of sentences."""
    return len(self.labels)

  def get_batch_data(self, idx):
    """Return the tokenizer output stored for sentence `idx`."""
    return self.texts[idx]

  def get_batch_labels(self, idx):
    """Return the label ids of sentence `idx` as a LongTensor."""
    return torch.LongTensor(self.labels[idx])

  def __getitem__(self, idx):
    """Return `(tokenizer_output, label_tensor)` for sentence `idx`."""
    batch_data = self.get_batch_data(idx)
    batch_labels = self.get_batch_labels(idx)
    return batch_data, batch_labels

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# For a quick dry run, subsample before splitting, e.g. `df = df[0:1000]`.

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% by position.
# Positional slicing gives the same three frames as np.split, without sending a
# DataFrame through numpy's splitting helper (NOTE(review): presumably the source
# of the warning fragment in this cell's recorded output).
df_shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(f"df_train.shape: {df_train.shape}")
print(f"df_val.shape: {df_val.shape}")
print(f"df_test.shape: {df_test.shape}")

df_train.shape: (38367, 2)
df_val.shape: (4796, 2)
df_test.shape: (4796, 2)


  return bound(*args, **kwds)


In [9]:
# Same seeded 80% / 10% / 10% positional split as for the multi-class frame, applied to
# the two-label copy. Positional slicing is used instead of np.split so the DataFrame
# is not sent through numpy's splitting helper.
two_label_shuffled = two_label_df.sample(frac=1, random_state=42)
two_train_end, two_val_end = int(.8 * len(two_label_df)), int(.9 * len(two_label_df))
two_label_df_train = two_label_shuffled.iloc[:two_train_end]
two_label_df_val = two_label_shuffled.iloc[two_train_end:two_val_end]
two_label_df_test = two_label_shuffled.iloc[two_val_end:]

print(f"df_train.shape: {two_label_df_train.shape}")
print(f"df_val.shape: {two_label_df_val.shape}")
print(f"df_test.shape: {two_label_df_test.shape}")

df_train.shape: (38367, 2)
df_val.shape: (4796, 2)
df_test.shape: (4796, 2)


In [10]:
class BertModel(torch.nn.Module):
  """Wrapper around a `bert-base-cased` token-classification model.

  The classification head is sized from the module-level `unique_labels` set.
  """

  def __init__(self):
    super().__init__()
    label_count = len(unique_labels)
    self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=label_count)

  def forward(self, input_id, mask, label):
    """Run the wrapped model and return its output unchanged.

    `return_dict=False` is passed through, so the result is whatever the wrapped
    model returns in that mode; `evaluate` unpacks it as `(loss, logits)`.
    """
    return self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

In [11]:
# Set TOKENIZERS_PARALLELISM explicitly to avoid the tokenizers parallelism warning:
# "true" keeps tokenizer parallelism on, "false" turns it off.
# NOTE(review): evaluate() uses DataLoader worker processes (num_workers=4) — confirm
# that "true" is the intended value in combination with those workers.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [12]:
# Load the fine-tuned model, which was saved as an entire pickled module (not a state_dict).
# SECURITY: unpickling can execute arbitrary code — only load checkpoints from a trusted source.
# - map_location lets a checkpoint saved on GPU load on a CPU-only machine.
# - weights_only=False is stated explicitly because a full-module pickle cannot be
#   loaded in weights-only mode.
load_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tuned_model = torch.load(
    "/kaggle/input/ner-with-bert-models/NER_With_Bert_10Epoch_FullData_10Batch.pth",
    map_location=load_device,
    weights_only=False,
)

# Baseline: the pretrained checkpoint with a freshly initialized classification head
# (see the "newly initialized" message in this cell's output), so it is not fine-tuned.
standard_model = BertForTokenClassification.from_pretrained("bert-base-cased")

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# NOTE(review): BertForTokenClassification is already imported in the imports cell,
# so this re-import is redundant.
from transformers import BertForTokenClassification

# Load a token-classification model and print how many labels its config declares.
# NOTE(review): this loads 'bert-base-uncased', whereas every other cell shown uses
# 'bert-base-cased', and no num_labels is passed — so the printed value is not tied to
# this dataset's label set. Confirm which checkpoint was meant to be inspected.
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
num_labels = model.config.num_labels
print(f"Number of labels expected by the model: {num_labels}")

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of labels expected by the model: 2


In [14]:
def evaluate(model, df_test, model_type, labels_to_ids_mapping):
    """Print the mean per-sentence token accuracy of `model` on `df_test`.

    For each sentence, accuracy is computed over the token positions whose label
    is not -100; the printed figure is the average of those per-sentence values
    (not a pooled per-token accuracy).

    Args:
        model: The model to score. It is moved to the GPU when one is available.
        df_test: DataFrame with 'text' and 'labels' columns, as expected by
            `DataSequence`.
        model_type: "fine_tuned" for a wrapper called as
            `model(input_id, mask, label)` that returns `(loss, logits)`;
            "standard" for a model called with keyword arguments whose output
            exposes `.logits`.
        labels_to_ids_mapping: Dict mapping a tag string to its integer id.

    Raises:
        ValueError: If `model_type` is not one of the two supported values.
    """
    if model_type not in ("fine_tuned", "standard"):
        raise ValueError(f"Unknown model_type: {model_type!r}")

    test_dataset = DataSequence(df_test, labels_to_ids_mapping)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Score in inference mode (dropout off, no autograd bookkeeping) and put the
    # model back into whatever mode the caller had it in afterwards.
    was_training = model.training
    model.eval()

    total_acc_test = 0.0
    scored_sequences = 0

    try:
        with torch.no_grad():
            for test_data, test_label in test_dataloader:
                test_label = test_label.to(device)
                # squeeze(1) drops the size-1 axis at position 1 of the batched tensors
                # (presumably the per-item batch axis added by return_tensors="pt" — TODO confirm).
                mask = test_data['attention_mask'].squeeze(1).to(device)
                input_id = test_data['input_ids'].squeeze(1).to(device)

                if model_type == "fine_tuned":
                    _, logits = model(input_id, mask, test_label)
                else:
                    logits = model(input_id, attention_mask=mask, labels=test_label).logits

                for i in range(logits.shape[0]):
                    keep = test_label[i] != -100
                    if not keep.any():
                        # No scorable positions: a mean over nothing would be NaN
                        # and poison the running total.
                        continue
                    predictions = logits[i][keep].argmax(dim=1)
                    total_acc_test += (predictions == test_label[i][keep]).float().mean().item()
                    scored_sequences += 1
    finally:
        model.train(was_training)

    val_accuracy = total_acc_test / scored_sequences if scored_sequences else float('nan')
    print(f'Test Accuracy: {val_accuracy: .3f}')


# Score the fine-tuned multi-class model first, then the two-label baseline.
evaluate(model=fine_tuned_model, df_test=df_test, model_type="fine_tuned", labels_to_ids_mapping=labels_to_ids)
evaluate(model=standard_model, df_test=two_label_df_test, model_type="standard", labels_to_ids_mapping=two_labels_to_ids)

Test Accuracy:  0.964
Test Accuracy:  0.619
