In [13]:
%pip install tqdm

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import numpy as np
import torch

from torch.utils.data import DataLoader
from torch.optim import SGD
from transformers import BertTokenizerFast
from tqdm import tqdm

In [15]:
NER_DATASETS_DIRECTORY = "kaggle/input/"
STANDARD_NER_DIRECTORY = "standard_ner_dataset/"

# INDIAN_LEGAL_NER_DIRECTORY = "Indian_Legal_NER_Dataset/"

# NER_DATASETS_DIRECTORY = "All_Data/NER_Datasets/"
# STANDARD_NER_DIRECTORY = "Standard_NER_Dataset/"
# INDIAN_LEGAL_NER_DIRECTORY = "Indian_Legal_NER_Dataset/"

In [16]:
!pwd
%cd /media/akheel/Windows-SSD/Users/akhee/Documents/Projects/NITK/Major Project/LegalDoc-Retrieval-n-Summarization/

/media/akheel/Windows-SSD/Users/akhee/Documents/Projects/NITK/Major Project/LegalDoc-Retrieval-n-Summarization
/media/akheel/Windows-SSD/Users/akhee/Documents/Projects/NITK/Major Project/LegalDoc-Retrieval-n-Summarization


In [17]:
# df = pd.read_csv('/kaggle/input/standard-ner-dataset/standard_NER.csv')
df = pd.read_csv('All_Data/NER_Datasets/Standard_NER_Dataset/standard_NER.csv')
print(f"df.shape: {df.shape}")
df.head()

df.shape: (47959, 2)


Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


**There are 9 tag categories (8 entity types plus `O`):**
- geo for geographical entity
- org for organization entity
- per for person entity
- gpe for geopolitical entity
- tim for time indicator entity
- art for artifact entity
- eve for event entity
- nat for natural phenomenon entity
- O is assigned if a word doesn’t belong to any entity.

**Every category except `O` has a beginning (`B-`) and an intermediate (`I-`) tag, bringing the total to 17 tags.**

In [18]:
# Every row stores its tags as one whitespace-separated string; break each
# row into a list with one tag per word.
labels = [row.split() for row in df['labels'].values.tolist()]

# Gather the distinct tags that occur anywhere in the dataset.
unique_labels = set()

for lb in labels:
  unique_labels.update(lb)

print(unique_labels)

# Build the tag -> id lookup and its inverse; sorting keeps the ids stable
# from run to run.
labels_to_ids = {tag: idx for idx, tag in enumerate(sorted(unique_labels))}
ids_to_labels = dict(enumerate(sorted(unique_labels)))

print(labels_to_ids)

{'I-org', 'I-eve', 'B-nat', 'I-gpe', 'I-art', 'I-tim', 'O', 'B-gpe', 'B-art', 'B-per', 'B-org', 'I-per', 'B-tim', 'I-nat', 'B-geo', 'I-geo', 'B-eve'}
{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


In [19]:
num_labels = len(labels_to_ids)
print(num_labels)

17


**We define a DataSequence Class**
- When initialized, this class splits the labels and the texts into their own lists.
- `texts` holds each text tokenized and padded/truncated to a `max_length` of 512.
- `labels` is built from each label list, adjusted for BERT's subword tokenization and for padding:
    - If a token has no word index (special or padding token), its label id is -100.
    - If a token continues the previous word (i.e. it is a subword token), it gets the same label id as that word.
    - If a token starts a new word, it gets the label id of that word.

In [20]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def align_label(texts, labels):
  """Expand per-word tags into one label id per BERT token position.

  Args:
    texts: the sentence; it is converted with ``str()`` and split on
      whitespace so that word positions line up with ``labels``.
    labels: list of tag strings, one per whitespace-separated word.

  Returns:
    A list of 512 ints (the tokenizer pads/truncates to ``max_length=512``).
    Special and padding tokens get -100. Every other token, including each
    subword piece of a word, gets the id of its word's tag.
  """
  # Tokenize the whitespace-split words rather than the raw string. On a raw
  # string, word_ids() counts the tokenizer's own pre-tokenized words, where
  # punctuation splits e.g. "hour-long" into several words, so the indices
  # drift away from the whitespace-based tag list. With
  # is_split_into_words=True each word id refers to a position in `words`.
  words = str(texts).split()
  tokenized_inputs = tokenizer(words, is_split_into_words=True, padding='max_length', max_length=512, truncation=True)

  label_ids = []
  for word_idx in tokenized_inputs.word_ids():
    if word_idx is None or word_idx >= len(labels):
      # Special/padding token, or a word with no tag supplied: mark as ignored.
      label_ids.append(-100)
    else:
      # Unknown tag strings are also ignored instead of raising.
      label_ids.append(labels_to_ids.get(labels[word_idx], -100))
  return label_ids


class DataSequence(torch.utils.data.Dataset):
  """Dataset of tokenized sentences paired with token-level label ids.

  Built from a DataFrame with a 'text' column and a 'labels' column holding
  whitespace-separated tags. Each item is a ``(encoding, label_tensor)`` pair.
  """

  def __init__(self, df):
    # Coerce every text cell to str once and reuse it for both the encoding
    # and the label alignment, so a non-string cell is treated the same way
    # in both places.
    sentences = [str(text) for text in df['text'].values.tolist()]
    word_tags = [tags.split() for tags in df['labels'].values.tolist()]
    self.texts = [tokenizer(sentence, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for sentence in sentences]
    self.labels = [align_label(sentence, tags) for sentence, tags in zip(sentences, word_tags)]

  def __len__(self):
    """Number of samples."""
    return len(self.labels)

  def get_batch_data(self, idx):
    """Return the stored tokenizer encoding for sample ``idx``."""
    return self.texts[idx]

  def get_batch_labels(self, idx):
    """Return the label ids of sample ``idx`` as a LongTensor."""
    return torch.LongTensor(self.labels[idx])

  def __getitem__(self, idx):
    """Return ``(encoding, label_tensor)`` for sample ``idx``."""
    return self.get_batch_data(idx), self.get_batch_labels(idx)

**With the DataSequence Class Defined, We can split the actual Data**  
For building the model we use only the first 1000 rows.

In [21]:
# Keep only the first 1000 rows so that model building stays fast.
df = df[0:1000]

# Shuffle once with a fixed seed, then cut into 80% train / 10% validation /
# 10% test. Plain iloc slices give the same three pieces as
# np.split(shuffled, [train_end, val_end]) without routing a DataFrame through
# np.split, which relies on the deprecated DataFrame.swapaxes and emits a
# FutureWarning.
df_shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))
df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]
# print(f"df_train.shape: {df_train.shape}")
# print(f"df_val.shape: {df_val.shape}")
# print(f"df_test.shape: {df_test.shape}")

  return bound(*args, **kwds)


## Model Building

In [22]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):
  """Wrapper module around a pretrained 'bert-base-cased' token classifier.

  The classification head is sized to the number of distinct tags in the
  module-level ``unique_labels`` set.
  """

  def __init__(self):
    super().__init__()
    label_count = len(unique_labels)
    self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=label_count)

  def forward(self, input_id, mask, label):
    """Run the wrapped model and return its raw tuple output.

    ``return_dict=False`` makes the underlying model return a tuple; callers
    in this notebook unpack it as ``(loss, logits)`` when ``label`` is given
    and read element 0 as the logits when ``label`` is None.
    """
    return self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

## Model Training

In [23]:
import os

# Set this explicitly to avoid the tokenizers parallelism warning: "true" keeps tokenizer parallelism enabled, "false" disables it
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [24]:
def _sum_sample_accuracies(logits, labels):
  """Sum, over the samples of one batch, each sample's token accuracy.

  Positions labelled -100 are excluded. A sample with no scorable position
  contributes 0 instead of the NaN that ``mean()`` of an empty tensor gives.
  """
  total = 0.0
  for sample_logits, sample_labels in zip(logits, labels):
    keep = sample_labels != -100
    if not keep.any():
      continue
    predictions = sample_logits[keep].argmax(dim=1)
    total += (predictions == sample_labels[keep]).float().mean().item()
  return total


def train_loop(model, df_train, df_val):
  """Fine-tune ``model`` on ``df_train`` and print metrics after every epoch.

  Reads the module-level ``BATCH_SIZE``, ``LEARNING_RATE`` and ``EPOCHS``
  settings. Both frames are wrapped in ``DataSequence``. The model is moved
  to CUDA when available and optimized with plain SGD.

  Args:
    model: module whose call ``model(input_ids, mask, labels)`` returns
      ``(loss, logits)``.
    df_train: training rows ('text' / 'labels' columns).
    df_val: validation rows, same columns.

  Returns:
    None. Per-epoch loss and accuracy are printed, each averaged over the
    number of rows in the corresponding frame.
  """
  train_dataset = DataSequence(df_train)
  val_dataset = DataSequence(df_val)

  train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
  val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

  if use_cuda:
    print("Im using Cudaa")
    model = model.cuda()

  for epoch_num in range(EPOCHS):

    total_acc_train = 0.0
    total_loss_train = 0.0

    model.train()

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      # Each encoding was created with a leading batch dimension of 1, so the
      # collated tensors carry an extra axis that squeeze(1) removes.
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()
      loss, logits = model(input_id, mask, train_label)
      loss.backward()
      optimizer.step()

      total_acc_train += _sum_sample_accuracies(logits.detach(), train_label)
      # The batch loss is counted once per sample so that dividing by the
      # number of rows below yields a per-sample average.
      total_loss_train += loss.item() * logits.shape[0]

    model.eval()

    total_acc_val = 0.0
    total_loss_val = 0.0

    # No gradients are needed for validation; skipping them avoids building
    # the autograd graph for every batch.
    with torch.no_grad():
      for val_data, val_label in val_dataloader:

        val_label = val_label.to(device)
        mask = val_data['attention_mask'].squeeze(1).to(device)
        input_id = val_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, mask, val_label)

        total_acc_val += _sum_sample_accuracies(logits, val_label)
        total_loss_val += loss.item() * logits.shape[0]

    # The last field is labelled Val_Accuracy so it cannot be confused with
    # the training accuracy printed earlier on the same line.
    print(f"Epochs: {epoch_num + 1} | "
          f"Loss: {total_loss_train / len(df_train): .3f} | "
          f"Accuracy: {total_acc_train / len(df_train): .3f} | "
          f"Val_Loss: {total_loss_val / len(df_val): .3f} | "
          f"Val_Accuracy: {total_acc_val / len(df_val): .3f}")

# Hyperparameters; train_loop reads these as module-level globals.
LEARNING_RATE = 5e-3  # learning rate handed to the SGD optimizer
EPOCHS = 5  # number of passes over the training split
BATCH_SIZE = 5  # samples per batch for the training and validation loaders

# Instantiate the model and fine-tune it on the training split, validating on df_val.
model = BertModel()
train_loop(model, df_train, df_val)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█▎        | 21/160 [03:13<21:18,  9.20s/it]


KeyboardInterrupt: 

### Saving Model

In [None]:
# # For tensor flow
# # Save the model architecture to a JSON file
# model_json = model.to_json()
# with open("bert_model.json", "w") as json_file:
#     json_file.write(model_json)

# # Save the model weights to a separate file
# model.save_weights("bert_model_weights.h5")

# For PyTorch
# Save the entire model (including architecture and weights)
torch.save(model, "NER_With_Bert_5Epoch_NotFullData_17Batch.pth")

# model.export('NER_With_Bert_5Epoch_NotFullData.pkl')

### Loading Model Back

In [None]:
import torch

# Load the entire PyTorch model
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# NOTE(review): this path does not match the filename written by the save cell
# ("NER_With_Bert_5Epoch_NotFullData_17Batch.pth") - confirm it is the intended checkpoint.
model = torch.load("/kaggle/input/ner-with-bert-models/NER_With_Bert_5Epoch_NotFullData.pth")


## Model Evaluating

In [None]:
def evaluate(model, df_test):
  """Print the mean per-sentence token accuracy of ``model`` on ``df_test``.

  The model is moved to CUDA when available and switched to evaluation mode.
  It is left in evaluation mode afterwards.

  Args:
    model: module whose call ``model(input_ids, mask, labels)`` returns
      ``(loss, logits)``.
    df_test: rows to score ('text' / 'labels' columns).

  Returns:
    None. The accuracy is printed.
  """
  test_dataset = DataSequence(df_test)

  test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()

  # Evaluation mode disables dropout so the reported accuracy is deterministic.
  model.eval()

  total_acc_test = 0.0

  # Inference only: skip gradient tracking.
  with torch.no_grad():
    for test_data, test_label in test_dataloader:

      test_label = test_label.to(device)
      mask = test_data['attention_mask'].squeeze(1).to(device)
      input_id = test_data['input_ids'].squeeze(1).to(device)

      _, logits = model(input_id, mask, test_label)

      for i in range(logits.shape[0]):
        keep = test_label[i] != -100
        if not keep.any():
          # Nothing to score; mean() over an empty tensor would be NaN.
          continue

        predictions = logits[i][keep].argmax(dim=1)
        total_acc_test += (predictions == test_label[i][keep]).float().mean().item()

  print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(model, df_test)

## Model Usage Testing

In [None]:
def align_word_ids(texts):
  """Build a per-token mask for ``texts`` at the fixed length of 512.

  Args:
    texts: the sentence to tokenize.

  Returns:
    A list with one entry per token position: -100 where the tokenizer
    reports no word (special and padding tokens), 1 for every other token,
    whether it starts a word or continues one.
  """
  tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

  # Appending a constant cannot fail, so no exception handling is needed, and
  # word-start and continuation tokens are treated identically.
  return [-100 if word_idx is None else 1 for word_idx in tokenized_inputs.word_ids()]


def evaluate_one_text(model, sentence):
  """Predict a tag for every token of ``sentence`` and print the result.

  The model is moved to CUDA when available and switched to evaluation mode.

  Args:
    model: module whose call ``model(input_ids, mask, None)`` returns a tuple
      with the logits as element 0.
    sentence: text to tag; it is padded/truncated to 512 tokens.

  Returns:
    List of tag strings, one per token that ``align_word_ids`` does not mark
    with -100. There is one entry per token, not per word.
  """
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()

  # Evaluation mode disables dropout so repeated calls give the same tags.
  model.eval()

  text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

  mask = text['attention_mask'].to(device)
  input_id = text['input_ids'].to(device)
  label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

  # Inference only: skip gradient tracking.
  with torch.no_grad():
    logits = model(input_id, mask, None)

  # Keep only the positions that are not marked -100.
  logits_clean = logits[0][label_ids != -100]

  predictions = logits_clean.argmax(dim=1).tolist()
  prediction_label = [ids_to_labels[i] for i in predictions]
  print(sentence)
  print(prediction_label)

  return prediction_label
            
evaluate_one_text(model, 'Bill Gates is the founder of Microsoft')

In [None]:
SAMPLE_DOCUMENT_SUMMARIES_DIR = '/kaggle/input/sample-document-summaries-for-skw-dataset/'
INDIAN_SUMMARY_FILE = '1.txt'
UK_SUMMARY_FILE = 'uksc-2009-0019.txt'

In [None]:
# Full paths of the two sample summaries (one Indian, one UK).
summaryFilePaths = [
    SAMPLE_DOCUMENT_SUMMARIES_DIR + fileName
    for fileName in (INDIAN_SUMMARY_FILE, UK_SUMMARY_FILE)
]

# Read every summary file completely into memory as UTF-8 text.
summaries = []
for path in summaryFilePaths:
    with open(path, mode='r', encoding='utf-8') as file:
        summaries.append(file.read())

# Tag each summary. This rebinds `labels`, which earlier held the training
# tag lists.
labels = [evaluate_one_text(model, summary) for summary in summaries]

In [None]:
# summary = summaries[0]
# summary_tokens = tokenizer(summary, padding='max_length', max_length=512, truncation=True)
# print(summary_tokens["input_ids"])

In [None]:
# summary = summaries[1]
# summary_tokens = tokenizer(summary, padding='max_length', max_length=512, truncation=True)
# print(summary_tokens["input_ids"])
# important_ids = [i for i in range(len(label)) if label[i] != 'O']
# important_token_ids = [input_id for i, input_id in enumerate(summary_tokens["input_ids"]) if i in set(important_ids)]
# print(important_token_ids)

In [None]:
# print(len(summaries))
# print(len(labels))
# print(summaries)
# print(labels)

# for summary, label in zip(summaries, labels):
#     summary_tokens = tokenizer(summary, padding='max_length', max_length=512, truncation=True)
#     important_ids = [i for i in range(len(label)) if label[i] != 'O']
#     important_token_ids = [input_id for i, input_id in enumerate(summary_tokens["input_ids"]) if i in set(important_ids)]
# #     important_tokens = [summary_tokens[i] for i in important_ids]
#     important_tokens = tokenizer.convert_ids_to_tokens(important_token_ids)
#     print(len(summary_tokens))
#     print(len(label))
#     print(important_tokens)
#     print(important_ids)

In [None]:
# for i, (summary, label) in enumerate(zip(summaries, labels)):
#     print(label)
#     summary_tokens = tokenizer(summary, padding='max_length', max_length=512, truncation=True)
#     important_ids = [i for i in range(len(label)) if label[i] != 'O']
#     print(important_ids)
#     important_token_ids = [input_id for i, input_id in enumerate(summary_tokens["input_ids"]) if i in set(important_ids)]
#     important_tokens = tokenizer.convert_ids_to_tokens(important_token_ids)
#     print(f"{i}: {important_tokens}")

### Retrieving Non-'O' Words

In [None]:
# For every summary, collect the text of the tokens predicted as something
# other than 'O', gluing WordPiece continuation pieces ("##...") back onto the
# word they belong to, and print the result.
for summary, token_labels in zip(summaries, labels):
    # Retrieve non-'O' labeled words and their corresponding tokens
    non_o_tokens = []  # finished runs of consecutive non-'O' words, space-joined
    current_word_tokens = []  # words of the run currently being built
    
    # Re-tokenize with the same padding/truncation settings used for prediction.
    summary_token_ids = tokenizer(summary, padding='max_length', max_length=512, truncation=True)
    # [1:] drops the first id (presumably the [CLS] token - TODO confirm) so the
    # remaining ids line up with token_labels, which has no entries for
    # positions that were masked with -100.
    summary_tokens = tokenizer.convert_ids_to_tokens(summary_token_ids['input_ids'][1:])
    
    # Debug dump of every (token, id, predicted label) triple; zip stops at the
    # shortest sequence.
    for token, token_id, token_label in zip(summary_tokens, summary_token_ids["input_ids"][1:], token_labels):
        print(token, token_id, token_label)
        
#     print(summary_tokens)
#     print(summary_token_ids['input_ids'])
#     print(token_labels)

    for token_id, label in zip(summary_token_ids['input_ids'][1:], token_labels):
        token_text = tokenizer.decode(token_id)

        # Handle subword tokens
        # A continuation piece is appended to the last collected word without
        # checking its own label; it is dropped when no run is in progress.
        if token_text.startswith("##"):
            if current_word_tokens:
                current_word_tokens[-1] += token_text[2:]
        else:
            if label != 'O':
                # Word-start token with an entity label: extend the current run.
                current_word_tokens.append(token_text)
            else:
                # An 'O' word ends the current run; flush it as one entry.
                if current_word_tokens:
                    non_o_tokens.append(" ".join(current_word_tokens))
                    current_word_tokens = []

    # Check the last word if it's non-'O'
    if current_word_tokens:
        non_o_tokens.append(" ".join(current_word_tokens))

    # non_o_tokens now contains all non-'O' labeled words, accounting for subword tokens
    print("Important Words: ", non_o_tokens)