In [2]:
#toeknized text
import transformers
from transformers import AutoTokenizer
from transformers import  DistilBertForTokenClassification

import torch
import torch.nn as nn

import torch
import re

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [3]:
class DistilbertNER(nn.Module):
  
  def __init__(self, tokens_dim):
    super(DistilbertNER,self).__init__()
    
    if type(tokens_dim) != int:
            raise TypeError('Please tokens_dim should be an integer')

    if tokens_dim <= 0:
          raise ValueError('Classification layer dimension should be at least 1')

    self.pretrained = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = tokens_dim) #set the output of each token classifier = unique_lables


  def forward(self, input_ids, attention_mask, labels = None): #labels are needed in order to compute the loss
    #inference time no labels
    if labels == None:
      out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
      return out

    out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
    return out

In [4]:
# Define the file path where the model is saved
model_save_path = "distilbert_ner_model_meta.pth"

# Load the model data
model_data = torch.load(model_save_path, map_location=torch.device('cpu'))

# Extract the model's state dictionary and metadata
model_state_dict = model_data["model_state_dict"]
metadata = model_data["metadata"]
idx2tag = metadata["idx2tag"]
tag2idx = metadata["tag2idx"]

# Load the model class
model = DistilbertNER(len(metadata["unique_tags"])) 

# Load the model's state dictionary
model.load_state_dict(model_state_dict)

print("Model and metadata loaded successfully.")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Model and metadata loaded successfully.


In [5]:
import torch
import re

def align_word_ids(texts):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

def evaluate_one_text(model, sentence):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()
    sentence_processed, pincode = preprocess_user_input(sentence)
    sentence =  sentence_processed

    text = tokenizer(sentence, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [idx2tag[i] for i in predictions]

    if pincode:
        prediction_label.append('pincode')
        

    print(sentence_processed)
    print(prediction_label)

def preprocess_user_input(sentence):
    # Lowercase the input
    sentence = sentence.lower()
    # Remove commas
    sentence = sentence.replace(',', '')
    # Extract pincode using regex
    pincode = re.findall(r'\b\d{6}\b', sentence)
    # Remove pincode from the sentence
    sentence = re.sub(r'\b\d{6}\b', '', sentence)
    # Join the remaining text
    sentence = ' '.join(sentence.split())
    return sentence, pincode

# Example usage:
sentence = "21, Kartavya Path, near chandi chowk,Delhi 110001"
evaluate_one_text(model, sentence)


21 kartavya path near chandi chowkdelhi
['flat_apartment_number', 'street', 'street', 'landmark', 'landmark', 'landmark', 'pincode']
