In [2]:
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import transformers
from transformers import AutoTokenizer
from transformers import  DistilBertForTokenClassification

from torch.optim import AdamW

import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

In [4]:
df = pd.read_csv("address_data.csv")

#change columns names
df.rename(columns = {'text':'sentence', 'labels':'tags'}, inplace = True)

#split train, dev , test sets
df_train, df_dev, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])


In [5]:
print('Train set:', df_train.shape)
print('Dev set:', df_dev.shape)
print('Test set:', df_test.shape)


Train set: (2000, 2)
Dev set: (250, 2)
Test set: (250, 2)


In [1]:
class DistilbertNER(nn.Module):
  """
  Input :
    - tokens_dim : number of unique labels in the dataset
  """
  
  def __init__(self, tokens_dim):
    super(DistilbertNER,self).__init__()
    
    if type(tokens_dim) != int:
            raise TypeError('Please tokens_dim should be an integer')

    if tokens_dim <= 0:
          raise ValueError('Classification layer dimension should be at least 1')

    self.pretrained = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = tokens_dim) #set the output of each token classifier = unique_lables


  def forward(self, input_ids, attention_mask, labels = None): #labels are needed in order to compute the loss
    """
  Forwad computation of the network
  Input:
    - inputs_ids : from model tokenizer
    - attention :  mask from model tokenizer
    - labels : if given the model is able to return the loss value
  """

    #inference time no labels
    if labels == None:
      out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
      return out

    out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
    return out

In [9]:
class NerDataset(torch.utils.data.Dataset):
  def __init__(self, df):
    if not isinstance(df, pd.DataFrame):
      raise TypeError('Input should be a dataframe')
    
    if "tags" not in df.columns or "sentence" not in df.columns:
      raise ValueError("Dataframe should contain 'tags' and 'sentence' columns")

     
    
    tags_list = [i.split() for i in df["tags"].values.tolist()]
    texts = df["sentence"].values.tolist()

    self.texts = [tokenizer(text, padding = "max_length", truncation = True, return_tensors = "pt") for text in texts]
    self.labels = [match_tokens_labels(text, tags) for text,tags in zip(self.texts, tags_list)]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_text = self.texts[idx]
    batch_labels = self.labels[idx]

    return batch_text, torch.LongTensor(batch_labels)

In [10]:
class MetricsTracking():
  def __init__(self):

    self.total_acc = 0
    self.total_f1 = 0
    self.total_precision = 0
    self.total_recall = 0

  def update(self, predictions, labels , ignore_token = -100):
    predictions = predictions.flatten()
    labels = labels.flatten()
    
    predictions = predictions[labels != ignore_token]
    labels = labels[labels != ignore_token]

    predictions = predictions.to("cpu")
    labels = labels.to("cpu")

    acc = accuracy_score(labels,predictions)
    f1 = f1_score(labels, predictions, average = "macro")
    precision = precision_score(labels, predictions, average = "macro")
    recall = recall_score(labels, predictions, average = "macro")

    self.total_acc  += acc
    self.total_f1 += f1
    self.total_precision += precision
    self.total_recall  += recall

  def return_avg_metrics(self,data_loader_size):
    n = data_loader_size
    metrics = {
        "acc": round(self.total_acc / n ,3), 
        "f1": round(self.total_f1 / n, 3), 
        "precision" : round(self.total_precision / n, 3), 
        "recall": round(self.total_recall / n, 3)
          }
    return metrics   

In [11]:
def tags_2_labels(tags : str, tag2idx : dict):
  return [tag2idx[tag] if tag in tag2idx else unseen_label for tag in tags.split()] 


In [12]:
def tags_mapping(tags_series : pd.Series):

  if not isinstance(tags_series, pd.Series):
      raise TypeError('Input should be a padas Series')

  unique_tags = set()
  
  for tag_list in df_train["tags"]:
    for tag in tag_list.split():
      unique_tags.add(tag)


  tag2idx = {k:v for v,k in enumerate(sorted(unique_tags))}
  idx2tag = {k:v for v,k in tag2idx.items()}


  return tag2idx, idx2tag, unique_tags

In [13]:
def match_tokens_labels(tokenized_input, tags, ignore_token = -100):

        word_ids = tokenized_input.word_ids()

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(ignore_token)

            else :
                try:
                  reference_tag = tags[word_idx]
                  label_ids.append(tag2idx[reference_tag])
                except:
                  label_ids.append(ignore_token)
              
            
            previous_word_idx = word_idx

        return label_ids

In [15]:
def train_loop(model, train_dataset, dev_dataset, optimizer,  batch_size, epochs):
  
  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) : 
    
    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #train mode

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      '''
      squeeze in order to match the sizes. From [batch,1,seq_len] --> [batch,seq_len] 
      '''
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()
      
      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1) 

      #compute metrics
      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      #grad step
      loss.backward()
      optimizer.step()
    
    '''
    EVALUATION MODE
    '''            
    model.eval()

    dev_metrics = MetricsTracking()
    total_loss_dev = 0
    
    with torch.no_grad():
      for dev_data, dev_label in dev_dataloader:

        dev_label = dev_label.to(device)

        mask = dev_data['attention_mask'].squeeze(1).to(device)
        input_id = dev_data['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask, dev_label)
        loss, logits = output.loss, output.logits

        predictions = logits.argmax(dim= -1)     

        dev_metrics.update(predictions, dev_label)
        total_loss_dev += loss.item()
    
    train_results = train_metrics.return_avg_metrics(len(train_dataloader))
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" ) 
    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )   


In [16]:
#create tag-label mapping
tag2idx, idx2tag , unique_tags = tags_mapping(df_train["tags"])

#create the label column from tag. Unseen labels will be tagged as "O"
for df in [df_train, df_dev, df_test]:
  df["labels"] = df["tags"].apply(lambda tags : tags_2_labels(tags, tag2idx))

In [17]:
# tag2idx, idx2tag , unseen_label, unique_tags print
print(tag2idx)
print(idx2tag)
print(unique_tags)


{'area_locality_name': 0, 'city_town': 1, 'flat_apartment_number': 2, 'landmark': 3, 'society_name': 4, 'street': 5, 'sub_locality': 6}
{0: 'area_locality_name', 1: 'city_town', 2: 'flat_apartment_number', 3: 'landmark', 4: 'society_name', 5: 'street', 6: 'sub_locality'}
{'sub_locality', 'street', 'flat_apartment_number', 'city_town', 'area_locality_name', 'landmark', 'society_name'}


In [18]:
#original text
text = df_train["sentence"].values.tolist()

#toeknized text
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text_tokenized = tokenizer(text , padding = "max_length" , truncation = True, return_tensors = "pt" )

#mapping token to original word
word_ids = text_tokenized.word_ids()

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
model = DistilbertNER(len(unique_tags))
#Prevent Catastrofic Forgetting
# model = freeze_model(model, num_layers = 2)

#datasets
train_dataset = NerDataset(df_train)
dev_dataset = NerDataset(df_dev)

lr = 1e-2
optimizer = SGD(model.parameters(), lr=lr, momentum = 0.9)  


#MAIN
parameters = {
    "model": model,
    "train_dataset": train_dataset,
    "dev_dataset" : dev_dataset,
    "optimizer" : optimizer,
    "batch_size" : 32,
    "epochs" : 15
}

train_loop(**parameters)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_sta

TRAIN 
Loss: 0.03646599742770195 
Metrics {'acc': 0.543, 'f1': 0.463, 'precision': 0.491, 'recall': 0.486}

VALIDATION 
Loss 0.02285837721824646 
Metrics{'acc': 0.738, 'f1': 0.662, 'precision': 0.743, 'recall': 0.659}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.017646216198801995 
Metrics {'acc': 0.799, 'f1': 0.749, 'precision': 0.783, 'recall': 0.749}

VALIDATION 
Loss 0.013862521409988404 
Metrics{'acc': 0.851, 'f1': 0.813, 'precision': 0.829, 'recall': 0.813}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.011832275010645389 
Metrics {'acc': 0.869, 'f1': 0.841, 'precision': 0.855, 'recall': 0.84}

VALIDATION 
Loss 0.013167136549949646 
Metrics{'acc': 0.865, 'f1': 0.827, 'precision': 0.868, 'recall': 0.823}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.008977713987231255 
Metrics {'acc': 0.901, 'f1': 0.882, 'precision': 0.891, 'recall': 0.88}

VALIDATION 
Loss 0.010438865184783935 
Metrics{'acc': 0.892, 'f1': 0.87, 'precision': 0.876, 'recall': 0.869}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0067182044275105 
Metrics {'acc': 0.929, 'f1': 0.915, 'precision': 0.919, 'recall': 0.914}

VALIDATION 
Loss 0.009935910046100617 
Metrics{'acc': 0.902, 'f1': 0.882, 'precision': 0.886, 'recall': 0.884}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.005595687322318554 
Metrics {'acc': 0.941, 'f1': 0.93, 'precision': 0.934, 'recall': 0.93}

VALIDATION 
Loss 0.010140681564807892 
Metrics{'acc': 0.904, 'f1': 0.885, 'precision': 0.889, 'recall': 0.885}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.004512226387858391 
Metrics {'acc': 0.953, 'f1': 0.945, 'precision': 0.947, 'recall': 0.945}

VALIDATION 
Loss 0.009088603138923646 
Metrics{'acc': 0.915, 'f1': 0.896, 'precision': 0.91, 'recall': 0.89}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.003613315608352423 
Metrics {'acc': 0.961, 'f1': 0.954, 'precision': 0.956, 'recall': 0.953}

VALIDATION 
Loss 0.0096691215634346 
Metrics{'acc': 0.918, 'f1': 0.9, 'precision': 0.907, 'recall': 0.898}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0032197951450943945 
Metrics {'acc': 0.965, 'f1': 0.959, 'precision': 0.96, 'recall': 0.96}

VALIDATION 
Loss 0.009973023951053619 
Metrics{'acc': 0.916, 'f1': 0.896, 'precision': 0.9, 'recall': 0.895}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0027574150562286377 
Metrics {'acc': 0.971, 'f1': 0.966, 'precision': 0.968, 'recall': 0.966}

VALIDATION 
Loss 0.009825020849704743 
Metrics{'acc': 0.92, 'f1': 0.901, 'precision': 0.91, 'recall': 0.897}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0028076435066759584 
Metrics {'acc': 0.97, 'f1': 0.965, 'precision': 0.966, 'recall': 0.965}

VALIDATION 
Loss 0.010311530470848084 
Metrics{'acc': 0.911, 'f1': 0.896, 'precision': 0.896, 'recall': 0.9}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0026448359694331886 
Metrics {'acc': 0.971, 'f1': 0.966, 'precision': 0.967, 'recall': 0.967}

VALIDATION 
Loss 0.009521418631076813 
Metrics{'acc': 0.926, 'f1': 0.91, 'precision': 0.916, 'recall': 0.907}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0021692457459867 
Metrics {'acc': 0.976, 'f1': 0.972, 'precision': 0.973, 'recall': 0.972}

VALIDATION 
Loss 0.009679481744766235 
Metrics{'acc': 0.923, 'f1': 0.906, 'precision': 0.907, 'recall': 0.907}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0017166673401370644 
Metrics {'acc': 0.982, 'f1': 0.979, 'precision': 0.979, 'recall': 0.979}

VALIDATION 
Loss 0.010007084131240844 
Metrics{'acc': 0.929, 'f1': 0.914, 'precision': 0.912, 'recall': 0.92}



100%|██████████| 63/63 [00:51<00:00,  1.23it/s]


TRAIN 
Loss: 0.0015215562703087926 
Metrics {'acc': 0.985, 'f1': 0.982, 'precision': 0.982, 'recall': 0.982}

VALIDATION 
Loss 0.011374484479427337 
Metrics{'acc': 0.921, 'f1': 0.905, 'precision': 0.905, 'recall': 0.908}



In [24]:
import torch

# Define the file path where you want to save the model
model_save_path = "distilbert_ner_model_meta.pth"

# Define all important metadata
metadata = {
    "unique_tags": unique_tags,  # Assuming unique_tags is defined in your context
    "tag2idx": tag2idx,
    "idx2tag": idx2tag
    # Add any other metadata you want to save
}

# Create a dictionary to save the model and metadata
model_data = {
    "model_state_dict": model.state_dict(),
    "metadata": metadata,
}

# Save the model data
torch.save(model_data, model_save_path)

# Print a message to indicate that the model and metadata are saved
print(f"Model and metadata saved at {model_save_path}")


Model and metadata saved at distilbert_ner_model_meta.pth
