<a href="https://colab.research.google.com/github/peravali810/FinetuningLLMs/blob/main/NER_using_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
!pip install -q transformers accelerate evaluate datasets seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [37]:
import pandas as pd
import json
import requests

#Loading dataset

##Loading train dataset

In [38]:
res = requests.get('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio')
res = res.text.splitlines()

trainTokens = []
trainTags = []

tempTokens = []
tempTags = []

for line in res:
  if line != '':
    tag, token = line.strip().split("\t")
    tempTags.append(tag)
    tempTokens.append(token)
  else:
    trainTokens.append(tempTokens)
    trainTags.append(tempTags)

    tempTokens =[]
    tempTags = []

len(trainTokens), len(trainTags)


(7659, 7659)

##Loading test and validation dataset

In [39]:
res = requests.get('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/test.bio')
res = res.text.splitlines()

testTokens = []
testTags = []

tempTokens = []
tempTags = []

for line in res:
  if line != '':
    tag, token = line.strip().split("\t")
    tempTags.append(tag)
    tempTokens.append(token)
  else:
    testTokens.append(tempTokens)
    testTags.append(tempTags)

    tempTokens =[]
    tempTags = []

len(testTokens), len(testTags)


(1520, 1520)

#Creating Huggingface Dataset

In [56]:
from datasets import Dataset, DatasetDict

df = pd.DataFrame({"tokens": trainTokens, "tags": trainTags})
trainData = Dataset.from_pandas(df)

df = pd.DataFrame({"tokens": testTokens, "tags": testTags})
testData = Dataset.from_pandas(df)

dataset = DatasetDict({'train':trainData,
                      'test': testData,
                       'validation': testData}
)

dataset


DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 1520
    })
})

In [57]:
uniqueTags = set()

for tag in dataset['train']['tags']:
  uniqueTags.update(tag)

uniqueTags = [x[2:] for x in list(uniqueTags) if x!='O']
uniqueTags = list(set(uniqueTags))
uniqueTags

['Location',
 'Hours',
 'Price',
 'Restaurant_Name',
 'Dish',
 'Rating',
 'Cuisine',
 'Amenity']

In [58]:
tag2index = {"O":0}

for i, tag in enumerate(uniqueTags):
  tag2index[f'B-{tag}'] = len(tag2index)
  tag2index[f'I-{tag}'] = len(tag2index)

tag2index

{'O': 0,
 'B-Location': 1,
 'I-Location': 2,
 'B-Hours': 3,
 'I-Hours': 4,
 'B-Price': 5,
 'I-Price': 6,
 'B-Restaurant_Name': 7,
 'I-Restaurant_Name': 8,
 'B-Dish': 9,
 'I-Dish': 10,
 'B-Rating': 11,
 'I-Rating': 12,
 'B-Cuisine': 13,
 'I-Cuisine': 14,
 'B-Amenity': 15,
 'I-Amenity': 16}

In [59]:
index2tag = {v:k for k,v in tag2index.items()}
index2tag

{0: 'O',
 1: 'B-Location',
 2: 'I-Location',
 3: 'B-Hours',
 4: 'I-Hours',
 5: 'B-Price',
 6: 'I-Price',
 7: 'B-Restaurant_Name',
 8: 'I-Restaurant_Name',
 9: 'B-Dish',
 10: 'I-Dish',
 11: 'B-Rating',
 12: 'I-Rating',
 13: 'B-Cuisine',
 14: 'I-Cuisine',
 15: 'B-Amenity',
 16: 'I-Amenity'}

In [60]:
dataset = dataset.map(lambda x: {'nerTags': [tag2index[tag] for tag in x['tags']]})
dataset

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'nerTags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'tags', 'nerTags'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'tags', 'nerTags'],
        num_rows: 1520
    })
})

#Building model

In [61]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [62]:
model_cp = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_cp)

In [63]:
input = dataset['train'][0]['tokens']
output = tokenizer(input, is_split_into_words=True)
output

{'input_ids': [101, 123, 1838, 7724, 1114, 1656, 7659, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

##Aligning labels with tokens

In [64]:
def tokenizeAlign(data):
  tokenizedInputs = tokenizer(data['tokens'], truncation=True, is_split_into_words=True)

  labels = []

  for i , label in enumerate(data['nerTags']):
    wordIds = tokenizedInputs.word_ids(batch_index=i)

    prevWordId = None
    curLabelId = []
    for idx in wordIds:
      if idx is None:
        curLabelId.append(-100)
      elif idx != prevWordId:
        curLabelId.append(label[idx])
      else:
        curLabelId.append(-100)

      prevWordId = idx

    labels.append(curLabelId)

  tokenizedInputs['labels'] = labels

  return tokenizedInputs

In [65]:
tokenDataset = dataset.map(tokenizeAlign, batched=True)
tokenDataset

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'nerTags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'tags', 'nerTags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'tags', 'nerTags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1520
    })
})

In [66]:
tokenDataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'tags': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'nerTags': [11, 12, 0, 1, 2, 2],
 'input_ids': [101, 126, 2851, 1832, 4084, 5240, 1107, 1139, 1411, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 11, 12, 0, -100, -100, 1, 2, 2, -100]}

#Data collator and token merics

In [73]:
from transformers import DataCollatorForTokenClassification

dataCollator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [81]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")
label_names = list(tag2index)

def computeMetrics(evalPred):
  preds, labels = evalPred

  predictions = np.argmax(preds, axis=-1)
  true_labels = [[label_names[l] for l in label if l != -100]for label in labels]
  pred_labels = [[label_names[p] for p, l in zip(prediction, label) if l != -100]for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=pred_labels, references=true_labels)

  return{
      'precision': all_metrics['overall_precision'],
      'recall': all_metrics['overall_recall'],
      'f1': all_metrics['overall_f1'],
      'accuracy': all_metrics['overall_accuracy']
  }

#Training model

In [82]:
model = AutoModelForTokenClassification.from_pretrained(model_cp, num_labels=len(tag2index), id2label=index2tag, label2id=tag2index)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
args = TrainingArguments(
    output_dir='distilNER',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none'

)



In [84]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenDataset['train'],
    eval_dataset=tokenDataset['validation'],
    tokenizer=tokenizer,
    data_collator=dataCollator,
    compute_metrics=computeMetrics
)

  trainer = Trainer(


In [85]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5998,0.303574,0.738846,0.788571,0.762899,0.909882
2,0.2365,0.285903,0.771194,0.805714,0.788076,0.915497
3,0.1926,0.289064,0.778324,0.804762,0.791322,0.917602


TrainOutput(global_step=2874, training_loss=0.3010021095567888, metrics={'train_runtime': 208.7972, 'train_samples_per_second': 110.045, 'train_steps_per_second': 13.765, 'total_flos': 111669014361402.0, 'train_loss': 0.3010021095567888, 'epoch': 3.0})

In [86]:
trainer.save_model('distilNER')