<a href="https://colab.research.google.com/github/pelinbalci/LLM_Notebooks/blob/main/LLM_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install torch
!pip install transformers[torch]
!pip install accelerate
!pip install datasets
! pip install -U accelerate
! pip install -U transformers

Collecting transformers
  Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.28.0
    Uninstalling transformers-4.28.0:
      Successfully uninstalled transformers-4.28.0
Successfully installed transformers-4.31.0


In [2]:
import pandas as pd
from tqdm import tqdm
import torch
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, TensorDataset
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig


# Load Data from Excel

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

dashline = '-'*30

# Load the Excel file into a DataFrame
df = pd.read_excel('/content/nerdata.xlsx')


In [5]:
# Create a dict for dataset
raw_data_dict = {}
for idx in list(set(df.Sentence_ID.values)):
    sentence = df[df.Sentence_ID == idx]
    raw_data_dict[idx] = {}
    raw_data_dict[idx]['words'] = list(sentence.Words.values)
    raw_data_dict[idx]['original_labels'] = list(sentence.Labels.values)
    raw_data_dict[idx]['ner_tags'] = list(sentence.ner_tags.values)
print('raw_data: ', raw_data_dict)
print(dashline)

raw_data:  {1: {'words': ['I', 'Love', 'you'], 'original_labels': ['B-Sub', 'B-Verb', 'B-Obj'], 'ner_tags': [1, 3, 5]}, 2: {'words': ['You ', 'and', 'Me', 'are ', 'going ', 'to ', 'the ', 'mall', 'today'], 'original_labels': ['B-Sub', 'O', 'I-Sub', 'B-Verb', 'I-Verb', 'O', 'O', 'B-Obj', 'I-Obj'], 'ner_tags': [1, 0, 2, 3, 4, 0, 0, 5, 6]}, 3: {'words': ['When', 'what', 'how', 'are ', 'some ', 'of', 'the ', 'question', 'words', 'in ', 'English'], 'original_labels': ['B-Sub', 'I-Sub', 'I-Sub', 'B-Verb', 'O', 'O', 'O', 'B-Obj', 'I-Obj', 'O', 'I-Obj'], 'ner_tags': [1, 2, 2, 3, 0, 0, 0, 5, 6, 0, 6]}, 4: {'words': ['Jane', 'and', 'I ', 'will', 'go ', 'to ', 'the ', 'cinema', 'today'], 'original_labels': ['B-Sub', 'O', 'I-Sub', 'B-Verb', 'I-Verb', 'O', 'O', 'B-Obj', 'I-Obj'], 'ner_tags': [1, 0, 2, 3, 4, 0, 0, 5, 6]}, 5: {'words': ['Here', 'is ', 'a ', 'new', 'thought', 'I ', 'do ', 'not', 'like ', 'to ', 'learn', 'spanish'], 'original_labels': ['O', 'B-Verb', 'O', 'B-Obj', 'I-Obj', 'B-Sub', 'B-

In [6]:
from datasets import Dataset, DatasetDict
# Convert raw_data to a list of dictionaries
data_list = []
for idx, data in raw_data_dict.items():
    data_list.append({
        'id': idx,
        'words': data['words'],
        'ner_tags': data['ner_tags'],
        'pos_tags': [],  # Placeholder, as your data doesn't have pos_tags
        'chunk_tags': []  # Placeholder, as your data doesn't have chunk_tags
    })

In [7]:
# Convert the list to a Hugging Face Dataset
train_dataset = Dataset.from_dict({k: [d[k] for d in data_list] for k in data_list[0]})

# Create a DatasetDict
raw_data = DatasetDict({"train": train_dataset})
print("DatasetDict: ", raw_data)
print(dashline)

DatasetDict:  DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags', 'pos_tags', 'chunk_tags'],
        num_rows: 6
    })
})
------------------------------


#Get labels

In [8]:
# Get labels
label_ids = list(set(df.Labels))
label2id = {label: id for id, label in enumerate(label_ids)}
id2label = {id: label for label, id in label2id.items()}

# Tokenization

In [9]:
# Tokenize a word - Example
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
inputs = tokenizer(raw_data["train"][0]["words"], is_split_into_words=True)
print(inputs.tokens())   # ['[CLS]', 'I', 'Love', 'you', '[SEP]']
print(inputs.word_ids())  # [None, 0, 1, 2, None]
print(dashline)


['[CLS]', 'I', 'Love', 'you', '[SEP]']
[None, 0, 1, 2, None]
------------------------------


In [10]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, padding=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs



In [11]:
# Example:

ner_tags = raw_data["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print("Example")
print("labels:", ner_tags)
print("new_labels:", align_labels_with_tokens(ner_tags, word_ids))
print(dashline)


Example
labels: [1, 3, 5]
new_labels: [-100, 1, 3, 5, -100]
------------------------------


In [12]:
# Apply to all
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_data["train"].column_names,
)

print(tokenized_datasets)
print(dashline)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6
    })
})
------------------------------


#Data Collector

In [13]:
# Data Collector
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print("Data collector example")
print(batch["labels"])
print(dashline)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Data collector example
tensor([[-100,    1,    3,    5, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    1,    0,    2,    3,    4,    0,    0,    5,    6, -100, -100,
         -100, -100, -100]])
------------------------------


# Train

In [14]:
# TRain

from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
    num_labels=len(label_ids),
    id2label=id2label,
    label2id=label2id,
)

from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.001,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    # train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("./models_2345")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


# Pipeline

In [15]:
from transformers import pipeline
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained("./models_2345")

# Single prediction
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")
single_test_text = ["I", "think", "you"]
print(nlp(single_test_text))

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


[[{'entity_group': 'Verb', 'score': 0.24876964, 'word': 'I', 'start': 0, 'end': 1}], [{'entity_group': 'Sub', 'score': 0.27963677, 'word': 'think', 'start': 0, 'end': 5}], [{'entity_group': 'Verb', 'score': 0.24145265, 'word': 'you', 'start': 0, 'end': 3}]]
