In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
#!pip install datasets

In [3]:
df = pd.read_csv('../data/ner.csv')

In [4]:
df.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [5]:
df.labels = df.labels.str.replace('B-tim', 'O').str.replace('I-tim', 'O').str.replace('B-art','O').str.replace('I-art', 'O').str.replace('B-nat', 'O').str.replace('I-nat','O')

In [6]:
from datasets import Dataset
from transformers import BertTokenizerFast

# Use the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

texts = df['text'].tolist()  # Text sequences
labels_text = df['labels'].tolist()  # Corresponding labels
texts_split = [text.split() for text in texts]  # Tokenized text into words

# Create a set of unique labels and map them to indices
set_labels = set([label for label_seq in labels_text for label in label_seq.split()])
num_labels = len(set_labels)

dict_labels = {label: idx for idx, label in enumerate(set_labels)}  # Label -> index mapping

# Function to tokenize and align labels
def tokenize_and_align_labels(texts, labels, label_all_tokens=True):
    tokenized_inputs = tokenizer(texts, max_length=128, is_split_into_words=True, truncation=True, padding=True)
    word_ids = tokenized_inputs.word_ids()  # Get the word indices
    previous_word_idx = None
    label_ids = []

    # Align the labels with the tokenized inputs
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)  # Ignore subword tokens
        elif word_idx != previous_word_idx:
            label_ids.append(dict_labels[labels[word_idx]])  # Assign label to the first wordpiece
        else:
            label_ids.append(dict_labels[labels[word_idx]] if label_all_tokens else -100)  # Option to propagate label to subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Initialize lists to collect tokenized inputs
input_ids = []
attention_mask = []
labels = []

# Tokenize each sample and align its labels
for i in range(len(texts_split)):
    out = tokenize_and_align_labels(texts_split[i], labels_text[i].split())
    input_ids.append(out['input_ids'])
    attention_mask.append(out['attention_mask'])
    labels.append(out['labels'])

# Convert the merged tokenized inputs into a Dataset object
dataset_dict = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': labels
}
dataset = Dataset.from_dict(dataset_dict)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)



In [7]:
import mlflow
import torch
from transformers import BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
# %env CLEARML_WEB_HOST=...
# %env CLEARML_API_HOST=...
# %env CLEARML_FILES_HOST=...
# %env CLEARML_API_ACCESS_KEY=...
# %env CLEARML_API_SECRET_KEY=...
# End the previous run
mlflow.end_run()

# Load pre-trained BERT model with classification head
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize the data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,              # Pass the tokenizer
    padding=True,                     # Enable dynamic padding
    max_length=128,                   # Set max length
    label_pad_token_id=-100           # Use -100 to ignore padding labels
)



# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',               # Output directory for model predictions and checkpoints
    num_train_epochs=3,                   # Total number of training epochs
    per_device_train_batch_size=16,       # Batch size per device during training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    evaluation_strategy="epoch",          # Evaluate every epoch
    save_steps=10_000,                    # Save checkpoint every 10,000 steps
    save_total_limit=2,                   # Limit the total number of checkpoints
    logging_dir='./logs',                 # Directory for storing logs
    learning_rate=2e-5,                   # Learning rate
    weight_decay=0.01,
    disable_tqdm=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # The model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator          # Data collator for padding and aligning
)

# Train the model
trainer.train()
trainer.save_model(output_dir='../models')  # Save model to specified directory
tokenizer.save_pretrained('../models')

env: CLEARML_WEB_HOST=https://app.clear.ml/
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=GI4KIABJ5UAZCH598L1KWO133ME9UC
env: CLEARML_API_SECRET_KEY=qteuth7eApbpYZZB84HVzybehABY0b1JOH7sOu2YkDhfOMcinXIE3KhO1fXg74tyBWA


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


ClearML Task: overwriting (reusing) task id=789a4a509ef04c96918d822e8aa8f550
2024-10-05 16:56:47,129 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/a46d6e6b310241b5b88cffd808d75916/experiments/789a4a509ef04c96918d822e8aa8f550/output/log


Unsupported key of type '<class 'int'>' found when connecting dictionary. It will be converted to str

`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To p

{'loss': 0.225, 'learning_rate': 1.8609952738393106e-05, 'epoch': 0.21}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.1248, 'learning_rate': 1.721990547678621e-05, 'epoch': 0.42}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.1217, 'learning_rate': 1.5829858215179316e-05, 'epoch': 0.63}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.1143, 'learning_rate': 1.4439810953572422e-05, 'epoch': 0.83}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'eval_loss': 0.10268909484148026, 'eval_runtime': 44.7016, 'eval_samples_per_second': 214.579, 'eval_steps_per_second': 13.422, 'epoch': 1.0}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.1078, 'learning_rate': 1.3049763691965527e-05, 'epoch': 1.04}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0925, 'learning_rate': 1.1659716430358635e-05, 'epoch': 1.25}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0912, 'learning_rate': 1.026966916875174e-05, 'epoch': 1.46}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0836, 'learning_rate': 8.879621907144844e-06, 'epoch': 1.67}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0807, 'learning_rate': 7.4895746455379494e-06, 'epoch': 1.88}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'eval_loss': 0.09911461174488068, 'eval_runtime': 43.276, 'eval_samples_per_second': 221.647, 'eval_steps_per_second': 13.865, 'epoch': 2.0}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0779, 'learning_rate': 6.099527383931054e-06, 'epoch': 2.09}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0687, 'learning_rate': 4.70948012232416e-06, 'epoch': 2.29}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0665, 'learning_rate': 3.319432860717265e-06, 'epoch': 2.5}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0644, 'learning_rate': 1.9293855991103697e-06, 'epoch': 2.71}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'loss': 0.0638, 'learning_rate': 5.393383375034752e-07, 'epoch': 2.92}



`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.


`max_length` is ignored when `padding`=`True` and ther

{'eval_loss': 0.10330181568861008, 'eval_runtime': 43.227, 'eval_samples_per_second': 221.898, 'eval_steps_per_second': 13.88, 'epoch': 3.0}
{'train_runtime': 2044.0043, 'train_samples_per_second': 56.312, 'train_steps_per_second': 3.52, 'train_loss': 0.09788898272351287, 'epoch': 3.0}


('../models\\tokenizer_config.json',
 '../models\\special_tokens_map.json',
 '../models\\vocab.txt',
 '../models\\added_tokens.json',
 '../models\\tokenizer.json')