In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
#!pip install datasets

In [3]:
df = pd.read_csv('../data/ner.csv')

In [4]:
df.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [5]:
df.labels = df.labels.str.replace('B-tim', 'O').str.replace('I-tim', 'O').str.replace('B-art','O').str.replace('I-art', 'O').str.replace('B-nat', 'O').str.replace('I-nat','O')

In [6]:
from datasets import Dataset
from transformers import BertTokenizerFast

# Use the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

texts = df['text'].tolist()  # Text sequences
labels_text = df['labels'].tolist()  # Corresponding labels
texts_split = [text.split() for text in texts]  # Tokenized text into words

# Create a set of unique labels and map them to indices
set_labels = set([label for label_seq in labels_text for label in label_seq.split()])
num_labels = len(set_labels)

dict_labels = {label: idx for idx, label in enumerate(set_labels)}  # Label -> index mapping

# Function to tokenize and align labels
def tokenize_and_align_labels(texts, labels, label_all_tokens=True):
    tokenized_inputs = tokenizer(texts, max_length=128, is_split_into_words=True, truncation=True, padding='max_length')
    word_ids = tokenized_inputs.word_ids()  # Get the word indices
    previous_word_idx = None
    label_ids = []

    # Align the labels with the tokenized inputs
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(0)  # Ignore subword tokens
        elif word_idx != previous_word_idx:
            label_ids.append(dict_labels[labels[word_idx]])  # Assign label to the first wordpiece
        else:
            label_ids.append(dict_labels[labels[word_idx]] if label_all_tokens else 0)  # Option to propagate label to subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Initialize lists to collect tokenized inputs
input_ids = []
attention_mask = []
labels = []

# Tokenize each sample and align its labels
for i in range(len(texts_split)):
    out = tokenize_and_align_labels(texts_split[i], labels_text[i].split())
    input_ids.append(out['input_ids'])
    attention_mask.append(out['attention_mask'])
    labels.append(out['labels'])

# Convert the merged tokenized inputs into a Dataset object
dataset_dict = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': labels
}
dataset = Dataset.from_dict(dataset_dict)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)

In [9]:
!pip install --upgrade transformers accelerate
import mlflow
import torch
from transformers import BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=GK5JXGH63PID8Q1HJM6LG0TDO32KEP
%env CLEARML_API_SECRET_KEY=CpyJCNKsOPWU13ypw_uIk_f84U5Lax8ntgeHbm16s61dpNA1LIoUNfCFSKW6hz_VGm4
# End the previous run
mlflow.end_run()

# Load pre-trained BERT model with classification head
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize the data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,              # Pass the tokenizer
    padding=True,                     # Enable dynamic padding
    max_length=128,                   # Set max length
    label_pad_token_id=0          # Use -100 to ignore padding labels
)



# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',               # Output directory for model predictions and checkpoints
    num_train_epochs=3,                   # Total number of training epochs
    per_device_train_batch_size=16,       # Batch size per device during training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    evaluation_strategy="epoch",          # Evaluate every epoch
    save_steps=10_000,                    # Save checkpoint every 10,000 steps
    save_total_limit=2,                   # Limit the total number of checkpoints
    logging_dir='./logs',                 # Directory for storing logs
    learning_rate=2e-5,                   # Learning rate
    weight_decay=0.01,
    disable_tqdm=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # The model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator          # Data collator for padding and aligning
)

# Train the model
trainer.train()
trainer.save_model(output_dir='../models')  # Save model to specified directory
tokenizer.save_pretrained('../models')


env: CLEARML_WEB_HOST=https://app.clear.ml/

env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=GK5JXGH63PID8Q1HJM6LG0TDO32KEP
env: CLEARML_API_SECRET_KEY=CpyJCNKsOPWU13ypw_uIk_f84U5Lax8ntgeHbm16s61dpNA1LIoUNfCFSKW6hz_VGm4


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead






`max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.



AttributeError: 'AdamW' object has no attribute 'train'

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Function to compute evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Flatten the predictions and labels for calculating metrics
    pred_flat = preds.flatten()
    labels_flat = labels.flatten()

    # Only consider non-padding labels
    non_pad_indices = labels_flat != -100
    pred_flat = pred_flat[non_pad_indices]
    labels_flat = labels_flat[non_pad_indices]

    # Calculate accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(labels_flat, pred_flat)
    precision, recall, f1, _ = precision_recall_fscore_support(labels_flat, pred_flat, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Add the compute_metrics function to the Trainer
trainer.compute_metrics = compute_metrics

# Run evaluation
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation results:", eval_results)

# Optionally, log evaluation results to MLflow
with mlflow.start_run():
    mlflow.log_params(training_args.to_dict())
    mlflow.log_metrics(eval_results)
