In [1]:
import logging

import pandas as pd
from datasets import Dataset, Features, Sequence, Value
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Logger used for this notebook's progress messages.
logger = logging.getLogger(__name__)

# Configure the root logger so INFO-level records (and above) are emitted.
logging.basicConfig(level=logging.INFO)


In [3]:
# Toy corpus for the token-classification demo: each entry pairs a sentence
# with its space-separated tag string.
# NOTE(review): several tag strings hold more tags than the sentence has
# whitespace-separated words (e.g. "Another sample text" carries four tags).
# A later cell keeps only rows where both counts match, so most of these
# samples never reach training -- confirm the intended tags.
_SAMPLE_PAIRS = [
    ("This is a sample text", "O O O O B-label"),
    ("Another sample text", "O O B-label O"),
    ("A third sample text", "B-label O O O"),
    ("Text for the fourth example", "O O O O O O"),
    ("Fifth example text for testing", "O O O O O O"),
    ("Sixth sample text data", "O O O O"),
    ("Seventh example text", "O O O O"),
    ("Eighth sample data text", "O O O O"),
    ("Ninth text example for testing", "O O O O O O"),
    ("Tenth sample text data example", "O O O O O O O"),
]

data = {
    'text': [sentence for sentence, _tags in _SAMPLE_PAIRS],
    'labels': [tags for _sentence, tags in _SAMPLE_PAIRS],
}

In [4]:
# Raw frame: one row per sample, with string columns 'text' and 'labels'.
df = pd.DataFrame(data=data)

In [5]:
# Show the raw frame so the text/label pairs can be inspected.
df

Unnamed: 0,text,labels
0,This is a sample text,O O O O B-label
1,Another sample text,O O B-label O
2,A third sample text,B-label O O O
3,Text for the fourth example,O O O O O O
4,Fifth example text for testing,O O O O O O
5,Sixth sample text data,O O O O
6,Seventh example text,O O O O
7,Eighth sample data text,O O O O
8,Ninth text example for testing,O O O O O O
9,Tenth sample text data example,O O O O O O O


In [6]:
# Pull the sentence column out of the frame as a plain Python list.
texts = df['text'].tolist()
print(texts)

['This is a sample text', 'Another sample text', 'A third sample text', 'Text for the fourth example', 'Fifth example text for testing', 'Sixth sample text data', 'Seventh example text', 'Eighth sample data text', 'Ninth text example for testing', 'Tenth sample text data example']


In [48]:
# Pull the tag-string column out of the frame as a plain Python list.
labels = df['labels'].tolist()
print(labels)

['O O O O B-label', 'O O B-label O', 'B-label O O O', 'O O O O O O', 'O O O O O O', 'O O O O', 'O O O O', 'O O O O', 'O O O O O O', 'O O O O O O O']


In [49]:
# Whitespace-split every sentence into its list of words.
tokenized_texts = [sentence.split() for sentence in texts]
print(tokenized_texts)

[['This', 'is', 'a', 'sample', 'text'], ['Another', 'sample', 'text'], ['A', 'third', 'sample', 'text'], ['Text', 'for', 'the', 'fourth', 'example'], ['Fifth', 'example', 'text', 'for', 'testing'], ['Sixth', 'sample', 'text', 'data'], ['Seventh', 'example', 'text'], ['Eighth', 'sample', 'data', 'text'], ['Ninth', 'text', 'example', 'for', 'testing'], ['Tenth', 'sample', 'text', 'data', 'example']]


In [50]:
# Whitespace-split every tag string into its list of tags.
tokenized_labels = [tag_string.split() for tag_string in labels]
print("Tokenized Labels:", tokenized_labels)


Tokenized Labels: [['O', 'O', 'O', 'O', 'B-label'], ['O', 'O', 'B-label', 'O'], ['B-label', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [51]:
# Collect the distinct tags in a reproducible order: "O" first, then the
# remaining tags alphabetically. Iterating a bare set of strings follows hash
# order, which can differ between kernel restarts and would silently renumber
# the label ids derived from this list.
unique_labels = sorted(
    {tag for tag_list in tokenized_labels for tag in tag_list},
    key=lambda tag: (tag != "O", tag),
)
print("Unique Labels:", unique_labels)


Unique Labels: ['O', 'B-label']


In [52]:
# Give each tag its position in unique_labels as an integer id.
label_to_index = dict(zip(unique_labels, range(len(unique_labels))))
print("Label to Index:", label_to_index)


Label to Index: {'O': 0, 'B-label': 1}


In [53]:
# Inverse lookup: integer id -> tag string.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))
print("Index to Label:", index_to_label)


Index to Label: {0: 'O', 1: 'B-label'}


In [54]:
# Replace every tag with its integer id, keeping one inner list per sentence.
indexed_labels = [
    [label_to_index[tag] for tag in tag_list]
    for tag_list in tokenized_labels
]
print("Indexed Labels:", indexed_labels)
print("Length of indexed_labels:", len(indexed_labels))


Indexed Labels: [[0, 0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]
Length of indexed_labels: 10


In [55]:
# Sanity check: both lists should contain one entry per sentence. This only
# compares the number of sentences, not the per-sentence word/tag counts.
print("Length of tokenized_texts:", len(tokenized_texts))
print("Length of indexed_labels:", len(indexed_labels))

Length of tokenized_texts: 10
Length of indexed_labels: 10


In [58]:
# Keep only the rows whose word count equals their tag count; a mismatched row
# cannot be aligned word-by-word with its labels.
valid_texts = []
valid_labels = []
for words, tag_ids in zip(tokenized_texts, indexed_labels):
    if len(words) == len(tag_ids):
        valid_texts.append(words)
        valid_labels.append(tag_ids)

# Surface the data loss instead of discarding rows silently.
dropped_rows = len(tokenized_texts) - len(valid_texts)
if dropped_rows:
    logger.warning(
        "Dropped %d of %d rows whose word count does not match their tag count.",
        dropped_rows,
        len(tokenized_texts),
    )

print("Valid Texts:", valid_texts)
print("Valid Labels:", valid_labels)


Valid Texts: [['This', 'is', 'a', 'sample', 'text'], ['A', 'third', 'sample', 'text'], ['Sixth', 'sample', 'text', 'data'], ['Eighth', 'sample', 'data', 'text']]
Valid Labels: [[0, 0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]


In [59]:
# Re-join the words into sentences and make sure every tag id is a plain int.
flat_texts = [" ".join(words) for words in valid_texts]
flat_labels = [[int(tag_id) for tag_id in tag_ids] for tag_ids in valid_labels]

# Build the frame directly instead of rebinding `data`: overwriting the raw
# sample dict would make the earlier `df = pd.DataFrame(data)` cell yield the
# cleaned rows if it were ever re-run.
df_clean = pd.DataFrame({"text": flat_texts, "labels": flat_labels})

print("Clean DataFrame:\n", df_clean)


Clean DataFrame:
                       text           labels
0    This is a sample text  [0, 0, 0, 0, 1]
1      A third sample text     [1, 0, 0, 0]
2   Sixth sample text data     [0, 0, 0, 0]
3  Eighth sample data text     [0, 0, 0, 0]


In [62]:
# Fixed sequence length: the tokenizer calls below pad/truncate to this size.
max_length = 128

# Pretrained checkpoint used for both the tokenizer and the model.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [63]:
# Explicit schema: a string column plus a variable-length list of int64 tag ids.
features = Features(
    {
        'text': Value('string'),
        'labels': Sequence(Value('int64')),
    }
)

# Convert the cleaned frame into a Hugging Face Dataset with that schema.
dataset = Dataset.from_pandas(df_clean, features=features)

print("Dataset:\n", dataset)


Dataset:
 Dataset({
    features: ['text', 'labels'],
    num_rows: 4
})


In [64]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch to a fixed length.

    Uses the module-level ``tokenizer`` with truncation enabled and padding to
    the module-level ``max_length``; the tokenizer's output is returned as-is.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

# Batched map: tokenize_function receives lists of sentences, not single rows.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

print("Tokenized Dataset:\n", tokenized_datasets)


Map: 100%|██████████| 4/4 [00:00<00:00, 546.26 examples/s]

Tokenized Dataset:
 Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4
})





In [65]:
def align_labels_with_tokens(examples, label_all_tokens=True):
    """Tokenize a batch and expand its word-level labels to token level.

    Args:
        examples: Batch dict with ``'text'`` (list of sentences) and
            ``'labels'`` (one list of integer tag ids per sentence, one id per
            whitespace-separated word). The input lists are not modified.
        label_all_tokens: When True (the default, matching the previous
            behaviour) every sub-word piece inherits its word's label. When
            False only the first piece of each word is labelled and the
            remaining pieces get -100.

    Returns:
        The tokenizer output with a ``"labels"`` entry added: one list per
        sentence, aligned with the tokens. Special/padding tokens and words
        without a label are set to -100, the index the token-classification
        loss ignores.
    """
    # Split on whitespace exactly as the labels were produced, so that
    # word_ids() indexes the label lists directly. Letting the tokenizer split
    # the raw string would count punctuation marks as separate words and shift
    # every following label.
    words_per_example = [text.split() for text in examples['text']]
    tokenized_inputs = tokenizer(
        words_per_example,
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    labels = []
    for i, word_labels in enumerate(examples['labels']):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None or word_idx >= len(word_labels):
                # Special/padding token, or a word that has no label.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or every piece when label_all_tokens.
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation piece that should not contribute to the loss.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map from the word-level `dataset`, not from `tokenized_datasets`: the function
# replaces the "labels" column with token-level labels, so feeding its own
# output back in on a re-run would treat those as word-level labels.
tokenized_datasets = dataset.map(align_labels_with_tokens, batched=True)

print("Aligned Tokenized Dataset:\n", tokenized_datasets)


Map: 100%|██████████| 4/4 [00:00<00:00, 729.79 examples/s]

Aligned Tokenized Dataset:
 Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4
})





In [66]:
# Hold out 20% of the rows for validation. The fixed seed keeps the split
# identical across kernel restarts; without it the rows are shuffled
# differently on every run.
train_test_data = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_data = train_test_data['train']
validation_data = train_test_data['test']

print("Training Data:\n", train_data)
print("Validation Data:\n", validation_data)


Training Data:
 Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3
})
Validation Data:
 Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1
})


In [68]:
# Seed the random number generators before the model is built so the newly
# initialised classification head is the same on every run.
set_seed(42)

# Store the tag <-> id mappings in the model config so that predictions can be
# decoded back to tag strings.
model = BertForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=index_to_label,
    label2id=label_to_index,
)

training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
)

print("Trainer Initialized")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-07-21 20:03:44.535782: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-21 20:03:44.544293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 20:03:44.552693: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been register

Trainer Initialized




In [69]:
# Run the training loop; the training arguments request an evaluation pass at
# the end of every epoch.
# NOTE(review): the saved output below reports 3 epochs while the visible
# TrainingArguments cell sets num_train_epochs=1 -- the output looks stale;
# restart the kernel and re-run to refresh it.
trainer.train()
logger.info("Training completed.")


                                             
 33%|███▎      | 1/3 [00:01<00:02,  1.30s/it]

{'eval_loss': 0.46284523606300354, 'eval_runtime': 0.1079, 'eval_samples_per_second': 9.266, 'eval_steps_per_second': 9.266, 'epoch': 1.0}


                                             
 67%|██████▋   | 2/3 [00:02<00:01,  1.44s/it]

{'eval_loss': 0.351688951253891, 'eval_runtime': 0.1213, 'eval_samples_per_second': 8.246, 'eval_steps_per_second': 8.246, 'epoch': 2.0}


                                             
100%|██████████| 3/3 [00:05<00:00,  1.84s/it]
INFO:__main__:Training completed.


{'eval_loss': 0.3041325509548187, 'eval_runtime': 0.1362, 'eval_samples_per_second': 7.342, 'eval_steps_per_second': 7.342, 'epoch': 3.0}
{'train_runtime': 5.524, 'train_samples_per_second': 1.629, 'train_steps_per_second': 0.543, 'train_loss': 0.5036056439081827, 'epoch': 3.0}
