In [1]:
from datasets import Dataset 
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments
from transformers import AutoTokenizer
import torch
import pandas as pd
import evaluate
# Read the reduced corpus, then keep only its first 200 rows for quick experiments.
full_data = pd.read_csv(r"smaller_datasets/full_data_small.csv")
sample = full_data.iloc[:200]

In [2]:
def expand_dataset(df):
    """Explode multi-annotation rows into one row per annotated token.

    ``LABEL`` and ``LOCATION`` hold ``|``-separated values; the i-th location
    is paired with the i-th label.  Each location is an index into the
    whitespace-split tokens of ``TEXT``.

    Args:
        df: DataFrame with ``TEXT``, ``LABEL`` and ``LOCATION`` columns.

    Returns:
        DataFrame with columns ``TEXT``, ``LOCATION``, ``ABBREV``, ``LABEL``.
        ``LOCATION`` is kept in string form and ``ABBREV`` is the token found
        at that index.

    Skipped silently: rows with a missing ``TEXT``/``LABEL``/``LOCATION``
    value, locations that are not valid integers, and locations outside the
    token range.
    """
    rows = []  # collected (text, location, token, label) tuples
    for text, label_field, location_field in zip(df['TEXT'], df['LABEL'], df['LOCATION']):
        # Missing values (NaN/None) have no .split(); skip such rows instead
        # of failing with AttributeError.
        if pd.isna(text) or pd.isna(label_field) or pd.isna(location_field):
            continue
        # str() lets numeric columns (e.g. a single integer location) through.
        tokens = str(text).split()
        labels = str(label_field).split("|")
        locations = str(location_field).split("|")
        for loc, label in zip(locations, labels):
            try:
                idx = int(loc)
            except ValueError:
                # Skip if location is not a valid integer
                continue
            if 0 <= idx < len(tokens):
                rows.append((text, loc, tokens[idx], label))
    return pd.DataFrame(rows, columns=['TEXT', 'LOCATION', 'ABBREV', 'LABEL'])

# Replace the 200-row slice with its one-row-per-annotation expansion.
sample= expand_dataset(sample)

In [3]:
# Display the column names of the expanded frame.
sample.columns

Index(['TEXT', 'LOCATION', 'ABBREV', 'LABEL'], dtype='object')

In [4]:
# Label mapper: give every distinct label an integer id, numbered in the
# order in which unique() returns them.
distinct_labels = sample['LABEL'].unique()
label2id = dict(zip(distinct_labels, range(len(distinct_labels))))
sample["LABEL_ID"] = sample["LABEL"].map(label2id)

In [5]:
# insert entity markers 
def insert_entity_markers(row):
    """Wrap the target token of ``row["TEXT"]`` in ``[E1]`` / ``[/E1]`` markers.

    Args:
        row: mapping with ``TEXT``, ``LOCATION`` (token index, anything
            ``int()`` accepts) and ``LABEL_ID``.

    Returns:
        The same ``row`` object with two keys added: ``marked_text`` (the
        space-joined tokens with the markers inserted) and ``labels`` (a copy
        of ``LABEL_ID``).
    """
    loc = int(row["LOCATION"])
    # Split on whitespace runs, exactly as expand_dataset does when it resolves
    # LOCATION to a token.  Splitting on a single " " yields empty tokens for
    # consecutive spaces and would shift the index onto the wrong word.
    tokens = row["TEXT"].split()
    marked_tokens = (
        tokens[:loc] + ["[E1]"] + tokens[loc:loc + 1] + ["[/E1]"] + tokens[loc + 1:]
    )
    row["marked_text"] = " ".join(marked_tokens)
    row["labels"] = row["LABEL_ID"]
    return row

# Convert the frame to a Dataset and add the marker/label columns example by example.
dataset = Dataset.from_pandas(sample).map(insert_entity_markers)


Map:   0%|          | 0/722 [00:00<?, ? examples/s]

In [6]:
# Name of the pretrained checkpoint; reused for both the tokenizer and the models below.
checkpoint="bert-base-uncased"

In [7]:
# Load the checkpoint's tokenizer and register the two entity markers that
# insert_entity_markers writes into the text as additional special tokens.
# NOTE(review): presumably this keeps each marker as a single token instead of
# being split into word pieces — confirm with the tokenizer output.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens({"additional_special_tokens": ["[E1]", "[/E1]"]})
def tokenize_fn(row):
    """Run the notebook-level ``tokenizer`` over ``row["marked_text"]``.

    The tokenizer is asked to pad to, and truncate at, a length of 256.
    This notebook calls the function through ``dataset.map(..., batched=True)``,
    so ``row`` presumably holds a batch of examples rather than a single one.

    Returns:
        Whatever the tokenizer call returns.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(row["marked_text"], **encode_options)

# Tokenize in batches; the tokenizer's output fields become new dataset columns.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

In [8]:
# Display the dataset summary (feature names and row count).
tokenized_dataset

Dataset({
    features: ['TEXT', 'LOCATION', 'ABBREV', 'LABEL', 'LABEL_ID', 'marked_text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 722
})

In [9]:
# Print every field of the first example.  Row 0 is fetched once, rather than
# indexing each whole column only to read its first element.
first_example = tokenized_dataset[0]
for key in tokenized_dataset.features.keys():
    print(f"{key}: {first_example[key]}")


TEXT: alphabisabolol has a primary antipeptic action depending on dosage which is not caused by an alteration of the phvalue the proteolytic activity of pepsin is reduced by percent through addition of bisabolol in the ratio of the antipeptic action of bisabolol only occurs in case of direct contact in case of a previous contact with the ATP the inhibiting effect is lost
LOCATION: 56
ABBREV: ATP
LABEL: substrate
LABEL_ID: 0
marked_text: alphabisabolol has a primary antipeptic action depending on dosage which is not caused by an alteration of the phvalue the proteolytic activity of pepsin is reduced by percent through addition of bisabolol in the ratio of the antipeptic action of bisabolol only occurs in case of direct contact in case of a previous contact with the [E1] ATP [/E1] the inhibiting effect is lost
labels: 0
input_ids: [101, 6541, 18477, 7875, 12898, 2140, 2038, 1037, 3078, 3424, 5051, 20746, 2895, 5834, 2006, 9998, 4270, 2029, 2003, 2025, 3303, 2011, 2019, 26014, 1997, 1996,

In [10]:
# # Get the vocabulary
# # To see special tokens
# print("Special tokens:", tokenizer.special_tokens_map)
# print("\nAdded special tokens:", tokenizer.additional_special_tokens)

# # To see vocab size
# print("\nVocabulary size:", len(tokenizer))

# # To see how the tokenizer handles our entity markers
# example = "This is an [E1] example [/E1] sentence."
# encoded = tokenizer(example)
# print("\nEncoded:", encoded)
# print("\nDecoded:", tokenizer.decode(encoded["input_ids"]))

In [11]:
# # Get the vocabulary
# # To see special tokens
# print("Special tokens:", tokenizer.special_tokens_map)
# print("\nAdded special tokens:", tokenizer.additional_special_tokens)

# # To see vocab size
# print("\nVocabulary size:", len(tokenizer))

# # To see how the tokenizer handles our entity markers
# example = "[E1] [/E1]"
# encoded = tokenizer(example)
# print("\nEncoded:", encoded)
# print("\nDecoded:", tokenizer.decode(encoded["input_ids"]))

In [12]:
# Tokenizer size after adding the entity markers (base vocabulary plus the two new tokens).
len(tokenizer)

30524

In [14]:
# NOTE(review): this references the bound method without calling it, so the
# cell only displays the method/tokenizer repr — looks like leftover inspection.
tokenizer.get_special_tokens_mask

<bound method PreTrainedTokenizerBase.get_special_tokens_mask of BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[E1]', '[/E1]']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30522: AddedToke

In [16]:
from transformers import AutoModel
# Load the checkpoint through AutoModel and resize its token embeddings to
# len(tokenizer), which includes the added marker tokens.
# NOTE(review): `model` is reassigned further down in this notebook by
# BertForSequenceClassification.from_pretrained, so this instance appears to
# be unused for training — confirm and consider dropping this cell.
model=AutoModel.from_pretrained(checkpoint,label2id=label2id)
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(30524, 768, padding_idx=0)

In [17]:
import evaluate

# Accuracy metric object; compute_metrics below calls its compute() method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` must support
            ``.argmax(axis=-1)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class ids against
        ``labels``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = scores.argmax(axis=-1)
    result = metric.compute(predictions=predicted, references=gold)
    return result

In [20]:
from transformers import Trainer, BertForSequenceClassification

# Replace the base BERT model with a classification-specific one.
# The label count and both label<->id mappings come from label2id.
model = BertForSequenceClassification.from_pretrained(
    checkpoint, 
    num_labels=len(label2id),
    id2label={v: k for k, v in label2id.items()},
    label2id=label2id
)
# Resize the embeddings to len(tokenizer), which includes the added marker tokens.
model.resize_token_embeddings(len(tokenizer))

# NOTE(review): do_eval=False and no eval_dataset is passed to the Trainer, so
# compute_metrics presumably never runs — add a held-out split if accuracy is wanted.
training_args = TrainingArguments(
    output_dir="./label_classification",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=5,
    save_steps=50,
    learning_rate=5e-5, 
    weight_decay=0.01,
    do_eval=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer and triggers a warning on this
    # installation; `processing_class` is the replacement argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
5,5.6171
10,5.6204
15,5.8484
20,5.8104


KeyboardInterrupt: 