In [None]:
from datasets import Dataset 
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments
from transformers import AutoTokenizer
import torch
import pandas as pd
import evaluate


In [None]:
from datasets import load_dataset
# Load the MeDAL subset through `load_dataset`.
dataset = load_dataset("lutful2004/MeDAL-small-data")

# Convert the 'train' split into a pandas DataFrame.
# Local-file alternative: full_data = pd.read_csv(r"smaller_datasets/full_data_small.csv")
full_data = dataset["train"].to_pandas()

# Keep only the first 200 rows for this experiment.
sample = full_data.iloc[:200]

In [None]:
def expand_dataset(df):
    """Explode a frame with pipe-separated annotations into one row per annotation.

    Each input row carries a ``TEXT`` string plus ``LABEL`` and ``LOCATION``
    strings whose entries are separated by ``"|"``. Locations are indices into
    the whitespace-split tokens of ``TEXT``.

    Args:
        df: DataFrame with ``TEXT``, ``LABEL`` and ``LOCATION`` columns.

    Returns:
        A new DataFrame with columns ``TEXT``, ``LOCATION``, ``ABBREV`` and
        ``LABEL``. ``LOCATION`` keeps the original (string) entry and
        ``ABBREV`` is the token found at that index. Entries whose location
        is not an integer, or falls outside the token list, are dropped.
    """
    expanded = []
    for _, record in df.iterrows():
        sentence = record['TEXT']
        senses = record['LABEL'].split("|")
        positions = record['LOCATION'].split("|")
        words = sentence.split()
        word_count = len(words)
        # zip() pairs each location with its label; surplus entries on either
        # side are ignored.
        for position, sense in zip(positions, senses):
            try:
                word_idx = int(position)
            except ValueError:
                # Location is not a valid integer: skip this annotation.
                continue
            if word_idx < 0 or word_idx >= word_count:
                # Location does not point at an existing token: skip it.
                continue
            expanded.append((sentence, position, words[word_idx], sense))
    return pd.DataFrame(expanded, columns=['TEXT', 'LOCATION', 'ABBREV', 'LABEL'])

# Rebind `sample` to its expanded form: one row per (LOCATION, LABEL) pair.
sample= expand_dataset(sample)

In [None]:
# Display the column index of `sample`.
sample.columns

In [None]:
# Label mapper: give every distinct LABEL an integer id, numbered in the
# order in which `unique()` yields the labels.
label2id = {}
for label_name in sample['LABEL'].unique():
    label2id.setdefault(label_name, len(label2id))

# Store each row's integer label id in a new LABEL_ID column.
sample["LABEL_ID"] = sample["LABEL"].map(label2id)

In [None]:
# insert entity markers 
def insert_entity_markers(row):
    """Wrap the target token of ``row["TEXT"]`` in ``[E1]`` / ``[/E1]`` markers.

    The text is tokenised with ``str.split()`` (splitting on any run of
    whitespace). This is the same tokenisation ``expand_dataset`` uses to
    validate ``LOCATION`` and to extract ``ABBREV``, so the markers always
    surround the token that was recorded as the abbreviation. Splitting on a
    single space instead would shift the index whenever the text contains
    consecutive spaces, tabs or newlines.

    Args:
        row: Mapping with ``"TEXT"`` (str), ``"LOCATION"`` (token index,
            anything ``int()`` accepts) and ``"LABEL_ID"``.

    Returns:
        The same ``row`` object, updated in place with:
        ``"marked_text"`` - the tokens re-joined with single spaces and the
        markers inserted around the token at ``LOCATION``;
        ``"labels"`` - a copy of ``row["LABEL_ID"]``.
    """
    loc = int(row["LOCATION"])
    tokens = row["TEXT"].split()
    # Slicing (rather than indexing) keeps an out-of-range location from raising.
    marked_tokens = (
        tokens[:loc]
        + ["[E1]"]
        + tokens[loc:loc + 1]
        + ["[/E1]"]
        + tokens[loc + 1:]
    )
    row["marked_text"] = " ".join(marked_tokens)
    row["labels"] = row["LABEL_ID"]
    return row

# Convert the pandas frame into a `datasets.Dataset` and run
# insert_entity_markers over every example, adding the "marked_text" and
# "labels" fields.
dataset = Dataset.from_pandas(sample).map(insert_entity_markers)


In [None]:
# Pretrained checkpoint name passed to every `from_pretrained` call in this notebook.
checkpoint="bert-base-uncased"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Register the two entity markers as additional special tokens.
tokenizer.add_special_tokens({"additional_special_tokens": ["[E1]", "[/E1]"]})


def tokenize_fn(row):
    """Tokenize the ``"marked_text"`` field with a fixed length of 256.

    Used via ``dataset.map(..., batched=True)``, so despite its name ``row``
    is presumably a batch (a mapping from column name to a list of values).

    NOTE(review): with ``truncation=True`` and ``max_length=256``, markers that
    sit beyond the length limit would presumably be cut off - confirm this is
    acceptable for long texts.
    """
    encoding = tokenizer(
        row["marked_text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoding


tokenized_dataset = dataset.map(tokenize_fn, batched=True)

In [None]:
# Display the repr of the tokenized dataset.
tokenized_dataset

In [None]:
# Print the first example's value for every column. The first row is fetched
# once up front instead of pulling a whole column on each iteration just to
# read its first element.
first_example = tokenized_dataset[0]
for key in tokenized_dataset.features.keys():
    print(f"{key}: {first_example[key]}")


In [None]:
# # Get the vocabulary
# # To see special tokens
# print("Special tokens:", tokenizer.special_tokens_map)
# print("\nAdded special tokens:", tokenizer.additional_special_tokens)

# # To see vocab size
# print("\nVocabulary size:", len(tokenizer))

# # To see how the tokenizer handles our entity markers
# example = "This is an [E1] example [/E1] sentence."
# encoded = tokenizer(example)
# print("\nEncoded:", encoded)
# print("\nDecoded:", tokenizer.decode(encoded["input_ids"]))

In [None]:
# # Get the vocabulary
# # To see special tokens
# print("Special tokens:", tokenizer.special_tokens_map)
# print("\nAdded special tokens:", tokenizer.additional_special_tokens)

# # To see vocab size
# print("\nVocabulary size:", len(tokenizer))

# # To see how the tokenizer handles our entity markers
# example = "[E1] [/E1]"
# encoded = tokenizer(example)
# print("\nEncoded:", encoded)
# print("\nDecoded:", tokenizer.decode(encoded["input_ids"]))

In [None]:
# Tokenizer length (vocabulary size); the same value is passed to
# `resize_token_embeddings` when the model is loaded.
len(tokenizer)

In [None]:
# NOTE(review): the method is referenced but not called, so this cell only
# displays the bound-method object rather than a special-tokens mask.
tokenizer.get_special_tokens_mask

In [None]:
from transformers import AutoModel
# Load BERT with a sequence-classification head sized to the label set.
# A bare `AutoModel` returns the encoder without a classifier head, and
# passing `label2id` on its own (no `num_labels` / `id2label`) does not give
# the config a label mapping consistent with `label2id`.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label={label_id: label_name for label_name, label_id in label2id.items()},
    label2id=label2id,
)
# Resize the embedding matrix to the tokenizer length so the added
# [E1] / [/E1] tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its `.compute(...)` method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must support
            ``.argmax(axis=-1)``; the arg-max over the last axis is taken as
            the predicted class id.

    Returns:
        Whatever ``metric.compute`` returns for the predicted ids and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

In [None]:
from transformers import Trainer, BertForSequenceClassification

# (Re)bind `model` to BERT with a sequence-classification head, configured
# with one output per entry of `label2id` and the matching id <-> label maps.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    label2id=label2id,
    id2label=dict(zip(label2id.values(), label2id.keys())),
    num_labels=len(label2id),
)
# Resize the embedding matrix to the tokenizer length, which includes the
# added [E1] / [/E1] tokens.
model.resize_token_embeddings(len(tokenizer))

# Fine-tuning configuration; evaluation is switched off (do_eval=False).
training_args = TrainingArguments(
    output_dir="./label_classification",
    # Optimisation settings.
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Logging / checkpoint cadence, in optimisation steps.
    logging_steps=5,
    save_steps=50,
    do_eval=False,
)

# Assemble the Trainer and start fine-tuning.
# NOTE(review): compute_metrics is supplied, but no eval_dataset is passed and
# do_eval is False, so it is presumably never invoked - confirm intent.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()