In [33]:
import pandas as pd
from datasets import Dataset

# train_data = Dataset.from_pandas(pd.read_csv(r"MeDAL\pretrain_subset\train.csv"))
# # valid_data = Dataset.from_pandas(pd.read_csv("/kaggle/input/medal-emnlp/pretrain_subset/valid.csv"))
# test_data  = Dataset.from_pandas(pd.read_csv(r"MeDAL\pretrain_subset\test.csv"))

# Read only a small subset of the data for faster processing.
# The relative paths use forward slashes so they resolve on Windows, Linux and
# macOS alike; a backslash is only a path separator on Windows.
train_df = pd.read_csv("MeDAL/pretrain_subset/train.csv", nrows=1000)  # first 1000 rows only
train_data = Dataset.from_pandas(train_df)

# valid_data = Dataset.from_pandas(pd.read_csv("/kaggle/input/medal-emnlp/pretrain_subset/valid.csv"))

test_df = pd.read_csv("MeDAL/pretrain_subset/test.csv", nrows=500)  # first 500 rows only
test_data = Dataset.from_pandas(test_df)

# print(f"Train data size: {len(train_data)}")
# print(f"Test data size: {len(test_data)}")

In [40]:
# Label encoding: map each unique string label to an integer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit a single encoder on the labels of both splits so that every label found
# in either split receives an integer id.
all_labels = np.concatenate((train_df["LABEL"].values, test_df["LABEL"].values))
label_encoder = LabelEncoder().fit(all_labels)

# Store the integer class id next to each string label.
train_df["LABEL_ID"] = label_encoder.transform(train_df["LABEL"])
test_df["LABEL_ID"] = label_encoder.transform(test_df["LABEL"])

# Lookup tables in both directions, kept for later use.
id2label = dict(enumerate(label_encoder.classes_))
label2id = {label: idx for idx, label in id2label.items()}

# Rebuild the datasets so they carry the new LABEL_ID column.
train_data = Dataset.from_pandas(train_df)
test_data = Dataset.from_pandas(test_df)

In [42]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
import torch
from datasets import Dataset
import evaluate

In [41]:
# Tokenization setup: where the fine-tuned model will be written, which base
# checkpoint is used, and the tokenizer loaded for that checkpoint.
output_dir = "fine_tuned_medal"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize one example and attach its integer class label.

    Meant for ``Dataset.map(..., batched=False)``, where ``example`` is a single
    row given as a dict. The ``TEXT`` field is truncated/padded to 128 tokens
    and the ``LABEL_ID`` field is stored as an ``int`` under ``labels``.
    """
    encoded = tokenizer(
        example["TEXT"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    # The model expects integer class ids under the "labels" key.
    encoded["labels"] = int(example["LABEL_ID"])
    return encoded


# Work on a small slice so the demo trains quickly. The requested sizes are
# capped at the dataset length so that a split with fewer rows does not make
# `select` fail on indices that do not exist.
train_subset = train_data.select(range(min(100, len(train_data))))
test_subset = test_data.select(range(min(50, len(test_data))))

# Columns dropped once `tokenize_function` has produced the model inputs and
# the integer `labels` field.
raw_columns = ["TEXT", "LABEL", "LABEL_ID"]

train_tokenized = train_subset.map(
    tokenize_function, batched=False, remove_columns=raw_columns
)
test_tokenized = test_subset.map(
    tokenize_function, batched=False, remove_columns=raw_columns
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [45]:
def fine_tune_model(
    model_checkpoint,
    train_tokenized,
    test_tokenized,
    output_dir="fine_tuned_model",
    epochs=3,
    batch_size=16,
    num_labels=None,
    label_encoder=None,
    tokenizer=None,
):
    """Fine-tune a sequence-classification model with ``Trainer`` and save it.

    Args:
        model_checkpoint: Name or path handed to ``from_pretrained``.
        train_tokenized: Tokenized training dataset given to the ``Trainer``.
        test_tokenized: Tokenized dataset evaluated after every epoch.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        num_labels: Number of classes. When omitted it is taken from
            ``len(label_encoder.classes_)``.
        label_encoder: Fitted encoder exposing ``classes_``. When its class
            count matches ``num_labels`` the class names are also stored in the
            model config as ``id2label`` / ``label2id``.
        tokenizer: Tokenizer used by the data collator and saved next to the
            model. When omitted, the tokenizer of ``model_checkpoint`` is
            loaded, so the function does not depend on a notebook-global
            variable that later cells may rebind.

    Returns:
        tuple: ``(model, device)`` - the trained model and the device it was
        moved to.

    Raises:
        ValueError: If neither ``num_labels`` nor ``label_encoder`` is given.
    """
    # Resolve the number of classes before anything expensive is loaded.
    if num_labels is None:
        if label_encoder is None:
            raise ValueError(
                "num_labels must be provided or label_encoder must be passed."
            )
        num_labels = len(label_encoder.classes_)

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Keep the human-readable label names in the saved config so that predicted
    # class ids can be mapped back to labels after reloading the model.
    model_kwargs = {"num_labels": num_labels}
    if label_encoder is not None and len(label_encoder.classes_) == num_labels:
        id2label = {idx: str(label) for idx, label in enumerate(label_encoder.classes_)}
        model_kwargs["id2label"] = id2label
        model_kwargs["label2id"] = {label: idx for idx, label in id2label.items()}

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, **model_kwargs
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Metric for evaluation
    accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Turn ``(logits, labels)`` into accuracy via the arg-max class."""
        logits, labels = eval_pred
        predictions = logits.argmax(axis=-1)
        return accuracy_metric.compute(predictions=predictions, references=labels)

    # Training arguments. Evaluation, saving and logging all run once per
    # epoch, so no step-based intervals (logging_steps / save_steps) are set.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

    # Save model + tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, device

In [46]:
# Fine-tune the base checkpoint on the tokenized subsets; the number of classes
# is derived from the fitted label encoder.
model, device = fine_tune_model(
    model_name,
    train_tokenized,
    test_tokenized,
    output_dir=output_dir,
    epochs=3,  # Number of training epochs
    batch_size=16,  # Batch size
    label_encoder=label_encoder,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fine-tuning...



Epoch,Training Loss,Validation Loss,Accuracy
1,7.3425,7.41918,0.0
2,7.1396,7.448646,0.0
3,7.0505,7.461875,0.0




Fine-tuning complete.



In [47]:
def mask_text(text, location):
    """Return ``text`` with the word at index ``location`` swapped for ``[MASK]``.

    The text is split on whitespace and re-joined with single spaces, so
    ``location`` is a zero-based index into the whitespace-separated words.

    Raises:
        ValueError: If ``location`` is negative or past the last word.
    """
    words = text.split()
    if location < 0 or location >= len(words):
        raise ValueError(f"Location {location} is out of bounds for text: {text}")
    words[location] = "[MASK]"
    return " ".join(words)


def predict_expansion(text, location, tokenizer, model, device, top_k=5):
    """Predict the most likely vocabulary tokens for the word at ``location``.

    The word at the whitespace-separated index ``location`` is replaced by
    ``[MASK]`` (via ``mask_text``), the masked text is encoded with
    ``tokenizer`` and scored by ``model``; the ``top_k`` highest-scoring tokens
    at the first mask position are decoded and returned.

    Args:
        text: Input text containing the word to replace.
        location: Zero-based index of that word among the whitespace-separated
            words of ``text``.
        tokenizer: Tokenizer providing ``mask_token_id`` and ``decode``.
        model: Model whose output exposes per-token ``logits`` of shape
            ``(batch, sequence, vocabulary)``, i.e. a masked-language model.
        device: Device the input tensors are moved to.
        top_k: Number of predictions to return; capped at the vocabulary size.

    Returns:
        dict: ``original_text``, ``masked_text`` and ``predictions`` (a list of
        decoded token strings, best first).

    Raises:
        ValueError: If ``location`` is out of range (raised by ``mask_text``),
            if the mask token is missing from the encoded input (for example
            because it was truncated away), or if the model does not return
            per-token logits.
    """
    # Mask the abbreviation in the text
    masked_text = mask_text(text, location)

    # Tokenize the input
    inputs = tokenizer(masked_text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Locate the [MASK] token before running the model: truncation can cut it
    # off, and indexing an empty result would only surface as an IndexError.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        raise ValueError(
            f"No mask token found in the encoded input for location {location}; "
            "it may have been removed by truncation."
        )

    # Disable dropout so repeated calls give the same predictions.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    if logits.dim() != 3:
        raise ValueError(
            "predict_expansion needs a masked-language model with per-token logits "
            f"of shape (batch, sequence, vocab); got logits of shape {tuple(logits.shape)}."
        )

    # Scores over the vocabulary at the first [MASK] position.
    mask_token_logits = logits[0, mask_positions[0], :]
    k = min(top_k, mask_token_logits.shape[-1])
    top_k_tokens = torch.topk(mask_token_logits, k).indices.tolist()

    # Decode predictions
    predictions = [tokenizer.decode([token_id]).strip() for token_id in top_k_tokens]

    return {
        "original_text": text,
        "masked_text": masked_text,
        "predictions": predictions,
    }

In [49]:
from transformers import AutoModelForMaskedLM

# Reload the tokenizer and weights saved in `output_dir`, this time with a
# masked-language-model head, and move the model to the available device.
# NOTE(review): `output_dir` was written from a sequence-classification model,
# and the loading warning printed by this cell reports the `cls.predictions.*`
# head weights as newly initialized - i.e. the MLM head used for prediction
# below has not been trained. Confirm this is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForMaskedLM.from_pretrained(output_dir)
model.to(device)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at fine_tuned_medal and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [51]:
# Step 3: Predict Abbreviation Expansion
example_text = "a new human EA glycoprotein has been identified by immunoblotting with mu monoclonal antibodies under nonreducing conditions the glycoprotein has a mw of and carries cromerrelated blood group antigens the monoclonal antibodies also react with normal IP blood leucocytes and platelets and several haemopoietic cell lines the glycoprotein has a reduced mw T3 sialidase treatment the mw is markedly reduced in tn ghosts and slightly increased in cad ghosts these results suggest that the glycoprotein has a substantial content of oglycans the glycoprotein appears to be absent from or grossly altered in the erythrocytes of two individuals with the rare inab phenotype"
# NOTE(review): counting whitespace-separated words from 0, index 68 is the
# second "ghosts"; the upper-case abbreviations EA, IP and T3 sit at indices
# 3, 36 and 52 - confirm that 68 is the intended position.
abbreviation_location = 68
result = predict_expansion(
    example_text,
    abbreviation_location,
    tokenizer,
    model,
    device,
    top_k=5,  # Number of top predictions to return
)

# Display Results
print(
    "=== Prediction Results ===",
    f"Original Text: {result['original_text']}",
    f"Masked Text: {result['masked_text']}",
    f"Top Predictions: {result['predictions']}",
    sep="\n",
)

=== Prediction Results ===
Original Text: a new human EA glycoprotein has been identified by immunoblotting with mu monoclonal antibodies under nonreducing conditions the glycoprotein has a mw of and carries cromerrelated blood group antigens the monoclonal antibodies also react with normal IP blood leucocytes and platelets and several haemopoietic cell lines the glycoprotein has a reduced mw T3 sialidase treatment the mw is markedly reduced in tn ghosts and slightly increased in cad ghosts these results suggest that the glycoprotein has a substantial content of oglycans the glycoprotein appears to be absent from or grossly altered in the erythrocytes of two individuals with the rare inab phenotype
Masked Text: a new human EA glycoprotein has been identified by immunoblotting with mu monoclonal antibodies under nonreducing conditions the glycoprotein has a mw of and carries cromerrelated blood group antigens the monoclonal antibodies also react with normal IP blood leucocytes and plate