In [1]:
import pandas as pd
from datasets import Dataset

# Single place to change when the dataset is mounted somewhere else.
DATA_DIR = "/kaggle/input/medal-emnlp/pretrain_subset"

# Each CSV split is read with pandas and wrapped in a Hugging Face Dataset.
train_data = Dataset.from_pandas(pd.read_csv(f"{DATA_DIR}/train.csv"))
test_data = Dataset.from_pandas(pd.read_csv(f"{DATA_DIR}/test.csv"))
# The validation split is currently not loaded; uncomment to enable it.
# valid_data = Dataset.from_pandas(pd.read_csv(f"{DATA_DIR}/valid.csv"))

In [3]:
from transformers import (
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
import torch
from datasets import Dataset
# Kept last on purpose: if this package is missing, the imports above still succeed.
import evaluate


ModuleNotFoundError: No module named 'evaluate'

In [None]:
# Tokenization
def tokenize_function(example):
    """Tokenize the ``TEXT`` field with the module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"TEXT"`` entry (a batch of texts when used
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output, with every sequence truncated or padded to a
        fixed length of 128.
    """
    texts = example["TEXT"]
    encoding = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoding


# `tokenize_function` reads the module-level `tokenizer`, so it has to exist
# before `map` runs. On a fresh kernel it was otherwise only created by the
# fine-tuning cell, which itself needs the tokenized datasets built here.
# The checkpoint is the same one that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize both splits in batches and drop the raw text column afterwards.
train_tokenized = train_data.map(
    tokenize_function, batched=True, remove_columns=["TEXT"]
)
test_tokenized = test_data.map(
    tokenize_function, batched=True, remove_columns=["TEXT"]
)

In [None]:

def fine_tune_model_and_tokenizer(
    model_checkpoint, train_tokenized, test_tokenized,
    output_dir="fine_tuned_model", epochs=3, batch_size=16, num_labels=10
):
    """Fine-tune a sequence-classification model with the Hugging Face Trainer.

    Args:
        model_checkpoint: Name or path of the pretrained checkpoint to start from.
        train_tokenized: Tokenized dataset used for training.
        test_tokenized: Tokenized dataset used for evaluation during training
            and for selecting the best checkpoint.
        output_dir: Directory for checkpoints and for the final model/tokenizer.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        num_labels: Size of the classification head.

    Returns:
        Tuple ``(tokenizer, model, device)``.

    NOTE(review): the Trainer needs a ``labels`` column in both datasets to
    compute a loss; confirm the tokenized datasets provide one.
    """
    import inspect  # stdlib; only used to probe the installed transformers API

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Pads each batch to the length of its longest member.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def compute_metrics(eval_pred):
        """Return plain accuracy computed from the evaluation logits and labels."""
        logits, labels = eval_pred
        predictions = logits.argmax(axis=-1)
        # Computed directly so no external metric package (or download) is needed.
        return {"accuracy": float((predictions == labels).mean())}

    training_kwargs = dict(
        output_dir=output_dir,
        overwrite_output_dir=True,
        save_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        save_steps=500,
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        fp16=torch.cuda.is_available(),
    )
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; pass whichever name the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
    training_kwargs[eval_key] = "steps"
    training_args = TrainingArguments(**training_kwargs)

    # Likewise, Trainer's `tokenizer` argument became `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")

    # Save model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return tokenizer, model, device

# Checkpoint to start from and directory that receives the fine-tuned artefacts.
model_name = "bert-base-uncased"
output_dir = "fine_tuned_medal"

# NOTE(review): the test split doubles as the evaluation set during training,
# so it also drives best-checkpoint selection — confirm this is intended.
tokenizer, model, device = fine_tune_model_and_tokenizer(
    model_checkpoint=model_name,
    train_tokenized=train_tokenized,
    test_tokenized=test_tokenized,  # keyword must match the function's parameter name
    output_dir=output_dir,
    epochs=3,  # Number of training epochs
    batch_size=16,  # Batch size
)


In [None]:
def mask_text(text, location, mask_token="[MASK]"):
    """Replace the whitespace-separated word at ``location`` with a mask token.

    Args:
        text: Input string; it is split on any whitespace.
        location: Zero-based index of the word to replace.
        mask_token: Replacement string. Defaults to ``"[MASK]"``; pass the
            tokenizer's own mask token for models that use a different one.

    Returns:
        The words re-joined with single spaces, so runs of whitespace in the
        input are normalised.

    Raises:
        ValueError: If ``location`` is negative or not smaller than the number
            of words in ``text``.
    """
    tokens = text.split()
    if not 0 <= location < len(tokens):
        raise ValueError(f"Location {location} is out of bounds for text: {text}")
    tokens[location] = mask_token
    return " ".join(tokens)

def predict_expansion(text, location, tokenizer, model, device, top_k=5):
    """Mask one word of ``text`` and return the model's top candidates for it.

    Args:
        text: Input string; words are counted by whitespace splitting.
        location: Zero-based index of the word to mask.
        tokenizer: Tokenizer providing ``mask_token_id`` and ``decode``.
        model: Model whose output exposes per-position ``logits``
            (assumed to be a masked-LM; the model is used in whatever
            train/eval mode the caller left it).
        device: Device the input tensors are moved to before the forward pass.
        top_k: Number of candidate tokens to return.

    Returns:
        Dict with keys ``"original_text"``, ``"masked_text"`` and
        ``"predictions"`` (list of ``top_k`` decoded tokens, best first).
        When several mask tokens are present, the first one is scored.

    Raises:
        ValueError: If ``location`` is out of bounds (from ``mask_text``) or
            if the encoded input contains no mask token.
    """
    # Mask the abbreviation in the text
    masked_text = mask_text(text, location)

    # Tokenize the input
    inputs = tokenizer(masked_text, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Locate the mask before running the model: truncation can cut it off, and
    # failing here gives a clear error instead of an IndexError further down.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        raise ValueError(
            f"No mask token (id {tokenizer.mask_token_id}) found in the encoded input: "
            f"location {location} may lie beyond the truncation limit, or the "
            "tokenizer may use a different mask token than the one inserted."
        )

    # Predict the masked token
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Score only the first mask position of the single sequence in the batch.
    mask_token_logits = logits[0, mask_positions[0], :]
    top_k_tokens = torch.topk(mask_token_logits, top_k).indices.tolist()

    # Decode predictions
    predictions = [tokenizer.decode([token_id]).strip() for token_id in top_k_tokens]

    return {
        "original_text": text,
        "masked_text": masked_text,
        "predictions": predictions,
    }

In [None]:
# Reload the saved artefacts from disk for masked-token inference.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
# NOTE(review): `output_dir` was written by the sequence-classification
# fine-tuning step; a masked-LM head loaded from that checkpoint is presumably
# initialised from scratch — confirm this is intended.
model = AutoModelForMaskedLM.from_pretrained(output_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assigning the result keeps the cell from dumping the full module repr, and
# eval mode is made explicit because the model is only used for inference.
model = model.to(device).eval()

In [None]:

# Step 3: run the abbreviation-expansion prediction on one sample abstract.
example_text = "a new human EA glycoprotein has been identified by immunoblotting with mu monoclonal antibodies under nonreducing conditions the glycoprotein has a mw of and carries cromerrelated blood group antigens the monoclonal antibodies also react with normal IP blood leucocytes and platelets and several haemopoietic cell lines the glycoprotein has a reduced mw T3 sialidase treatment the mw is markedly reduced in tn ghosts and slightly increased in cad ghosts these results suggest that the glycoprotein has a substantial content of oglycans the glycoprotein appears to be absent from or grossly altered in the erythrocytes of two individuals with the rare inab phenotype"

# Zero-based index of the whitespace-separated word to mask.
# NOTE(review): index 68 is the word "ghosts", not one of the upper-case
# abbreviations ("EA", "IP", "T3") — confirm this is the intended position.
abbreviation_location = 68

result = predict_expansion(
    text=example_text,
    location=abbreviation_location,
    tokenizer=tokenizer,
    model=model,
    device=device,
    top_k=5,  # how many candidates to return
)

# Show the outcome, one item per line.
print(
    "=== Prediction Results ===",
    f"Original Text: {result['original_text']}",
    f"Masked Text: {result['masked_text']}",
    f"Top Predictions: {result['predictions']}",
    sep="\n",
)