In [None]:
import json

# Parse the BioGPT fine-tuning experiment file into `biogpt_finetune`.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open("./data/experiment/biogpt_finetune.json", encoding="utf-8") as f:
    biogpt_finetune = json.load(f)

In [None]:
import json

def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the resulting object.

    The file is decoded as UTF-8 explicitly: JSON is UTF-8 by specification,
    whereas ``open()`` would otherwise fall back to the platform's locale
    encoding and could mis-decode non-ASCII report text.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# One object per data split; each is later passed to extract_captions(), which
# reads the 'caption' and 'caption_all' entries of every record.
train = _load_json("./train_data.json")
val = _load_json("./val_data.json")
test = _load_json("./test_data.json")

In [11]:
def extract_captions(lst_with_reports: list):
    """
    Splits specimen records into two parallel caption lists.

    :param lst_with_reports: Records that each provide a 'caption' and a 'caption_all' entry
    :return: Tuple (reports_he, reports_full): the 'caption' values and the
             'caption_all' values, both in input order
    :raises KeyError: If a record lacks one of the two entries
    """
    # Single pass over the input, reading 'caption' before 'caption_all' per record.
    caption_pairs = [
        (record['caption'], record['caption_all'])
        for record in lst_with_reports
    ]
    reports_he = [he_caption for he_caption, _ in caption_pairs]
    reports_full = [full_caption for _, full_caption in caption_pairs]
    return reports_he, reports_full

In [12]:
# For every split (train, then val, then test) pull out the two parallel
# caption lists: the 'caption' values (*_he) and the 'caption_all' values (*_full).
(train_he, train_full), (val_he, val_full), (test_he, test_full) = (
    extract_captions(split_records) for split_records in (train, val, test)
)

In [13]:
def flexible_write(lst, file_name="output.txt", encoding="utf-8"):
    """
    Writes every item of ``lst`` to a text file, one item per line.

    :param lst: Iterable of items; each one is formatted with an f-string, so
                non-string items are written using their ``str()`` form
    :param file_name: Destination path; an existing file is overwritten
    :param encoding: Text encoding of the output file. Defaults to UTF-8 so the
                     result does not depend on the platform's locale encoding
                     (line-oriented readers of these files commonly assume UTF-8)

    Note: an item that itself contains a newline will occupy several lines in
    the file, which breaks the one-item-per-line layout for line-based readers.
    """
    with open(file_name, "w", encoding=encoding) as file:
        file.writelines(f"{item}\n" for item in lst)

# Persist each caption list to its own text file (one caption per line),
# in the order train -> val -> test, H&E captions before full captions.
for target_file, caption_list in (
    ("train_he.txt", train_he),
    ("train_full.txt", train_full),
    ("val_he.txt", val_he),
    ("val_full.txt", val_full),
    ("test_he.txt", test_he),
    ("test_full.txt", test_full),
):
    flexible_write(caption_list, file_name=target_file)

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
)

def main():
    """Fine-tune the output projection of BioGPT on the H&E caption text files.

    Pipeline:
      1. Load ``train_he.txt`` / ``val_he.txt`` / ``test_he.txt`` as line-based text datasets.
      2. Load ``microsoft/biogpt``, freeze everything except ``output_projection``.
      3. Tokenize every line with an explicit EOS marker appended.
      4. Build padded dataloaders.
      5. Train for a fixed number of epochs with AdamW, reporting the mean
         per-batch training and validation loss after each epoch.
      6. Save model and tokenizer after every epoch under
         ``best_model3/best_model3.<epoch>``.

    Returns nothing; progress is printed and checkpoints are written to disk.
    """
    print("=== Starting main() ===")
    data_files = {
        "train": "train_he.txt",
        "validation": "val_he.txt",
        "test": "test_he.txt"
    }

    # -------------------------------------------------
    # 1. Load Raw Data (non-streaming so we can get lengths)
    # -------------------------------------------------
    print("Loading raw datasets from files:", data_files)
    raw_datasets = load_dataset("text", data_files=data_files, streaming=False)
    print("Raw datasets loaded successfully.")

    # -------------------------------------------------
    # 2. Load Model & Tokenizer
    # -------------------------------------------------
    model_name = "microsoft/biogpt"  # or "microsoft/BioGPT-Large"
    print("Loading tokenizer and model:", model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print("Pad token not found. Using EOS token as pad token.")

    model = AutoModelForCausalLM.from_pretrained(model_name)
    print("Model loaded successfully.")

    # Freeze all parameters except for the output projection.
    # NOTE(review): if this checkpoint ties the output projection to the input
    # embedding matrix, unfreezing it also makes the embeddings trainable — confirm.
    print("Freezing model parameters except output projection...")
    for param in model.parameters():
        param.requires_grad = False
    for param in model.output_projection.parameters():
        param.requires_grad = True
        print(f"The last layer of BioGPT is trainable: {param.requires_grad}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()  # set model to training mode
    print(f"Model moved to device: {device}")

    # -------------------------------------------------
    # 3. Tokenize the Data
    # -------------------------------------------------
    print("Starting tokenization of datasets.")
    # Cap sequences at the model's positional limit (when the config exposes it)
    # so an unusually long report cannot overflow the position embeddings.
    max_length = getattr(model.config, "max_position_embeddings", None)

    def tokenize_function(examples):
        # Add an explicit EOS token to the text
        text_with_eos = [text + " " + tokenizer.eos_token for text in examples["text"]]
        return tokenizer(text_with_eos, truncation=True, max_length=max_length)

    # We remove the original "text" column
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )
    print("Tokenization complete.")

    train_dataset = tokenized_datasets["train"]
    val_dataset   = tokenized_datasets["validation"]
    test_dataset  = tokenized_datasets["test"]
    print("Datasets split into train, validation, and test.")

    # -------------------------------------------------
    # 4. Prepare Dataloaders
    # -------------------------------------------------
    print("Preparing dataloaders...")
    # mlm=False -> plain causal-LM collation (dynamic padding per batch).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    for dataset in (train_dataset, val_dataset, test_dataset):
        dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True,  collate_fn=data_collator)
    val_loader   = DataLoader(val_dataset,   batch_size=75, shuffle=False, collate_fn=data_collator)
    # The test loader is built for completeness; it is not consumed in this function.
    test_loader  = DataLoader(test_dataset,  batch_size=75, shuffle=False, collate_fn=data_collator)
    print("Dataloaders are ready.")

    # -------------------------------------------------
    # 5. Define Optimizer
    # -------------------------------------------------
    print("Setting up optimizer.")
    # Hand the optimizer only the parameters that actually receive gradients.
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=5e-4, betas=(0.9, 0.99))
    print("Optimizer is ready.")

    # Ensure the checkpoint folder "best_model3" exists
    parent_folder = "best_model3"
    os.makedirs(parent_folder, exist_ok=True)

    def batch_loss(batch):
        """Move one collated batch to the device and return the model's LM loss.

        Padding positions (attention_mask == 0) are set to -100 in the labels so
        they are ignored by the loss. Masking by attention mask rather than by
        pad-token id keeps genuine EOS tokens supervised even when the EOS token
        doubles as the pad token.
        """
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=labels
        )
        return outputs.loss

    # -------------------------------------------------
    # 6. Training Loop (saving model per epoch)
    # -------------------------------------------------
    num_epochs = 5

    print("Starting training loop...")
    for epoch in range(num_epochs):
        print(f"--- Epoch {epoch+1} starting ---")
        model.train()
        total_train_loss = 0.0

        for step, batch in enumerate(train_loader):
            loss = batch_loss(batch)
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if (step + 1) % 50 == 0:
                print(f"Epoch {epoch+1}, step {step+1}, loss = {loss.item():.4f}")

        # Mean of the per-batch losses; max(..., 1) avoids dividing by zero on an empty split.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)
        print(total_train_loss)
        print(f"Epoch {epoch+1}: average training loss = {avg_train_loss:.4f}")

        # -------------------------------------------------
        # Validation
        # -------------------------------------------------
        print(f"Starting validation for epoch {epoch+1}")
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                total_val_loss += batch_loss(batch).item()

        # Each accumulated term is already a per-batch mean, so normalise by the
        # number of batches (as done for training), not by the number of samples.
        val_loss = total_val_loss / max(len(val_loader), 1)
        print(f"Epoch {epoch+1}: validation loss = {val_loss:.4f}")

        # Save the model checkpoint for this epoch inside folder "best_model3"
        current_model_path = os.path.join(parent_folder, f"best_model3.{epoch+1}")
        print(f"Saving model checkpoint for epoch {epoch+1} to {current_model_path}")
        model.save_pretrained(current_model_path)
        tokenizer.save_pretrained(current_model_path)

# Entry point: start the fine-tuning run when this code executes as the main module.
if __name__ == "__main__":
    main()


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

def main(output_file, best_model_path, num_test_samples=1970, batch_size=16):
    """Generate captions with a fine-tuned checkpoint and write them one per line.

    :param output_file: Path of the UTF-8 text file that receives the captions
                        (overwritten if it exists)
    :param best_model_path: Directory or hub id passed to
                            ``AutoModelForCausalLM.from_pretrained``
    :param num_test_samples: Number of captions to generate (default 1970, the
                             value previously hard-coded here)
    :param batch_size: Number of prompts generated per ``generate`` call;
                       adjust to the available GPU memory
    :raises ValueError: If ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Loading the best model for final test evaluation...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # FP16 speeds up GPU inference; keep FP32 on the CPU fallback. The model is
    # placed with a single explicit .to(device) rather than device_map="auto",
    # so it is never dispatched by one mechanism and then moved by another.
    dtype = torch.float16 if device.type == "cuda" else torch.float32
    best_model = AutoModelForCausalLM.from_pretrained(
        best_model_path,
        torch_dtype=dtype
    )
    best_model.to(device)
    best_model.eval()

    print("Generating predictions for each test instance...")
    print(f"Number of test instances: {num_test_samples}")

    tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

    # Encode the prompt once (here an empty string)
    prompt_text = ""
    encoded_prompt = tokenizer.encode(prompt_text, return_tensors="pt").to(device)

    # NOTE(review): every row receives the same prompt and decoding is
    # deterministic (do_sample=False), so all rows run the same beam search —
    # confirm that identical captions per test instance are intended.
    with open(output_file, "w", encoding="utf-8") as f:
        for i in tqdm(range(0, num_test_samples, batch_size), desc="Generating reports", unit="batch"):
            # The final batch may be smaller than batch_size.
            current_batch_size = min(batch_size, num_test_samples - i)
            # Replicate the prompt for the current batch
            input_ids = encoded_prompt.repeat(current_batch_size, 1)
            with torch.no_grad():
                outputs = best_model.generate(
                    input_ids=input_ids,
                    num_beams=4,  # Beam search for better outputs
                    max_new_tokens=1024,
                    top_p=1.0,
                    repetition_penalty=1.2,
                    length_penalty=1.1,
                    do_sample=False,
                    eos_token_id=tokenizer.eos_token_id
                )
            # Decode and write outputs for each sample in the batch
            for output in outputs:
                predicted_caption = tokenizer.decode(output, skip_special_tokens=True)
                # Keep exactly one caption per line: downstream evaluation pairs
                # this file with the references line by line.
                predicted_caption = predicted_caption.replace("\r", " ").replace("\n", " ")
                f.write(predicted_caption + "\n")

    print(f"Predictions for all {num_test_samples} test instances have been saved to '{output_file}'.")
    print("=== Finished main() ===")

# Entry point: generate captions when this code executes as the main module.
if __name__ == "__main__":
    # Text file that receives the generated captions.
    output_file = "generated_captions.txt"
    # NOTE(review): the training cell in this notebook saves its checkpoints as
    # "best_model3/best_model3.<epoch>", not "best_model" — confirm this path exists.
    best_model_path = "best_model"
    main(output_file, best_model_path)

In [None]:
from pycocoevalcap.bleu.bleu_scorer import BleuScorer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

def compute_scores(gts, res, scorers=None):
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions,
    :param res: Dictionary with the image ids and their generated captions
    :param scorers: Optional list of ``(scorer, method)`` pairs. ``scorer`` must
                    offer ``compute_score(gts, res)`` returning a pair; ``method``
                    is either one metric name or a list/tuple of names when the
                    scorer returns one value per name. Defaults to BLEU-1..4,
                    METEOR, ROUGE-L and CIDEr.
    :return: Tuple ``(eval_res, eval_res_total)``: two dictionaries keyed by metric
             name, holding the first and the second element of each scorer's
             result respectively (overall score and per-instance scores)
    """
    if scorers is None:
        # Set up scorers
        scorers = [
            (Bleu(4), ["BLEU_1", "BLEU_2", "BLEU_3", "BLEU_4"]),
            (Meteor(), "METEOR"), # sudo apt-get install default-jre <- to install java and not return error
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
            # (Spice(), "SPICE")
        ]

    eval_res = {}
    eval_res_total = {}
    # Compute score for each metric
    for scorer, method in scorers:
        # Called exactly once: the former `except TypeError` branch only repeated
        # the identical call, so a failure now propagates right away.
        score, scores = scorer.compute_score(gts, res)
        if isinstance(method, (list, tuple)):
            # Multi-valued scorer (e.g. BLEU-1..4): pair every name with its entry.
            eval_res.update(zip(method, score))
            eval_res_total.update(zip(method, scores))
        else:
            eval_res[method] = score
            eval_res_total[method] = scores

    return eval_res, eval_res_total

# The following code is taken from the pycocoevalcap library
# It is needed to put bleu_scorer.compute_score to verbose = 0

class Bleu:
    """BLEU scorer wrapper that runs ``BleuScorer.compute_score`` with ``verbose=0``."""

    def __init__(self, n=4):
        # Highest n-gram order to score; by default BLEU is computed up to 4-grams.
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}

    def compute_score(self, gts, res):
        """
        Scores the generated captions against their references.

        :param gts: Dictionary mapping each image id to a non-empty list of reference captions
        :param res: Dictionary mapping each image id to a list holding exactly one generated caption
        :return: The pair (score, scores) produced by ``BleuScorer.compute_score``
        """
        # Both dictionaries must describe exactly the same set of image ids.
        assert(gts.keys() == res.keys())

        scorer = BleuScorer(n=self._n)
        for image_id in gts.keys():
            hypothesis, references = res[image_id], gts[image_id]

            # Sanity check.
            assert(type(hypothesis) is list)
            assert(len(hypothesis) == 1)
            assert(type(references) is list)
            assert(len(references) >= 1)

            scorer += (hypothesis[0], references)

        # 'closest' is the option in use here; verbose=0 silences the scorer's printing.
        score, scores = scorer.compute_score(option='closest', verbose=0)
        return score, scores

    def method(self):
        """Returns the name of this metric."""
        return "Bleu"

In [13]:
# Reference captions, one per line, with surrounding whitespace stripped.
# NOTE(review): the caption-writing cell creates "test_he.txt" in the working
# directory, while this reads "data/test_he.txt" — confirm the file location.
with open('data/test_he.txt', 'r', encoding='utf-8') as file:
    ground_truth = [line.strip() for line in file]

# Generated captions, one per line. The generation cell writes its output as
# UTF-8, so decode with the same encoding instead of the platform default.
# NOTE(review): the generation cell writes "generated_captions.txt"; this file
# name differs — confirm it is the intended prediction file.
with open('generated_captions_he_epoch_5_5e-5_sample_true.txt', 'r', encoding='utf-8') as file:
    generated_captions = [line.strip() for line in file]

In [14]:
# zip() would silently drop the surplus lines of the longer list and score a
# truncated, possibly misaligned set of pairs, so refuse a length mismatch.
if len(ground_truth) != len(generated_captions):
    raise ValueError(
        f"Number of reference captions ({len(ground_truth)}) does not match "
        f"number of generated captions ({len(generated_captions)})"
    )

# One record per test instance, pairing the reference with the generated caption.
eval_results = [
    {'gt_caption': reference, 'caption': generated}
    for reference, generated in zip(ground_truth, generated_captions)
]

# Dictionaries keyed by the instance index, each value wrapped in a
# single-element list, as passed to compute_scores().
gts = {i: [entry['gt_caption']] for i, entry in enumerate(eval_results)}
res = {i: [entry['caption']] for i, entry in enumerate(eval_results)}

In [15]:
# Score the generated captions against the references. Both results are
# dictionaries keyed by metric name: eval_res holds the first value each scorer
# returns, eval_res_total the second.
eval_res, eval_res_total = compute_scores(gts, res)