In [None]:
import os
import logging
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    RobertaTokenizerFast,
    RobertaModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    PreTrainedModel,
    RobertaConfig
)
from transformers.modeling_outputs import SequenceClassifierOutput

# Notebook-wide logging: INFO level, timestamped "<time> - <level> - <message>" lines.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("bug_localiser")

# Set the tokenizers parallelism flag to "false" before any tokenisation runs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Folder that holds the three JSON split files.
data_path = "../../Data/SSTUBS_ENHANCED_23MAR"

# Split name -> JSON file; all three splits are loaded with a single call.
split_files = {
    "train": f"{data_path}/train.json",
    "validation": f"{data_path}/val.json",
    "test": f"{data_path}/test.json",
}
dataset = load_dataset("json", data_files=split_files)


In [None]:
# Checkpoint name shared by the tokenizer and the model defined further down.
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

# Register the bracketed marker strings as additional special tokens.
# (Presumably they delimit sections inside the "text" field — confirm against the data.)
special_tokens = {
    "additional_special_tokens": ["[CONTEXT]", "[SNIPPET]", "[COMMIT]", "[PARENT]"]
}
tokenizer.add_special_tokens(special_tokens)


def preprocess(example):
    """Tokenise the "text" field of a batch of examples.

    Truncation is switched on and padding is switched off at this stage.

    NOTE(review): no ``max_length`` is passed, so the truncation limit is
    whatever the tokenizer itself is configured with — confirm it matches the
    model's maximum input length.
    """
    encoded = tokenizer(example["text"], truncation=True, padding=False)
    return encoded


# batched=True: ``preprocess`` receives batches of examples rather than single rows.
tokenised_dataset = dataset.map(preprocess, batched=True)




In [None]:
class CustomRobertaClassifier(PreTrainedModel):
    """RoBERTa encoder followed by a small two-layer classification head.

    The encoder is loaded from the module-level ``model_name`` checkpoint and
    its token embeddings are resized to ``len(tokenizer)`` (the module-level
    tokenizer). Inside the encoder, only parameters whose name contains
    "embeddings" remain trainable; every other encoder parameter is frozen.
    The classification head is not touched by the freezing loop.
    """

    def __init__(self, config):
        """Build the encoder and the head.

        Args:
            config: model configuration; ``hidden_size`` and ``num_labels``
                are read from it to size the classification head.
        """
        super().__init__(config)
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.roberta.resize_token_embeddings(len(tokenizer))

        # Freeze everything in the encoder except the embeddings.
        for param_name, parameter in self.roberta.named_parameters():
            parameter.requires_grad = "embeddings" in param_name

        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, config.num_labels),
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the head; compute the loss when labels are given.

        Returns:
            SequenceClassifierOutput with ``logits`` from the head, a
            cross-entropy ``loss`` (``None`` when ``labels`` is ``None``) and
            the encoder's ``hidden_states`` / ``attentions`` passed through.
        """
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # The head operates on the encoder's pooled output.
        logits = self.classifier(encoder_out.pooler_output)

        loss = nn.CrossEntropyLoss()(logits, labels) if labels is not None else None

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )


# Two-label classifier built from the checkpoint's configuration, moved to the selected device.
model = CustomRobertaClassifier(RobertaConfig.from_pretrained(model_name, num_labels=2)).to(device)


  return torch.load(checkpoint_file, map_location="cpu")


In [None]:
class MultiEvalTrainer(Trainer):
    """Trainer that evaluates on a list of datasets instead of a single one.

    When ``evaluate()`` is called without an explicit dataset, every entry of
    ``eval_datasets`` is scored in turn and its accuracy / precision / recall /
    F1 are logged under the prefix ``{metric_key_prefix}_dataset_{n}``, where
    ``n`` starts at 1 and follows the order of ``eval_datasets``.
    """

    def __init__(self, *args, eval_datasets=None, **kwargs):
        """Create the trainer.

        Args:
            *args, **kwargs: forwarded to ``Trainer``.
            eval_datasets: optional sequence of datasets to evaluate. When
                non-empty, its first element is also passed to the base class
                as ``eval_dataset`` (replacing any value given in ``kwargs``).
        """
        if eval_datasets:
            kwargs["eval_dataset"] = eval_datasets[0]
        super().__init__(*args, **kwargs)
        self.eval_datasets = eval_datasets or []

    def _compute_metrics_with_prefix(self, pred, prefix):
        """Compute binary classification metrics and prefix every key.

        Args:
            pred: prediction output exposing ``label_ids`` and ``predictions``.
            prefix: string prepended (with an underscore) to each metric name.

        Returns:
            dict with ``{prefix}_accuracy``, ``{prefix}_precision``,
            ``{prefix}_recall`` and ``{prefix}_f1``.
        """
        labels = pred.label_ids
        predictions = pred.predictions
        # The model forwards hidden states / attentions in its output; when
        # those are populated the trainer hands back a tuple of arrays rather
        # than a single array. The logits are the first element in that case.
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]
        preds = predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        return {
            f"{prefix}_accuracy": acc,
            f"{prefix}_precision": precision,
            f"{prefix}_recall": recall,
            f"{prefix}_f1": f1,
        }

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Evaluate on all registered datasets, or defer to the base class.

        If ``eval_dataset`` is given, or no ``eval_datasets`` were registered,
        the call is delegated unchanged to ``Trainer.evaluate``. Otherwise each
        registered dataset is predicted, scored and logged separately.

        Returns:
            dict merging the prefixed metrics of every evaluated dataset.
        """
        if eval_dataset is not None or not self.eval_datasets:
            return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)

        results = {}
        for position, eval_ds in enumerate(self.eval_datasets, start=1):
            prefix = f"{metric_key_prefix}_dataset_{position}"
            logger.info(f" Evaluating dataset {position} with prefix '{prefix}'...")
            eval_output = self.predict(eval_ds, ignore_keys=ignore_keys)
            metrics = self._compute_metrics_with_prefix(eval_output, prefix)
            # One log entry per dataset. The recorded output of this notebook
            # shows an "epoch" key in the returned dict, so log() appears to
            # add bookkeeping keys to the dict it receives.
            self.log(metrics)
            results.update(metrics)

        # The base-class evaluate() fires the on_evaluate callback event; this
        # multi-dataset path bypasses it, so fire it here to keep
        # evaluation-driven callbacks working. A copy is passed so callbacks
        # cannot alter the dict returned to the caller.
        self.control = self.callback_handler.on_evaluate(
            self.args, self.state, self.control, dict(results)
        )
        return results


In [None]:
output_dir = "../../FINAL_MODEL_UNFREEZE_EMBEDDINGS"
log_dir = f"{output_dir}/logs"

# Maximum number of examples in each subset that is evaluated during training.
EVAL_SUBSET_SIZE = 500


def _seeded_subset(split, size=EVAL_SUBSET_SIZE, seed=42):
    """Return at most ``size`` examples of ``split`` after a seeded shuffle."""
    return split.shuffle(seed=seed).select(range(min(size, len(split))))


# Monitoring subsets: a sample of the training split and a sample of the
# validation split.
# NOTE: the "validation" entry is overwritten in place, so the full validation
# split is no longer reachable through ``tokenised_dataset`` after this cell.
tokenised_dataset["train_val"] = _seeded_subset(tokenised_dataset["train"])
tokenised_dataset["validation"] = _seeded_subset(tokenised_dataset["validation"])

training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=log_dir,
    report_to="tensorboard",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    # Evaluate, log and checkpoint on the same 500-step cadence.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Pick the best checkpoint on the held-out validation subset. Datasets are
    # numbered in the order given to ``eval_datasets`` below, so dataset 1 is
    # the training subset and dataset 2 is the validation subset; selecting on
    # dataset 1 would reward fitting the training data.
    metric_for_best_model="eval_dataset_2_f1",
    greater_is_better=True,
)

trainer = MultiEvalTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_dataset["train"],
    # Order matters: it determines the "dataset_1" / "dataset_2" metric names.
    eval_datasets=[tokenised_dataset["train_val"], tokenised_dataset["validation"]],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

In [None]:
try:
    logger.info("Training started...")
    trainer.train()
finally:
    # Persist whatever has been logged so far, even when training is
    # interrupted or fails part-way through.
    # Create the target directory first: if it were missing, the write below
    # would raise inside ``finally`` and replace the original training error.
    os.makedirs(output_dir, exist_ok=True)
    log_path = f"{output_dir}/final_log.csv"
    pd.DataFrame(trainer.state.log_history).to_csv(log_path, index=False)
    logger.info(f" Logs saved to {log_path}")


2025-03-25 21:01:51,038 - INFO - Training started...
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


2025-03-25 21:02:50,044 - INFO -  Evaluating dataset 1 with prefix 'eval_dataset_1'...
2025-03-25 21:02:51,393 - INFO -  Evaluating dataset 2 with prefix 'eval_dataset_2'...
2025-03-25 21:03:51,794 - INFO -  Evaluating dataset 1 with prefix 'eval_dataset_1'...
2025-03-25 21:03:53,112 - INFO -  Evaluating dataset 2 with prefix 'eval_dataset_2'...


In [None]:
# Reload the CSV written at the end of training.
log_path = f"{output_dir}/final_log.csv"

# Keep only rows that carry a step number and store that step as an integer.
log_df = (
    pd.read_csv(log_path)
    .dropna(subset=["step"])
    .astype({"step": int})
)

print("Available logged metrics:\n", list(log_df.columns))

Available logged metrics:
 ['loss', 'learning_rate', 'epoch', 'step', 'eval_dataset_1_accuracy', 'eval_dataset_1_precision', 'eval_dataset_1_recall', 'eval_dataset_1_f1', 'eval_dataset_2_accuracy', 'eval_dataset_2_precision', 'eval_dataset_2_recall', 'eval_dataset_2_f1']


In [None]:
# Read the saved training log from its hard-coded path and list its columns.
# NOTE(review): the preceding cell already loads this same file via ``output_dir``
# and ``pandas`` is already imported at the top of the notebook — this cell looks
# like a standalone variant for a fresh kernel; confirm whether both are needed.
import pandas as pd
df = pd.read_csv("../../FINAL_MODEL_UNFREEZE_EMBEDDINGS/final_log.csv")
print(df.columns)

In [None]:
# Run one evaluation pass outside the training loop and print the metrics dict.
logger.info("Running evaluation manually after training...")
# Called without a dataset, so MultiEvalTrainer.evaluate scores every dataset
# registered in ``eval_datasets`` and returns their merged, prefixed metrics.
metrics = trainer.evaluate()
print(metrics)

2025-03-25 20:45:13,347 - INFO - 🔍 Running evaluation manually after training...
2025-03-25 20:45:13,351 - INFO -  Evaluating dataset 1 with prefix 'eval_dataset_1'...
2025-03-25 20:45:15,603 - INFO -  Evaluating dataset 2 with prefix 'eval_dataset_2'...


{'eval_dataset_1_accuracy': 0.598, 'eval_dataset_1_precision': 0.5584112149532711, 'eval_dataset_1_recall': 0.952191235059761, 'eval_dataset_1_f1': 0.7039764359351988, 'epoch': 0.13, 'eval_dataset_2_accuracy': 0.536, 'eval_dataset_2_precision': 0.5151515151515151, 'eval_dataset_2_recall': 0.9020408163265307, 'eval_dataset_2_f1': 0.655786350148368}
