In [12]:
!pip install pandas~=2.2.1 numpy~=1.26.4 torch~=2.2.2 scikit-learn~=1.4.1.post1 transformers~=4.39.3



In [13]:
import gc
import os
import os.path
import random
import re
from time import time
from typing import Tuple, Union, Dict

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ShuffleSplit
from torch import Tensor, optim, nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer

In [14]:
# from google.colab import drive

PATH_TO_DRIVE = "/content/drive/MyDrive/Colab Notebooks/Datasets/"
PATH_TO_DATASET = "dataset"

# drive.mount('/content/drive/', force_remount=True)

In [15]:
# zip_ref = zipfile.ZipFile(os.path.join(PATH_TO_DRIVE, "sc.zip"), 'r')
# zip_ref.extractall(PATH_TO_DRIVE)
# zip_ref.close()

# Utils and Device Settings

In [16]:
SEPARATOR = {"stars": "".join(["*"] * 100), "dashes": "".join(["-"] * 100), "dots": "".join(["."] * 100)}


def get_device(device_type: str) -> torch.device:
    if device_type == "cpu":
        print("\n Running on device 'cpu' \n")
        return torch.device("cpu")

    if re.match(r"\bcuda:\b\d+", device_type):
        if not torch.cuda.is_available():
            print("\n WARNING: running on cpu since device {} is not available \n".format(device_type))
            return torch.device("cpu")

        gc.collect()
        torch.cuda.empty_cache()
        print("\n Running on device '{}' \n".format(device_type))
        return torch.device(device_type)

    raise ValueError("ERROR: {} is not a valid device! Supported device are 'cpu' and 'cuda:n'".format(device_type))


def make_deterministic(seed: int):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False

In [17]:
DEVICE_TYPE = "cuda:0"
DEVICE = get_device(DEVICE_TYPE)





In [18]:
class LossTracker:

    def __init__(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def reset(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val: float, n: int = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class MetricsTracker:

    def __init__(self):
        super().__init__()
        self._predictions, self._true_labels = [], []
        self.__monitored_metrics = ["accuracy", "precision", "recall", "f1"]
        self._metrics, self._best_metrics = {}, {m: 0 for m in self.__monitored_metrics}

    def compute_metrics(self) -> Dict:
        predictions_flat = np.argmax(np.concatenate(self._predictions, axis=0), axis=1)
        true_labels_flat = np.concatenate(self._true_labels, axis=0)
        print(predictions_flat, true_labels_flat)
        self._metrics = {
            "accuracy": accuracy_score(true_labels_flat, predictions_flat),
            "precision": precision_score(true_labels_flat, predictions_flat, average='macro'),
            "recall": recall_score(true_labels_flat, predictions_flat, average='macro'),
            "f1": f1_score(true_labels_flat, predictions_flat, average='macro')
        }
        return self._metrics

    def update_best_metrics(self) -> Dict:
        self._best_metrics["mean"] = self._metrics["mean"]
        self._best_metrics["median"] = self._metrics["median"]
        self._best_metrics["trimean"] = self._metrics["trimean"]
        self._best_metrics["bst25"] = self._metrics["bst25"]
        self._best_metrics["wst25"] = self._metrics["wst25"]
        self._best_metrics["wst5"] = self._metrics["wst5"]
        return self._best_metrics

    def update(self, predictions: np.ndarray, true_labels: np.ndarray):
        self._predictions.append(predictions)
        self._true_labels.append(true_labels)

    def reset(self):
        self._predictions, self._true_labels = [], []

    def get_best_metrics(self) -> Dict:
        return self._best_metrics

# Data Handling

In [19]:
class CGTDataset(Dataset):

    def __init__(self, train: bool = True, fold_num: int = 0, data_type: str = "all"):
        self._train = train
        self.__data_type = data_type
        self._items = self.__build_dataset()
        ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
        train_idx, test_idx = [(i_train, i_test) for i_train, i_test in ss.split(range(len(self._items)))][fold_num]
        self.__fold_idx = train_idx if train else test_idx

    @staticmethod
    def __encode(text: str):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    def __make_item(self, dataset: pd.DataFrame):
        item = {}
        path2bytecode = os.path.join(PATH_TO_DATASET, "bytecode", dataset['fp_bytecode'].values[0] + ".hex")
        if (self.__data_type == "all" or self.__data_type == "bytecode") and os.path.exists(path2bytecode):
            item['path2bytecode'] = path2bytecode
        path2runtime = os.path.join(PATH_TO_DATASET, "runtime", dataset['fp_runtime'].values[0] + ".rt.hex")
        if (self.__data_type == "all" or self.__data_type == "runtime") and os.path.exists(path2runtime):
            item['path2runtime'] = path2runtime
        path2source = os.path.join(PATH_TO_DATASET, "source", dataset['fp_sol'].values[0] + ".sol")
        if (self.__data_type == "all" or self.__data_type == "source") and os.path.exists(path2source):
            item['path2source'] = path2source
        return item

    def __build_dataset(self):
        dataset = pd.read_csv(os.path.join(PATH_TO_DATASET, "consolidated.csv"), sep=";")
        gt, items = {"none": 0}, []
        for _, row in dataset.iterrows():
            prop = row["property"].lower() if row['property_holds'] == 't' else 'none'
            if prop not in gt.keys():
                gt[prop] = len(gt.values())
            item = self.__make_item(dataset)
            item["gt"] = gt[prop]
            items.append(item)
        return items

    def _load_f(self, path_to_item: str) -> Tensor:
        with open(path_to_item, 'r') as fp:
            return self.__encode(fp.read())

    def _load_input(self, index: int) -> Dict:
        item = self._items[index]
        return {k.split("path2")[-1]: self._load_f(v) for k, v in item.items() if "path2" in k}

    def _load_label(self, index: int) -> Tensor:
        return self._items[index]['gt']

    def __getitem__(self, index: int) -> Tuple[Dict, Tensor]:
        index = self.__fold_idx[index]
        x, y = self._load_input(index), self._load_label(index)
        return x, y

    def __len__(self) -> int:
        return len(self.__fold_idx)


class DataHandler:
    def __init__(self):
        self._dataset = CGTDataset

    def train_test_loaders(self, fold_num: int) -> Tuple:
        training_loader = self.get_loader(train=True, fold_num=fold_num)
        test_loader = self.get_loader(train=False, fold_num=fold_num)
        return training_loader, test_loader

    def get_loader(self, train: bool, fold_num: int) -> DataLoader:
        dataset = self._dataset(train, fold_num)
        # return DataLoader(dataset, batch_size=1, shuffle=train, num_workers=cpu_count(), drop_last=True)
        return DataLoader(dataset, batch_size=1, shuffle=train, drop_last=True)

# Model

In [20]:
class BERT(nn.Module):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=159)

    def forward(self, input_id: Tensor, attention_mask: Tensor, label: Tensor) -> Tensor:
        input_id = input_id.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        return self.__bert(input_ids=input_id, attention_mask=attention_mask, labels=label)


class ModelBERT:

    def __init__(self):
        super().__init__()
        self._device = DEVICE
        self._optimizer = None
        self._strategy = BERT().to(self._device)

    def predict(self, x: Dict, y: Tensor, *args, **kwargs) -> Union[Tensor, any]:
        x = x["source"]
        input_id, attention_mask = x["input_ids"], x["attention_mask"]
        return self._strategy(input_id, attention_mask, y)

    def optimize(self, x: Dict, y: Tensor) -> float:
        self._optimizer.zero_grad()
        pred = self.predict(x, y)
        loss = self.get_loss(pred)
        loss.backward()
        self._optimizer.step()
        return loss.item()

    @staticmethod
    def get_loss(pred: Tensor) -> Tensor:
        return pred.loss

    def log_strategy(self, path_to_log: str):
        open(os.path.join(path_to_log, "strategy.txt"), 'a+').write(str(self._strategy))

    def train_mode(self):
        self._strategy = self._strategy.train()

    def eval_mode(self):
        self._strategy = self._strategy.eval()

    def save(self, path_to_save: str):
        path_to_pth = os.path.join(path_to_save, "model.pth")
        print("\nSaving model at {}...".format(path_to_pth))
        torch.save(self._strategy.state_dict(), path_to_pth)
        print("... Model saved successfully!\n")

    def set_optimizer(self, learning_rate: float, optimizer_type: str = "adamw"):
        optimizers_map = {"adam": optim.Adam, "adamw": optim.AdamW, "rmsprop": optim.RMSprop, "sgd": optim.SGD}
        self._optimizer = optimizers_map[optimizer_type](self._strategy.parameters(), lr=learning_rate)

# Training

In [21]:
class TrainerBERT:
    def __init__(self, path_to_log: str, val_frequency: int = 5):
        self._device = DEVICE
        self._val_frequency = val_frequency
        self._path_to_log = path_to_log
        self._metrics_tracker = MetricsTracker()
        self._train_loss, self._val_loss = LossTracker(), LossTracker()
        self._best_val_loss, self._best_metrics = 100.0, self._metrics_tracker.get_best_metrics()
        self._path_to_metrics = str(os.path.join(path_to_log, "metrics.csv"))

    def __to_device(self, x: Dict, y: Tensor):
        return {k1: {k2: v2.to(self._device) for k2, v2 in v1.items()} for k1, v1 in x.items()}, y.to(self._device)

    def _train_epoch(self, model: ModelBERT, data: DataLoader, epoch: int, *args, **kwargs):
        for i, (x, y) in enumerate(data):
            if not "source" in x.keys():
                continue
            x, y = self.__to_device(x, y)
            tl = model.optimize(x, y)
            self._train_loss.update(tl)
            if i % 5 == 0:
                print("[ Epoch: {} - Batch: {} ] | Loss: {:.4f} ".format(epoch + 1, i, tl))

    def _eval_epoch(self, model: ModelBERT, data: DataLoader):
        for i, (x, y) in enumerate(data):
            x, y = self.__to_device(x, y)
            pred = model.predict(x, y)
            self._metrics_tracker.update(pred.logits, y.detach().numpy())
            vl = model.get_loss(pred).item()
            if i % 5 == 0:
                print("[ Batch: {} ] | Loss: {:.4f} ]".format(i, vl))

    def _check_metrics(self):
        """ Computes, prints and logs the current metrics using the metrics tracker """
        epoch_metrics = self._metrics_tracker.compute_metrics()
        self._print_metrics(epoch_metrics)
        self._log_metrics(epoch_metrics)

    def _log_metrics(self, metrics: Dict):
        log_data = pd.DataFrame({"train_loss": [self._train_loss.avg], "val_loss": [self._val_loss.avg],
                                 **{"best_" + k: [v] for k, v in self._best_metrics.items()},
                                 **{k: [v] for k, v in metrics.items()}})
        header = log_data.keys() if not os.path.exists(self._path_to_metrics) else False
        log_data.to_csv(self._path_to_metrics, mode='a', header=header, index=False)

    def _print_metrics(self, metrics: Dict):
        for mn, mv in metrics.items():
            print((" {} " + "".join(["."] * (15 - len(mn))) + " : {:.4f} (Best: {:.4f})")
                  .format(mn.capitalize(), mv, self._best_metrics[mn]))

    def train(self, model: ModelBERT, training_set: DataLoader, test_set: DataLoader, lr: float, epochs: int):
        """
        Trains the given model (a PyTorch nn.Module) for "epochs" epochs
        :param model: the model to be trained (a PyTorch nn.Module)
        :param training_set: the data loader containing the training data
        :param test_set: the data loader containing the validation/test data
        :param lr: a learning rate as base value for the optimizer
        :param epochs: the number of epochs the model should be trained for
        """
        model.log_strategy(self._path_to_log)
        model.set_optimizer(lr)

        for epoch in range(epochs):

            model.train_mode()
            self._train_loss.reset()
            self.print_heading("training", epoch, epochs)

            start = time()
            self._train_epoch(model, training_set, epoch)
            self.print_train_performance(train_time=time() - start)

            if epoch % self._val_frequency == 0:
                model.eval_mode()
                self._val_loss.reset()
                self._reset_metrics_tracker()
                self.print_heading("validating", epoch, epochs)

                start = time()
                with torch.no_grad():
                    self._eval_epoch(model, test_set)
                self.print_val_performance(val_time=time() - start)

                self._check_metrics()
                self._check_if_best_model(model)

    def _reset_metrics_tracker(self):
        """ Reset the metrics_tracker(s) zeroing out the running values """
        self._metrics_tracker.reset()

    def _check_if_best_model(self, model: ModelBERT):
        """
        Checks whether the provides model is the new best model based on the values of the validation loss.
        If yes, updates the best metrics and validation loss (as side effect) and saves the model to file
        :param model: the model to be possibly saved as new best model
        """
        if 0 < self._val_loss.avg < self._best_val_loss:
            self._best_val_loss = self._val_loss.avg
            self._best_metrics = self._metrics_tracker.update_best_metrics()
            print("\n -> Saving new best model...")
            model.save(self._path_to_log)

    def print_train_performance(self, train_time: float):
        """
        Prints the training time/loss for the most recent epoch
        :param train_time: the training time for the most recent epoch
        """
        print("\n" + SEPARATOR["stars"])
        print(" Train Time ... : {:.4f}".format(train_time))
        print(" Train Loss ... : {:.4f}".format(self._train_loss.avg))
        print(SEPARATOR["stars"])

    def print_val_performance(self, val_time: float):
        """
        Prints the validation time/loss for the most recent epoch
        :param val_time: the validation time for the most recent epoch
        """
        print("\n" + SEPARATOR["stars"])
        print(" Val Time ... : {:.4f}".format(val_time))
        print(" Val Loss ... : {:.4f}".format(self._val_loss.avg))
        print(SEPARATOR["stars"] + "\n")

    @staticmethod
    def print_heading(mode: str, epoch: int, epochs: int):
        print("\n" + SEPARATOR["dashes"])
        print("\t\t {} epoch {}/{}".format(mode.upper(), epoch + 1, epochs))
        print(SEPARATOR["dashes"] + "\n")

# Perform CV

In [22]:
make_deterministic(0)
lr, epochs, log_frequency = 0.0001, 5, 1

log_dir = "{}".format(time())
path_to_log = os.path.join("logs", log_dir)
os.makedirs(path_to_log)

for fold_num in range(5):
    print("\n Loading data for fold '{}'".format(fold_num + 1))
    train_loader, test_loader = DataHandler().train_test_loaders(fold_num)
    model = ModelBERT()

    print("\n" + SEPARATOR["dashes"])
    print("\t\t Training fold {}".format(fold_num + 1))
    print(SEPARATOR["dashes"] + "\n")

    trainer = TrainerBERT(path_to_log, log_frequency)
    trainer.train(model, train_loader, test_loader, lr, epochs)


 Loading data for fold '1'


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



----------------------------------------------------------------------------------------------------
		 Training fold 1
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
		 TRAINING epoch 1/5
----------------------------------------------------------------------------------------------------

[ Epoch: 1 - Batch: 0 ] | Loss: 5.3991 


KeyboardInterrupt: 