In [1]:
#INSTALL LIBRARIES------------------------------------------
!pip install transformers scikit-learn datasets wandb

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting wandb
  Downloading wandb-0.17.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from 

In [2]:
#IMPORTS-----------------------------
from pprint import pprint
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from torch.nn import Linear, ReLU
import pdb
import numpy as np, torch, random as rnd, torch.nn as nn, wandb
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import sys, os, json
from transformers import AutoModelForQuestionAnswering
from torch.nn.functional import cosine_similarity
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

from transformers import AutoTokenizer
import os, time



In [3]:


# adversarial = load_dataset("iperbole/adversarial_fever_nli")["test"]

# ds = load_dataset("tommasobonomo/sem_augmented_fever_nli")

# training_set = ds["train"]

# validation_set = ds["validation"]

# test_set = ds["test"]

# pprint(adversarial[0])
# pprint(training_set[0])

In [4]:

# import nltk
# from nltk.corpus import stopwords

# nltk.download('stopwords')
# print(stopwords.words('english'))



In [5]:
# f1 = "test per vedere come va"
# f2 = "questa è una prova"

# tokenized = tokenizer(f1+ tokenizer.eos_token + f2, return_tensors='pt', padding='max_length', max_length=40, return_token_type_ids=True)

# print(tokenized)
# print(tokenized["input_ids"].shape)
# print(type(tokenized))

# out = model(**tokenized)
# print(out['last_hidden_state'].shape)

# print(out.last_hidden_state.mean(dim=-1).squeeze().shape)

In [6]:
# Function to print a progress bar
def print_progress_bar(percentuale: float, lunghezza_barra: int = 30, text: str="") -> None:
    blocchi_compilati = int(lunghezza_barra * percentuale)
    barra = "[" + "=" * (blocchi_compilati - 1) + ">" + " " * (lunghezza_barra - blocchi_compilati) + "]"
    sys.stdout.write(f"\r{barra} {percentuale * 100:.2f}% complete " + text)
    sys.stdout.flush()

In [7]:
class NLIDataset(Dataset):



    def __init__(self, data, file_name, adversarial = False):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.encode_labels = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base').to(self.device)
        self.distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
        self.file_name = file_name
        self.adversarial = adversarial
        self.preprocess_function(data)
        self.tokenizer = None
        self.model = None



    def preprocess_function(self, examples):
        premises = []
        answers = []
        hypothesis = []
        similarities = {}
        ordered_similarities = []
        file_exists = os.path.isfile("data/" + self.file_name)
        if file_exists:
            with open("data/" + self.file_name, "r") as f:
                similarities = json.load(f)
        # Utilizza un ciclo for per popolare le tre liste
        #breakpoint()
        for i,example in enumerate(examples):

            print_progress_bar(i / len(examples), text=" | preprocessing")
            premises.append(example["premise"].strip())
            answers.append(self.encode_labels[example["label"]] )
            hypothesis.append(example["hypothesis"].strip())
            if not file_exists:
                s1 = self.embed_sentence(example["premise"].strip())
                s2 = self.embed_sentence(example["hypothesis"].strip())
                if self.adversarial:


                    similarities[str(example["cid"]) + example["hypothesis"]] = cosine_similarity(s1, s2).item()
                    ordered_similarities.append(similarities[str(example["cid"]) + example["hypothesis"]])
                else:
                    similarities[example["id"]] = cosine_similarity(s1, s2).item()
                    ordered_similarities.append(similarities[example["id"]])
            elif self.adversarial:
                ordered_similarities.append(similarities[str(example["cid"]) + example["hypothesis"]])
            else:
                ordered_similarities.append(similarities[example["id"]])
        #breakpoint()
        inputs = self.distilbert_tokenizer(
            hypothesis,
            premises,
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
            return_tensors="pt"
        )
        if not file_exists:
            if not os.path.exists("data"):
                os.makedirs("data")
            with open("data/" + self.file_name, "w") as f:
                json.dump(similarities, f, indent=4)
        inputs["label"] = torch.tensor(answers)
        inputs["similarity"] = torch.tensor(ordered_similarities)
        self.data = inputs


    def embed_sentence(self, sentence):
        # Tokenizza la frase
        inputs = self.tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        # Ottieni gli embedding dal modello
        with torch.no_grad():
            outputs = self.model(**inputs.to(self.device))
        # Usa l'output del modello come embedding (puoi usare altri livelli o combinazioni se preferisci)
        # Prendi il vettore medio (puoi anche scegliere il vettore della [CLS] token, ecc.)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings

    def __len__(self):
        return len(self.data["input_ids"])

    def __getitem__(self, idx):
        return self.data["input_ids"][idx], self.data["attention_mask"][idx], self.data["label"][idx], self.data["similarity"][idx]

    def collate(self, batch):
        #breakpoint()
        x = []
        attention_mask = []
        y = []
        z = []
        for x_batch, attention_mask_batch, y_batch, z_batch in batch:
            x.append(x_batch)
            attention_mask.append(attention_mask_batch)
            y.append(y_batch)
            z.append(z_batch)

        x = torch.stack(x)
        attention_mask = torch.stack(attention_mask)
        y = torch.stack(y)
        z = torch.stack(z)
        #breakpoint()
        # x = pad_sequence(x, batch_first=True)
        # attention_mask = pad_sequence(attention_mask, batch_first=True)
        return x.to(self.device),attention_mask.to(self.device), y.to(self.device), z.to(self.device)



    def get_dataloader(self, batch_size):
        return DataLoader(self, batch_size=batch_size, shuffle=True, collate_fn = self.collate)

















In [25]:
# prompt: create a torch model using roberta and a linear layer

class RobertaClassifier(nn.Module):
  def __init__(self, use_similarity, num_labels=3):
    super(RobertaClassifier, self).__init__()
    self.distilbert =  AutoModel.from_pretrained("distilbert/distilbert-base-uncased")
    self.linear = nn.Linear(self.distilbert.config.hidden_size+1, num_labels)
    self.use_similarity = use_similarity

  def freeze(self, epoch):
    if epoch == 2:
        freeze_until_layer = 99999
    else:
        freeze_until_layer = None  # Non congela nulla

    # Congela i layer di DistilBERT
    if freeze_until_layer is not None:
        for idx, param in enumerate(self.distilbert.parameters()):
            if idx < freeze_until_layer:
                param.requires_grad = False
            else:
                param.requires_grad = True



  def forward(self, input_ids, attention_mask, similarities):

   # breakpoint()
    if not self.use_similarity:
        similarities = torch.zeros(input_ids.shape[0]).to('cuda' if torch.cuda.is_available() else 'cpu')

    outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    #breakpoint()

    pooled_output = torch.cat((outputs.last_hidden_state[:, 0, :], similarities.unsqueeze(1)), dim=1)
    # outputs.last_hidden_state.mean(dim=-1)

    logits = self.linear(pooled_output)

    return logits


In [20]:


class Trainer():

    def __init__(self, model,train_dataloader, validation_dataloader, optimizer, loss_function, device, test_dataloader=None):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.validation_dataloader = validation_dataloader
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.device = device
        self.test_dataloader = test_dataloader


    @staticmethod
    def evaluation_parameters(y_true, y_pred):
        #breakpoint()
        y_pred = np.argmax(y_pred, axis=1)
        cm = confusion_matrix(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted')
        recall = recall_score(y_true, y_pred, average='weighted')
        f1 = f1_score(y_true, y_pred, average='weighted')
        accuracy = accuracy_score(y_true, y_pred)
        return cm, precision, recall, f1, accuracy

    @staticmethod
    def format_time_delay(seconds):
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        seconds = seconds % 60
        return hours, minutes, seconds


    def train(self, epochs: int, use_wandb: bool = False, config: dict = {}, name: str="", target_f1: float=0.0):
        start_time = time.time()
        best_model = None
        save = False
        if use_wandb:
            wandb.init(
                # Set the project where this run will be logged
                project="nlphw2",
                name=name,
                # Track hyperparameters and run metadata
                config=config
            )
        validation_loss, precision, recall, f1, accuracy = self.validate(use_wandb)
        total_loss = validation_loss
        if use_wandb:
                wandb.log({"validation_loss": validation_loss,
                      "precision": precision,
                      "recall": recall,
                      "f1": f1,
                      "accuracy": accuracy,
                      "train_loss": total_loss / len(self.train_dataloader)})
        for epoch in range(epochs):

            time_delay = time.time() - start_time
            hours, minutes, seconds = self.format_time_delay(time_delay)
            print(f"\nTempo trascorso: {hours} ore, {minutes} minuti, {seconds} secondi")
            self.model.freeze(epoch)
            self.model.train()  # Set the model to training mode
            total_loss = 0
            #breakpoint()
            for i, batch in enumerate(self.train_dataloader):
                print_progress_bar(i / len(self.train_dataloader), text=f" | training epoch {epoch}")
                # Get the inputs and targets from the batch
                inputs, mask, targets, similarities = batch

                # Zero the gradients
                self.optimizer.zero_grad()
                # Forward pass
                outputs = self.model(inputs, mask, similarities)
                #print("outputs = ", outputs,"\ntargets = ", targets)
                #breakpoint()
                # Compute loss
                loss = self.loss_function(outputs, targets)
                #print(loss)
                # Backward pass and optimize
                loss.backward()
                self.optimizer.step()
                # Accumulate the total loss
                total_loss += loss.item()

            # Print the average loss for this epoch
            validation_loss, precision, recall, f1, accuracy = self.validate(use_wandb)
            if f1 > target_f1:
                best_model = self.model.state_dict()
                target_f1 = f1
                save = True
            if use_wandb:
                wandb.log({"validation_loss": validation_loss,
                      "precision": precision,
                      "recall": recall,
                      "f1": f1,
                      "accuracy": accuracy,
                      "train_loss": total_loss / len(self.train_dataloader)})
        if save:
            torch.save(best_model, name + f'-{target_f1}.pth')
        print(target_f1)
        if use_wandb:
            wandb.finish()



    def validate(self, use_wandb: bool = False, test=False):
        dataloader = self.test_dataloader if test else self.validation_dataloader
        if dataloader is None:
            print("empty dataloader!")
            exit(1)
        self.model.eval()  # Set the model to evaluation mode
        total_loss = 0
        all_predictions = torch.tensor([])
        all_targets = torch.tensor([])
        with torch.no_grad():  # Do not calculate gradients
            for i, batch in enumerate(self.validation_dataloader):
                print_progress_bar(i / len(dataloader), text=" | validation")
                # Get the inputs and targets from the batch
                inputs, mask, targets, similarities  = batch

                # Forward pass
                outputs = self.model(inputs, mask, similarities)
                # Compute loss
                #breakpoint()
                loss = self.loss_function(outputs, targets)
                # Accumulate the total loss
                total_loss += loss.item()
                # Store predictions and targets
                all_predictions = torch.cat((all_predictions, outputs.squeeze().round().cpu()))
                all_targets = torch.cat((all_targets, targets.cpu()))
        validation_loss = total_loss / len(self.validation_dataloader)
        #breakpoint()
        cm, precision, recall, f1, accuracy = self.evaluation_parameters(all_targets, all_predictions)
        return validation_loss, precision, recall, f1, accuracy




In [10]:
class BaselineStratifiedModel(nn.Module):

    def __init__(self, len0, len1):
        super(BaselineStratifiedModel, self).__init__()
        self.p = len0/(len0+len1)

    def forward(self, x):
        return torch.tensor([0 if np.random.rand() < self.p else 1 for _ in range(x[0].shape[0])], dtype=torch.float)



In [11]:
new_seed = 108
def set_seed(seed):
    np.random.seed(seed)
    rnd.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True # Se stai usando GPU
    return seed, seed+1

In [12]:


# Imposta il seed per avere riproducibilità

seed, new_seed = set_seed(new_seed)

In [13]:

adversarial = load_dataset("iperbole/adversarial_fever_nli")["test"]

ds = load_dataset("tommasobonomo/sem_augmented_fever_nli")

training_set = ds["train"]

validation_set = ds["validation"]

test_set = ds["test"]

wandb.login(key='aaf831dabc88d936d4e6b439b798bb4cb42814ea')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/337 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.25M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51086 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2288 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2287 [00:00<?, ? examples/s]

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [14]:
!git clone https://github.com/monteleone-1883922/hw2_nlp.git
os.chdir("hw2_nlp")
!git checkout huggingFaceBase
!mv data ./../data
os.chdir("..")
!rm -rf hw2_nlp


Cloning into 'hw2_nlp'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (81/81), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 81 (delta 46), reused 14 (delta 3), pack-reused 0[K
Receiving objects: 100% (81/81), 738.24 KiB | 12.30 MiB/s, done.
Resolving deltas: 100% (46/46), done.
Branch 'huggingFaceBase' set up to track remote branch 'huggingFaceBase' from 'origin'.
Switched to a new branch 'huggingFaceBase'


In [15]:
train_dataset = NLIDataset(training_set, "train_similarities.json")

validation_dataset = NLIDataset(validation_set, "validation_similarities.json")

test_dataset = NLIDataset(test_set, "test_similarities.json")

adversarial_dataset = NLIDataset(adversarial, "adversarial_similarities.json", adversarial=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




In [16]:
# prompt: genera i dataloader

train_dataloader = train_dataset.get_dataloader(batch_size=50)
validation_dataloader = validation_dataset.get_dataloader(batch_size=50)
test_dataloader = test_dataset.get_dataloader(batch_size=32)
adversarial_dataloader = adversarial_dataset.get_dataloader(batch_size=32)


In [26]:

model = RobertaClassifier(False)

trainer = Trainer(model, train_dataloader, validation_dataloader, torch.optim.Adam(model.parameters(), lr=1e-5), nn.CrossEntropyLoss(),'cuda' if torch.cuda.is_available() else 'cpu', test_dataloader)



In [18]:
trainer.validate(test=False)



  _warn_prf(average, modifier, msg_start, len(result))


(1.107908668725387,
 0.23084976293831394,
 0.33916083916083917,
 0.18635372958499985,
 0.33916083916083917)

In [27]:

trainer.train(6, use_wandb=True, name="base_1e-5_"+ str(seed))




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁██
f1,▁██
precision,▁██
recall,▁██
train_loss,▁█▆
validation_loss,█▁▁

0,1
accuracy,0.73077
f1,0.71741
precision,0.73037
recall,0.73077
train_loss,0.36482
validation_loss,0.7642


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112953099998574, max=1.0…

Tempo trascorso: 0.0 ore, 0.0 minuti, 43.93702507019043 secondi
[>                              ] 0.00% complete  | training epoch 0

  _warn_prf(average, modifier, msg_start, len(result))


Tempo trascorso: 0.0 ore, 28.0 minuti, 12.716667652130127 secondi
Tempo trascorso: 0.0 ore, 55.0 minuti, 41.910176038742065 secondi
Tempo trascorso: 1.0 ore, 7.0 minuti, 6.8750526905059814 secondi
Tempo trascorso: 1.0 ore, 18.0 minuti, 29.66701030731201 secondi
Tempo trascorso: 1.0 ore, 29.0 minuti, 53.00909161567688 secondi


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁██████
f1,▁██████
precision,▁██████
recall,▁██████
train_loss,▁█▆▅▅▅▅
validation_loss,█▁▁▂▂▂▂

0,1
accuracy,0.7382
f1,0.72746
precision,0.73668
recall,0.7382
train_loss,0.2977
validation_loss,0.73845


In [None]:
# del training_set

# dataset = NLIDataset(validation_set)


# dataloader = dataset.get_dataloader(batch_size=32, pos_num=1)

# model = RobertaClassifier()

# batch = next(iter(dataloader))

# model(batch[0], batch[1], batch[3])