In [1]:
!pip install wandb fasttext plotly

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.45.0-py2.py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import fasttext.util
import plotly.express as px
import sys
import wandb
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.functional import sigmoid
import pdb
from torch.optim import Adam


In [4]:
# "neutrale", "odio" aaf831dabc88d936d4e6b439b798bb4cb42814ea

ROOT_PATH = "/content/drive/MyDrive/uni/nlp/nlp2024-hw1-b"



In [None]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmonteleone-1883922[0m ([33mmonteleone[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:


class HaSpeeDe_Dataset(Dataset):


    def __init__(self, data_path: str, data: list[tuple[list, int]]=None, use_embeddings: bool=False, stopwords_file_path: str="", device="cpu") -> None:
        self.device = device
        if data is not None:
            self.data = data
        else:
            if stopwords_file_path != "":
                with open(stopwords_file_path, 'r', encoding="UTF8") as f:
                    stopwords = f.readlines() #controllare carattere di andare a capo
            if use_embeddings:
                fasttext.util.download_model('it', if_exists='ignore')
                embeddings = fasttext.load_model('cc.it.300.bin')
            self.data = []
            with open(data_path, 'r', encoding="UTF8") as f:
                for line in f:
                    item = json.loads(line)
                    sentence = item['text'].split()
                    if stopwords_file_path != "":
                        sentence = [word for word in sentence if word not in stopwords]
                    if use_embeddings:
                        embedded_sentence = []
                        for word in sentence:
                            embedded_sentence.append(embeddings.get_word_vector(word))
                        sentence = embedded_sentence
                    self.data.append((sentence, item['label']))

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[list, int]:
        return self.data[idx]

    def split(self, prc: float) -> list[tuple[list, int]]:
        validation_size = int(prc * len(self.data))
        train_size = len(self.data) - validation_size
        validation_data, self.data = torch.utils.data.random_split(self.data, [validation_size, train_size])
        return validation_data


    def collate(self, batch: list[tuple[list, int]]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        texts, labels = zip(*batch)
        lens = [len(text) for text in texts]
        texts = pad_sequence([torch.tensor(text) for text in texts], batch_first=True)
        return texts.to(self.device), torch.tensor(labels, dtype=torch.float).to(self.device), torch.tensor(lens).to(self.device)

    def get_dataloader(self, batch_size: int, shuffle: bool) -> DataLoader:
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate)


    def print_data_analysis(self):
        y= [0,0]
        for el in self.data:
            if el[1] == 1:
                y[1] += 1
            else:
                y[0] += 1
        self.neutral_count = y[0]
        self.hateful_count = y[1]
        fig = px.bar(x=["neutrale", "odio"], y=y)
        fig.show()






In [6]:
# Function to print a progress bar
def print_progress_bar(percentuale: float, lunghezza_barra: int = 30) -> None:
    blocchi_compilati = int(lunghezza_barra * percentuale)
    barra = "[" + "=" * (blocchi_compilati - 1) + ">" + " " * (lunghezza_barra - blocchi_compilati) + "]"
    sys.stdout.write(f"\r{barra} {percentuale * 100:.2f}% complete")
    sys.stdout.flush()

In [15]:


class Trainer():

    def __init__(self, model,train_dataloader, validation_dataloader, optimizer, loss_function, device):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.validation_dataloader = validation_dataloader
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.device = device


    @staticmethod
    def evaluation_parameters(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred).ravel()
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        return cm, precision, recall, f1, accuracy


    def train(self, epochs: int, use_wandb: bool = False, config: dict = {}, name: str=""):
        if use_wandb:
            wandb.init(
                # Set the project where this run will be logged
                project="nlp-hw-1b",
                name=name,
                # Track hyperparameters and run metadata
                config=config
            )
        for epoch in range(epochs):
            self.model.train()  # Set the model to training mode
            total_loss = 0
            #breakpoint()
            for i, batch in enumerate(self.train_dataloader):
                print_progress_bar(i / len(self.train_dataloader))
                # Get the inputs and targets from the batch
                inputs, targets, lens = batch
                # Zero the gradients
                self.optimizer.zero_grad()
                # Forward pass
                outputs = self.model((inputs, lens))
                breakpoint()
                # Compute loss
                loss = self.loss_function(outputs, targets)
                # Backward pass and optimize
                loss.backward()
                self.optimizer.step()
                # Accumulate the total loss
                total_loss += loss.item()

            # Print the average loss for this epoch
            if use_wandb:
                wandb.log({"train_loss": total_loss / len(self.train_dataloader)})
            self.validate(use_wandb)


    def validate(self, use_wandb: bool = False):
        self.model.eval()  # Set the model to evaluation mode
        total_loss = 0
        all_predictions = torch.tensor([], device=self.device)
        all_targets = torch.tensor([], device=self.device)
        with torch.no_grad():  # Do not calculate gradients
            for i, batch in enumerate(self.validation_dataloader):
                print_progress_bar(i / len(self.validation_dataloader))
                # Get the inputs and targets from the batch
                inputs, targets, lens = batch
                # Forward pass
                outputs = self.model((inputs,lens))
                # Compute loss
                loss = self.loss_function(outputs, targets)
                # Accumulate the total loss
                total_loss += loss.item()
                # Store predictions and targets
                all_predictions = torch.cat((all_predictions, outputs.squeeze().round()))
                all_targets = torch.cat((all_targets, targets))
        validation_loss = total_loss / len(self.validation_dataloader)
        precision = precision_score(all_targets, all_predictions)
        recall = recall_score(all_targets, all_predictions)
        f1 = f1_score(all_targets, all_predictions)
        accuracy = accuracy_score(all_targets, all_predictions)
        if use_wandb:

            wandb.log({"validation_loss": validation_loss,
                      "precision": precision,
                      "recall": recall,
                      "f1": f1,
                      "accuracy": accuracy})

        return validation_loss, precision, recall, f1, accuracy




In [27]:


class BaselineStratifiedModel(nn.Module):

    def __init__(self, len0, len1):
        super(BaselineStratifiedModel, self).__init__()
        self.p = len0/(len0+len1)

    def forward(self, x):
        return torch.tensor([0 if np.random.rand() < self.p else 1 for _ in range(x[0].shape[0])], dtype=torch.float)



class BaselineSimpleModel(nn.Module):

    def __init__(self, input_size, output_size):
        super(BaselineSimpleModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        # for param in self.linear.parameters():
        #     param.requires_grad = True

    def forward(self, x):
        #breakpoint()
        seq, lens = x
        packed = pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)
        data = self.linear(packed.data).squeeze()
        data = sigmoid(data)
        packed = torch.nn.utils.rnn.PackedSequence(data, packed.batch_sizes, packed.sorted_indices, packed.unsorted_indices)
        seq, lens = pad_packed_sequence(packed, batch_first=True)
        seq_mean = seq.sum(dim=1) / lens.float() #torch.tensor([sum(el)/len(el) for el in seq])
        breakpoint()
        return seq_mean





In [9]:

train_dataset = HaSpeeDe_Dataset(ROOT_PATH + "/train-taskA.jsonl", use_embeddings=True)
train_dataset.print_data_analysis()

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz





In [11]:
val_data = train_dataset.split(0.2)
val_dataset = HaSpeeDe_Dataset("", data=val_data)

In [28]:
model = BaselineSimpleModel(300,1)

train_loader = train_dataset.get_dataloader(64, False)
val_loader = val_dataset.get_dataloader(64, False)
trainer = Trainer(model,train_loader, val_loader, Adam(model.parameters()), nn.BCELoss(), "cpu")

In [None]:

baseline_model = BaselineStratifiedModel(train_dataset.neutral_count, train_dataset.hateful_count)


trainer_baseline = Trainer(baseline_model,train_loader, val_loader, Adam(model.parameters()), nn.BCELoss(), "cpu")




In [None]:

validation_loss, precision, recall, f1, accuracy = trainer_baseline.validate()





In [29]:


trainer.train(20, use_wandb=True , name="1st-simple-baseline")


[>                              ] 0.00% complete> [0;32m<ipython-input-27-9cf12581d01a>[0m(30)[0;36mforward[0;34m()[0m
[0;32m     28 [0;31m        [0mseq_mean[0m [0;34m=[0m [0mseq[0m[0;34m.[0m[0msum[0m[0;34m([0m[0mdim[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m [0;34m/[0m [0mlens[0m[0;34m.[0m[0mfloat[0m[0;34m([0m[0;34m)[0m [0;31m#torch.tensor([sum(el)/len(el) for el in seq])[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     29 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 30 [0;31m        [0;32mreturn[0m [0mseq_mean[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     31 [0;31m[0;34m[0m[0m
[0m[0;32m     32 [0;31m[0;34m[0m[0m
[0m
ipdb> c
> [0;32m<ipython-input-15-ebd120a98eb7>[0m(45)[0;36mtrain[0;34m()[0m
[0;32m     43 [0;31m                [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     44 [0;31m                [0;31m# Compute loss[0m[0;34m[0m[0;34m

In [None]:


print("Validation Loss:", validation_loss)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Validation Loss: 46.73604254289107
Precision: 0.4165103189493433
Recall: 0.4014466546112116
F1 Score: 0.4088397790055248
Accuracy: 0.5303584491587418


In [None]:
wandb.finish()