In [1]:
!pip install wandb fasttext plotly



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import fasttext.util
import plotly.express as px
import sys
import wandb
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.functional import sigmoid
import pdb
from torch.optim import Adam, AdamW
import random as rnd
import nltk
nltk.download('punkt')
import torch.nn.init as init

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def set_seed(seed):
    np.random.seed(seed)
    rnd.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # Se stai usando GPU

# Imposta il seed per avere riproducibilità
seed = 42
set_seed(seed)

In [5]:
# "neutrale", "odio" aaf831dabc88d936d4e6b439b798bb4cb42814ea

ROOT_PATH = "/content/drive/MyDrive/uni/nlp/nlp2024-hw1-b"



In [6]:
wandb.login(key='aaf831dabc88d936d4e6b439b798bb4cb42814ea')

[34m[1mwandb[0m: Currently logged in as: [33mmonteleone-1883922[0m ([33mmonteleone[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:


class HaSpeeDe_Dataset(Dataset):


    def __init__(self, data_path: str, data: list[tuple[list, int]]=None, use_embeddings: bool=False, stopwords_file_path: str="", device="cpu", encoder=None) -> None:
        self.device = device
        use_encoder = True
        if encoder is None:
            encoder = {}
            use_encoder = False
            j=0
            new_embeddings = {}
        if data is not None:
            self.data = data
        else:
            stopwords = []
            if stopwords_file_path != "":
                with open(stopwords_file_path, 'r', encoding="UTF8") as f:
                    stopwords = f.readlines() #controllare carattere di andare a capo
            if use_embeddings:
                #fasttext.util.download_model('it', if_exists='ignore')
                embeddings = fasttext.load_model(ROOT_PATH + '/cc.it.300.bin')
            self.data = []
            with open(data_path, 'r', encoding="UTF8") as f:
                for line in f:
                    item = json.loads(line)
                    sentence = nltk.word_tokenize(item['text'], language='italian')
                    filtered_sentence = []
                    i=0
                    while i < len(sentence):
                        word = sentence[i]
                        if (word == "#" or word == "@") and i+1 < len(sentence):
                            word = word + sentence[i+1]
                            i+=1
                        if stopwords_file_path == "" or word not in stopwords:
                            filtered_sentence.append(word)
                        i+=1

                    if use_embeddings:
                        embedded_sentence = []
                        for word in sentence:
                            if word not in encoder and not use_encoder:
                                encoder[word] = j
                                j+=1
                                new_embeddings[j] = embeddings.get_word_vector(word)
                            elif word not in encoder:
                                embedded_sentence.append(encoder["<UNK>"])
                            else:
                                embedded_sentence.append(encoder[word])
                        sentence = embedded_sentence
                    self.data.append((sentence, item['label']))
            if not use_encoder:
                encoder["<UNK>"] = j
                self.encoding = encoder
                new_embeddings[j] = embeddings.get_word_vector("<UNK>")
                self.embeddings = new_embeddings

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[list, int]:
        return self.data[idx]

    def split(self, prc: float) -> list[tuple[list, int]]:
        validation_size = int(prc * len(self.data))
        train_size = len(self.data) - validation_size
        validation_data, self.data = torch.utils.data.random_split(self.data, [validation_size, train_size])
        return validation_data


    def collate(self, batch: list[tuple[list, int]]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        texts, labels = zip(*batch)
        lens = [len(text) for text in texts]
        texts = pad_sequence([torch.tensor(text) for text in texts], batch_first=True)
        return texts.to(self.device), torch.tensor(labels, dtype=torch.float).to(self.device), torch.tensor(lens).to(self.device)

    def get_dataloader(self, batch_size: int, shuffle: bool) -> DataLoader:
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate)


    def print_data_analysis(self):
        y= [0,0]
        for el in self.data:
            if el[1] == 1:
                y[1] += 1
            else:
                y[0] += 1
        self.neutral_count = y[0]
        self.hateful_count = y[1]
        fig = px.bar(x=["neutrale", "odio"], y=y)
        fig.show()






In [8]:
# Function to print a progress bar
def print_progress_bar(percentuale: float, lunghezza_barra: int = 30, text: str="") -> None:
    blocchi_compilati = int(lunghezza_barra * percentuale)
    barra = "[" + "=" * (blocchi_compilati - 1) + ">" + " " * (lunghezza_barra - blocchi_compilati) + "]"
    sys.stdout.write(f"\r{barra} {percentuale * 100:.2f}% complete " + text)
    sys.stdout.flush()

In [9]:


class Trainer():

    def __init__(self, model,train_dataloader, validation_dataloader, optimizer, loss_function, device, test_dataloader=None):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.validation_dataloader = validation_dataloader
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.device = device
        self.test_dataloader = test_dataloader


    @staticmethod
    def evaluation_parameters(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred).ravel()
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        return cm, precision, recall, f1, accuracy


    def train(self, epochs: int, use_wandb: bool = False, config: dict = {}, name: str="", target_f1: float=0.0):
        best_model = None
        save = False
        if use_wandb:
            wandb.init(
                # Set the project where this run will be logged
                project="nlp-hw-1b",
                name=name,
                # Track hyperparameters and run metadata
                config=config
            )
        for epoch in range(epochs):
            self.model.train()  # Set the model to training mode
            total_loss = 0
            #breakpoint()
            for i, batch in enumerate(self.train_dataloader):
                print_progress_bar(i / len(self.train_dataloader), text=f"| training epoch {epoch}")
                # Get the inputs and targets from the batch
                inputs, targets, lens = batch

                # Zero the gradients
                self.optimizer.zero_grad()
                # Forward pass
                outputs = self.model((inputs, lens))
                #print("outputs = ", outputs,"\ntargets = ", targets)
                #breakpoint()
                # Compute loss
                loss = self.loss_function(outputs, targets)
                #print(loss)
                # Backward pass and optimize
                loss.backward()
                self.optimizer.step()
                # Accumulate the total loss
                total_loss += loss.item()

            # Print the average loss for this epoch
            validation_loss, precision, recall, f1, accuracy = self.validate(use_wandb)
            if f1 > target_f1:
                best_model = self.model.state_dict()
                target_f1 = f1
                save = True
            if use_wandb:
                wandb.log({"validation_loss": validation_loss,
                      "precision": precision,
                      "recall": recall,
                      "f1": f1,
                      "accuracy": accuracy,
                      "train_loss": total_loss / len(self.train_dataloader)})
        if save:
            torch.save(best_model, name + f'{target_f1}.pth')
        print(target_f1)
        if use_wandb:
            wandb.finish()



    def validate(self, use_wandb: bool = False, test=False):
        dataloader = self.test_dataloader if test else self.validation_dataloader
        if dataloader is None:
            print("empty dataloader!")
            exit(1)
        self.model.eval()  # Set the model to evaluation mode
        total_loss = 0
        all_predictions = torch.tensor([])
        all_targets = torch.tensor([])
        with torch.no_grad():  # Do not calculate gradients
            for i, batch in enumerate(self.validation_dataloader):
                print_progress_bar(i / len(dataloader), text="| validation")
                # Get the inputs and targets from the batch
                inputs, targets, lens = batch

                # Forward pass
                outputs = self.model((inputs,lens))
                # Compute loss
                #breakpoint()
                loss = self.loss_function(outputs, targets)
                # Accumulate the total loss
                total_loss += loss.item()
                # Store predictions and targets
                all_predictions = torch.cat((all_predictions, outputs.squeeze().round().cpu()))
                all_targets = torch.cat((all_targets, targets.cpu()))
        validation_loss = total_loss / len(self.validation_dataloader)
        #breakpoint()
        precision = precision_score(all_targets, all_predictions)
        recall = recall_score(all_targets, all_predictions)
        f1 = f1_score(all_targets, all_predictions)
        accuracy = accuracy_score(all_targets, all_predictions)
        return validation_loss, precision, recall, f1, accuracy




In [10]:


class BaselineStratifiedModel(nn.Module):

    def __init__(self, len0, len1):
        super(BaselineStratifiedModel, self).__init__()
        self.p = len0/(len0+len1)

    def forward(self, x):
        return torch.tensor([0 if np.random.rand() < self.p else 1 for _ in range(x[0].shape[0])], dtype=torch.float)



class BaselineSimpleModel(nn.Module):

    def __init__(self, input_size, output_size):
        super(BaselineSimpleModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        # for param in self.linear.parameters():
        #     param.requires_grad = True

    def forward(self, x):
        #breakpoint()
        seq, lens = x
        packed = pack_padded_sequence(seq, lens, batch_first=True, enforce_sorted=False)
        data = self.linear(packed.data).squeeze()
        data = sigmoid(data)
        packed = torch.nn.utils.rnn.PackedSequence(data, packed.batch_sizes, packed.sorted_indices, packed.unsorted_indices)
        seq, lens = pad_packed_sequence(packed, batch_first=True)
        seq_mean = seq.sum(dim=1) / lens.float() #torch.tensor([sum(el)/len(el) for el in seq])
        #breakpoint()
        return seq_mean





In [11]:

class HateDetectionModule(nn.Module):

    def __init__(self, input_size, hidden_size,sizes, dropout=0, lstm_layers=1,embeddings=None):
        super(HateDetectionModule,self).__init__()
        self.init_classifier(sizes,dropout)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, dropout=dropout, bidirectional=True, num_layers=lstm_layers)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)

        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                init.xavier_uniform_(param)

        self.embeddings = nn.Embedding(len(embeddings), 300)
        self.embeddings.weight.data.copy_(embeddings)


    def init_classifier(self,sizes, dropout=0):
        sequence = []
        for i in range(len(sizes)-1):
            sequence += [nn.Linear(sizes[i],sizes[i+1]), nn.ReLU(), nn.Dropout(dropout)]
        self.classifier = nn.Sequential(*sequence[:-2])
        for name, param in self.classifier.named_parameters():
            if 'weight' in name:
                init.xavier_uniform_(param)


    def forward(self,x):

        input,lengths = x


        embeddings = self.embeddings(input)
        embeddings = self.dropout(embeddings)

        lens = lengths.cpu()
        #breakpoint()
        padded = pack_padded_sequence(embeddings,lens, batch_first=True,enforce_sorted=False)

        #breakpoint()

        o, (h, c) = self.lstm(padded)
        #breakpoint()

        out = torch.cat((h[-2,:,:], h[-1,:,:]), dim = 1) #pad_packed_sequence(o,batch_first=True)[0].squeeze()

        output = self.classifier(out).squeeze()
        return self.sigmoid(output)




In [41]:

train_dataset = HaSpeeDe_Dataset(ROOT_PATH + "/train-taskA.jsonl", use_embeddings=True)#, stopwords_file_path=ROOT_PATH + "/stopwords-it.txt")
train_dataset.print_data_analysis()
encoder = train_dataset.encoding
embeddings = train_dataset.embeddings



In [42]:
val_data = train_dataset.split(0.2)
val_dataset = HaSpeeDe_Dataset("", data=val_data)

test_dataset = HaSpeeDe_Dataset(ROOT_PATH + "/test-news-taskA.jsonl", use_embeddings=True)#, stopwords_file_path=ROOT_PATH + "/stopwords-it.txt", encoder=encoder) #/test-tweets-taskA.jsonl




In [43]:
device = "cuda"


train_dataset.device = device
val_dataset.device = device
test_dataset.device = device

train_loader = train_dataset.get_dataloader(64, True)
val_loader = val_dataset.get_dataloader(64, True)
test_loader = test_dataset.get_dataloader(64, True)

In [15]:
model = BaselineSimpleModel(300,1)


trainer = Trainer(model,train_loader, val_loader, Adam(model.parameters(), lr=2e-1, weight_decay=1e-4), nn.BCELoss(), device,test_dataloader=test_loader)

In [44]:
sizes = [1024, 2000, 500, 1000,1]
embeddings = train_dataset.embeddings
embeddings = sorted(list(embeddings.items()), key=lambda x: x[0])
embeddings = torch.tensor([x[1] for x in embeddings])
print(embeddings.shape)
hate_detector = HateDetectionModule( 300, 512,sizes, dropout=0.2, lstm_layers=2, embeddings=embeddings)


hate_trainer = Trainer(hate_detector, train_loader, val_loader, AdamW(hate_detector.parameters(), lr=5e-4), nn.BCELoss(), device,test_dataloader=test_loader)

torch.Size([24590, 300])


In [45]:
wandb.finish()
hate_trainer.train(10, use_wandb=True , name="bilstm-2lstm-4lin-lr5e-4-drop.2-512", config={"classifier": sizes})





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁▇█▇▇▇▆▅▆▅
f1,▃▅█▇▆▇▅▅▄▁
precision,▁█▇▆▇▆▆▅▇▇
recall,█▄▇▆▆▆▅▅▃▁
train_loss,█▇▅▄▃▂▂▂▁▁
validation_loss,▂▁▁▂▃▅▄▃█▆

0,1
accuracy,0.7425
f1,0.66219
precision,0.70408
recall,0.625
train_loss,0.0314
validation_loss,1.20892


In [None]:
model_name = ''
hate_detector.load_state_dict(torch.load(model_name))
wandb.finish()
hate_trainer.train(10, use_wandb=True , name="bilstm-2lstm-4lin-lr5e-4-drop.2-512", config={"classifier": sizes})

In [None]:

baseline_model = BaselineStratifiedModel(train_dataset.neutral_count, train_dataset.hateful_count)


trainer_baseline = Trainer(baseline_model,train_loader, val_loader, Adam(model.parameters()), nn.BCELoss(), "cpu",test_dataloader=test_loader)




In [None]:

validation_loss, precision, recall, f1, accuracy = trainer_baseline.validate()





In [None]:


trainer.train(30, use_wandb=True , name="simple-baseline-lr0.2")




KeyboardInterrupt: 

In [None]:


print("Validation Loss:", validation_loss)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Validation Loss: 46.73604254289107
Precision: 0.4165103189493433
Recall: 0.4014466546112116
F1 Score: 0.4088397790055248
Accuracy: 0.5303584491587418


In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))