In [1]:
from google.colab import drive
import json

# Mount Google Drive into the Colab filesystem so files under
# /content/drive can be opened with ordinary paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import numpy as np
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import fasttext.util
from torch.nn.utils.rnn import pad_sequence



In [None]:
# Label names (Italian): "neutrale" = neutral, "odio" = hate.
# NOTE(review): presumably the two target classes; confirm how they map to the integer labels.

# Base directory on Google Drive (under the /content/drive mount point);
# presumably where the project files live — confirm before relying on it.
ROOT_PATH = "/content/drive/MyDrive/uni/nlp/nlp2024-hw1-b"



In [None]:


class HaSpeeDe_Dataset(Dataset):
    """In-memory dataset of ``(sentence, label)`` pairs.

    Samples are either supplied directly or read from a JSON Lines file in
    which every line is an object with a ``'text'`` and a ``'label'`` key.
    The text is split on whitespace into tokens; with ``use_embeddings=True``
    each token is replaced by the vector the Italian fastText model
    (``cc.it.300.bin``) returns for it.
    """

    def __init__(self, data_path: str, data: list[tuple[list, int]]=None, use_embeddings: bool=False) -> None:
        """Build the dataset from ready-made samples or from a file.

        Args:
            data_path: Path of the JSON Lines file. Ignored when ``data`` is given.
            data: Pre-built ``(sentence, label)`` samples; when not ``None``
                they are used as-is and nothing is read from disk.
            use_embeddings: If true, download (when missing) and load the
                fastText model and store word vectors instead of raw tokens.
        """
        if data is not None:
            self.data = data
            return

        embeddings = None
        if use_embeddings:
            fasttext.util.download_model('it', if_exists='ignore')
            embeddings = fasttext.load_model('cc.it.300.bin')

        self.data = []
        with open(data_path, 'r', encoding="UTF8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
                if not line.strip():
                    continue
                item = json.loads(line)
                sentence = item['text'].split()
                if embeddings is not None:
                    sentence = [embeddings.get_word_vector(word) for word in sentence]
                self.data.append((sentence, item['label']))

    def __len__(self) -> int:
        """Return the number of samples currently held."""
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[list, int]:
        """Return the ``(sentence, label)`` sample at position ``idx``."""
        return self.data[idx]

    def split(self, prc: float) -> torch.utils.data.Subset:
        """Randomly carve a validation subset out of this dataset.

        ``int(prc * len(self))`` samples go to the returned subset; the
        remaining ones stay in ``self.data``, which is replaced in place.
        Both parts are ``torch.utils.data.Subset`` views produced by
        ``random_split`` (they support ``len()`` and indexing).

        Args:
            prc: Fraction of samples to move to the validation subset, in ``[0, 1]``.

        Returns:
            The validation subset.

        Raises:
            ValueError: If ``prc`` is outside ``[0, 1]``.
        """
        if not 0 <= prc <= 1:
            # A fraction above 1 would yield a negative training size.
            raise ValueError(f"prc must be between 0 and 1, got {prc}")
        validation_size = int(prc * len(self.data))
        train_size = len(self.data) - validation_size
        validation_data, self.data = torch.utils.data.random_split(self.data, [validation_size, train_size])
        return validation_data

    @staticmethod
    def _to_tensor(text: list) -> torch.Tensor:
        """Turn one sentence into a tensor whose first dimension is the sentence length."""
        if len(text) > 0 and isinstance(text[0], np.ndarray):
            # Stack first: torch.tensor() on a list of arrays copies element by element.
            return torch.from_numpy(np.stack(text))
        return torch.tensor(text)

    def collate(self, batch: list[tuple[list, int]]) -> tuple[torch.Tensor, torch.Tensor]:
        """Pad the sentences of a batch to a common length and stack the labels.

        Sentences must be numeric (word vectors or numbers); raw string tokens
        cannot be converted to tensors.

        Args:
            batch: Samples as returned by ``__getitem__``.

        Returns:
            ``(texts, labels)`` where ``texts`` is zero-padded with the batch
            as first dimension and ``labels`` is a 1-D tensor.
        """
        texts, labels = zip(*batch)
        tensors = [self._to_tensor(text) for text in texts]
        # An empty sentence becomes a shape-(0,) tensor that does not match the
        # trailing dimensions of the others; rebuild it from a non-empty one.
        reference = next((t for t in tensors if t.numel() > 0), None)
        if reference is not None:
            empty = reference.new_zeros((0, *reference.shape[1:]))
            tensors = [t if t.numel() > 0 else empty for t in tensors]
        return pad_sequence(tensors, batch_first=True), torch.tensor(labels)

    def get_dataloader(self, batch_size: int, shuffle: bool) -> DataLoader:
        """Return a ``DataLoader`` over this dataset that batches with ``collate``.

        The default collate function cannot batch sentences of different
        lengths, so the padding ``collate`` method is passed explicitly.
        """
        return DataLoader(self, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate)







