In [1]:
import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
import pandas as pd
import time
from sklearn.metrics import f1_score


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Choose the compute device in order of preference: CUDA GPU, then Apple MPS,
# then plain CPU. `device` is read by later cells.
if torch.cuda.is_available():
	device = torch.device("cuda")
	print(f'Using GPU: {torch.cuda.get_device_name(0)}')
elif torch.backends.mps.is_available():
	device = torch.device("mps")
	print('Using MPS')
else:
	device = torch.device("cpu")
	print('Using CPU')
	

Using MPS


# Load Dataset

In [3]:
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import unicodedata
import re
import spacy

EOS_token = 1
CLS_token = 2
lemmatizer = spacy.load("en_core_web_sm")

class TranslationDataset(Dataset):
	"""Labelled-text dataset backed by a CSV file with `id`, `text` and `label` columns.

	Each sentence is normalised, tokenised and lemmatised once at construction
	time. Indexing returns `(index_tensor, label, id)` where `index_tensor` is
	the sentence mapped through the vocabulary and wrapped in <CLS> ... <EOS>.

	Args:
		csv_path: path of the CSV file to read.
		dataset_type: 'train' or 'val' read the `label` column; any other value
			(e.g. 'test') fills the labels with zeros as placeholders.
		vocab: an already-built vocabulary to reuse (pass the training
			vocabulary for val/test so indices agree). When None, a new
			vocabulary is built from this file's tokens.
	"""

	def __init__(self, csv_path, dataset_type='train', vocab=None):
		# quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
		df = pd.read_csv(csv_path, quoting=3)
		print(f'len df: {len(df)}')
		# Reading the columns directly also works for an empty file, where
		# unpacking `zip(*rows)` would raise ValueError.
		self.text = df['text'].tolist()
		if dataset_type in ['train', 'val']:
			self.labels = df['label'].tolist()
		else:
			# No ground truth available: use 0 as a dummy label.
			self.labels = [0] * len(self.text)
		self.ids = df['id'].tolist()
		self.dataset_type = dataset_type
		self.tokenizer = get_tokenizer('basic_english')
		self._preprocess(vocab)

	def _preprocess(self, vocab):
		"""Convert every raw sentence to a token list and set up the vocabulary."""
		self.text = [self._preprocess_sentence(text) for text in self.text]

		if vocab is None:
			self.vocab = build_vocab_from_iterator(self._yield_tokens(), specials=["<unk>"])
			# The special token is registered in lower case, so it must be
			# looked up in lower case too: '<UNK>' is not in the vocabulary
			# and would raise because no default index exists yet.
			self.vocab.set_default_index(self.vocab['<unk>'])
			self.vocab.insert_token('<EOS>', EOS_token)  # Insert <EOS> token with index 1
			self.vocab.insert_token('<CLS>', CLS_token)  # Insert <CLS> token with index 2
			# NOTE(review): '<unk>' is the first special and so presumably sits
			# at index 0, the same value as PADDING_VALUE - confirm this is intended.
		else:
			self.vocab = vocab

		self.vocab_size = len(self.vocab)

	def _preprocess_sentence(self, sentence):
		"""Normalise, tokenise and lemmatise one sentence; returns a list of tokens."""
		sentence = normalizeString(sentence)
		sentence = self.tokenizer(sentence)
		sentence = lemmaString(sentence)
		return sentence

	def _yield_tokens(self):
		"""Yield the token list of every sample (input for the vocabulary builder)."""
		for text_sample in self.text:
			yield text_sample

	def __len__(self):
		return len(self.text)

	def __getitem__(self, idx):
		"""Return `(input_seq, label, id)` for sample `idx`."""
		input_seq = text_to_indices(self.vocab, self.text[idx])
		label = self.labels[idx]
		return input_seq, label, self.ids[idx]

def unicodeToAscii(s):
	"""Return `s` with combining marks (accents) removed.

	The string is NFD-decomposed so each accented letter becomes a base
	character followed by marks of category 'Mn', which are then dropped.
	"""
	decomposed = unicodedata.normalize('NFD', s)
	kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
	return ''.join(kept)

def normalizeString(s):
	"""Lower-case, trim and de-accent `s`, keeping only letters and . ! ?

	A space is inserted before each '.', '!' or '?', and every run of other
	non-letter characters collapses into a single space.
	"""
	cleaned = unicodeToAscii(s.lower().strip())
	# Detach sentence punctuation from the preceding word.
	spaced = re.sub(r"([.!?])", r" \1", cleaned)
	# Everything that is not a letter or . ! ? becomes one space.
	return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

def lemmaString(tokens):
	"""Join `tokens` with spaces, run the module-level `lemmatizer` on the
	result and return the `lemma_` of every token it produces."""
	doc = lemmatizer(' '.join(tokens))
	lemmas = []
	for tok in doc:
		lemmas.append(tok.lemma_)
	return lemmas

def text_to_indices(vocab, tokens):
	"""Map `tokens` through `vocab` and frame them as CLS ... EOS.

	Returns a 1-D LongTensor of length len(tokens) + 2.
	"""
	body = [vocab[tok] for tok in tokens]
	# CLS marks the start of the sequence, EOS the end.
	framed = [CLS_token] + body + [EOS_token]
	return torch.tensor(framed, dtype=torch.long).view(-1)

def seq_to_tokens(seq, vocab):
    """Translate a sequence of vocabulary indices back into token strings."""
    index_to_token = vocab.get_itos()
    tokens = []
    for idx in seq:
        tokens.append(index_to_token[idx])
    return tokens

In [4]:
# Load the three dataset splits that were serialised earlier with torch.save.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load files
# you created yourself. Unpickling presumably also needs the TranslationDataset
# class to be defined first (it is, in the cell above) - confirm.
# NOTE(review): `trainset` is rebuilt from train_2024.csv in a later cell, which
# replaces the object loaded here.
trainset = torch.load('dataloaders/trainset.pth')
valset = torch.load('dataloaders/valset.pth')
testset = torch.load('dataloaders/testset.pth')

# Custom DataLoader

In [5]:
# Fill value that collate() passes to pad_sequence for positions past the end
# of a sequence.
# NOTE(review): index 0 presumably is also where the vocabulary stores '<unk>'
# (it is the first special token) - confirm the overlap is intended.
PADDING_VALUE = 0

In [6]:
from torch.nn.utils.rnn import pad_sequence

def collate(list_of_samples):
	"""Merges a list of samples to form a mini-batch.

	Args:
	list_of_samples is a list of tuples (src_seq, tgt_label, id):
		src_seq is a 1-D tensor of shape (src_seq_length,)
		tgt_label is an integer class label
		id is the identifier of the sample

	Returns:
	src_seqs of shape (max_src_seq_length, batch_size): Tensor of source sequences, padded with PADDING_VALUE.
	src_masks of shape (max_src_seq_length, batch_size): Boolean tensor that is True at padded positions, i.e. the elements of src_seqs that should be ignored in computations.
	tgt_labels of shape (batch_size,): LongTensor with the label of each sequence.
	ids: list with the id of each sequence, in batch order.
	"""
	seqs = [sample[0] for sample in list_of_samples]
	src_seqs = pad_sequence(seqs, batch_first=False, padding_value=PADDING_VALUE)

	# Build the mask from the true lengths rather than from `src_seqs ==
	# PADDING_VALUE`: a real token whose index happens to equal PADDING_VALUE
	# must not be treated as padding.
	lengths = torch.tensor([len(seq) for seq in seqs], dtype=torch.long, device=src_seqs.device)
	positions = torch.arange(src_seqs.size(0), device=src_seqs.device).unsqueeze(1)  # (max_len, 1)
	src_masks = positions >= lengths.unsqueeze(0)  # broadcasts to (max_len, batch_size)

	tgt_labels = torch.LongTensor([sample[1] for sample in list_of_samples])
	ids = [sample[2] for sample in list_of_samples]

	return src_seqs, src_masks, tgt_labels, ids


In [7]:
# One loader per split; every loader batches through collate().
# NOTE(review): shuffle=False on the training split serves the samples in the
# same order every epoch - confirm this is intended.
trainloader = DataLoader(
	dataset=trainset,
	batch_size=640,
	shuffle=False,
	collate_fn=collate,
	pin_memory=True,
)
valloader = DataLoader(
	dataset=valset,
	batch_size=250,
	shuffle=False,
	collate_fn=collate,
	pin_memory=True,
)
# Test samples are served one at a time.
testloader = DataLoader(
	dataset=testset,
	batch_size=1,
	shuffle=False,
	collate_fn=collate,
	pin_memory=True,
)

In [8]:
# This cell tests collate()
def test_collate():
    pairs = [
        (torch.LongTensor([2, EOS_token]), 0, 0),
        (torch.LongTensor([6, 7, EOS_token]), 1, 1),
    ]
    src_seqs, src_mask, tgt_seqs, ids = collate(pairs)
    #src_seqs, src_mask, tgt_seqs = src_seqs[:, [1, 0]], src_mask[:, [1, 0]], tgt_seqs[:, [1, 0]]
    print('src_seqs:\n', src_seqs)
    print('src_mask:\n', src_mask)
    print('tgt_seqs:\n', tgt_seqs)
    expected_src_seqs = torch.tensor([
        [2,         6],
        [EOS_token, 7],
        [0,         EOS_token]
    ])
    expected_src_mask = torch.tensor([
        [False, False],
        [False, False],
        [ True, False]
    ])
    expected_tgt_seqs = torch.tensor([0, 1])
    
    assert ((
        (src_seqs == expected_src_seqs).all()
         and (src_mask == expected_src_mask).all()
         and (tgt_seqs == expected_tgt_seqs).all()
        ) or (
        (src_seqs == expected_src_seqs[:, [1, 0]]).all()
         and (src_mask == expected_src_mask[:, [1, 0]]).all()
         and (tgt_seqs == expected_tgt_seqs[:, [1, 0]]).all()
        )
    ), "Wrong outputs of collate."
    print('Success')

test_collate()

src_seqs:
 tensor([[2, 6],
        [1, 7],
        [0, 1]])
src_mask:
 tensor([[False, False],
        [False, False],
        [ True, False]])
tgt_seqs:
 tensor([0, 1])
Success


In [9]:
# Rebuild the training split (and a fresh vocabulary) straight from the CSV.
# NOTE(review): this replaces the `trainset`/`trainloader` created in earlier
# cells; the previously loaded val/test sets keep whatever vocabulary they
# were saved with - confirm the indices still agree.
trainset = TranslationDataset('train_2024.csv')
trainloader = DataLoader(
	dataset=trainset,
	batch_size=640,
	shuffle=False,
	collate_fn=collate,
	pin_memory=True,
)

len df: 99000


# Encoder-only Model