In [1]:
import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
import pandas as pd
import time
from sklearn.metrics import f1_score


In [2]:
if torch.cuda.is_available():
	print(f'Using GPU: {torch.cuda.get_device_name(0)}')
	device = torch.device("cuda")
elif torch.backends.mps.is_available():
	print('Using MPS')
	device = torch.device("mps")
else:
	print('Using CPU')
	device = torch.device("cpu")
	

Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU


# Load Dataset

In [17]:
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import unicodedata
import re
import spacy
from tqdm import tqdm

SOS_token = 1
EOS_token = 2
CLS_token = 3
lemmatizer = spacy.load("en_core_web_sm")

class TranslationDataset(Dataset):
	def __init__(self, csv_path, dataset_type='train', vocab=None):
		df = pd.read_csv(csv_path, quoting=3)
		print(f'len df: {len(df)}')
		if dataset_type in ['train', 'val']:
			self.text, self.labels = zip(*[(text, label) for text, label in zip(df['text'], df['label'])])
		else:
			self.text = df['text'].tolist()
			self.labels = [0 for _ in range(len(self.text))]
		self.ids = df['id'].tolist()
		self.dataset_type = dataset_type
		self.tokenizer = get_tokenizer('basic_english')
		self._preprocess(vocab)

	def _preprocess(self, vocab):
		# preprocess text
		self.text = [self._preprocess_sentence(text) for text in tqdm(self.text)]

		if vocab is None:
			self.vocab = build_vocab_from_iterator(self._yield_tokens(), specials=["<unk>"])
			self.vocab.set_default_index(self.vocab['<UNK>']) # Set the default index to <UNK> token, which mean 
			self.vocab.insert_token('<SOS>', SOS_token)  # Insert <EOS> token with index 1
			self.vocab.insert_token('<EOS>', EOS_token)  # Insert <EOS> token with index 1
			self.vocab.insert_token('<CLS>', CLS_token)  # Insert <CLS> token with index 2
		else:
			self.vocab = vocab
			
		self.vocab_size = len(self.vocab)
	
	def _preprocess_sentence(self, sentence):
		sentence = normalizeString(sentence)
		sentence = self.tokenizer(sentence)
		sentence = lemmaString(sentence)
		return sentence

	def _yield_tokens(self):
		for text_sample in self.text:
			yield text_sample

	def __len__(self):
		return len(self.text)

	def __getitem__(self, idx):
		input_seq = text_to_indices(self.vocab, self.text[idx])
		label = self.labels[idx]
		return input_seq, label, self.ids[idx]

def unicodeToAscii(s):
	return ''.join(
		c for c in unicodedata.normalize('NFD', s)
		if unicodedata.category(c) != 'Mn'
	)

def normalizeString(s):
	s = unicodeToAscii(s.lower().strip())
	s = re.sub(r"([.!?])", r" \1", s)
	s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
	return s

def lemmaString(tokens):
	return [token.lemma_ for token in lemmatizer(' '.join(tokens))]

def text_to_indices(vocab, tokens):
	indices = [vocab[token] for token in tokens]
	# add EOS token at the end
	indices.append(EOS_token)
	return torch.tensor(indices, dtype=torch.long).view(-1)

def seq_to_tokens(seq, vocab):
    itos = vocab.get_itos()
    return [itos[idx] for idx in seq]

In [18]:
trainset = TranslationDataset('train_2024.csv', dataset_type='train', vocab=None)

len df: 99000


 13%|█▎        | 12569/99000 [01:43<13:47, 104.47it/s]

In [None]:
print(trainset.vocab['<UNK>'])
print(trainset.vocab['<SOS>'])
print(trainset.vocab['<EOS>'])
print(trainset.vocab['<CLS>'])

In [6]:
trainset = torch.load('datasets/trainset.pth')
valset = torch.load('datasets/valset.pth')
testset = torch.load('datasets/testset.pth')

# Custom DataLoader

In [18]:
PADDING_VALUE = 0

In [23]:
from torch.nn.utils.rnn import pad_sequence

def collate(list_of_samples):
	"""Merges a list of samples to form a mini-batch.

	Args:
	list_of_samples is a list of tuples (src_seq, tgt_label, id):
		src_seq is of shape (src_seq_length,)
		tgt_label is of shape (1,)
		id is an int

	Returns:
	src_seqs of shape (max_src_seq_length, batch_size): Tensor of padded source sequences.
	src_mask of shape (max_src_seq_length, batch_size): Boolean tensor showing which elements of the src_seqs tensor should be ignored in computations.
	tgt_labels of shape (batch_size, 1): Tensor of labels for each sequence.
	"""
	# YOUR CODE HERE
	src_seqs = [s[0] for s in list_of_samples]
	src_seqs = pad_sequence(src_seqs, batch_first=False, padding_value=PADDING_VALUE, )
	src_masks = (src_seqs == PADDING_VALUE)

	tgt_labels = torch.LongTensor([s[1] for s in list_of_samples])
	
	# Add CLS token at the beginning of each src sequence
	cls_tensor = torch.full((1, src_seqs.shape[1]), CLS_token, dtype=torch.long)
	src_seqs = torch.cat((cls_tensor, src_seqs), dim=0)
	
	ids = [s[2] for s in list_of_samples]

	return src_seqs, src_masks, tgt_labels, ids


In [21]:
trainloader = DataLoader(dataset=trainset, batch_size=640, shuffle=False, collate_fn=collate, pin_memory=True)
valloader = DataLoader(dataset=valset, batch_size=250, shuffle=False, collate_fn=collate, pin_memory=True)
testloader = DataLoader(dataset=testset, batch_size=1, shuffle=False, collate_fn=collate, pin_memory=True)

In [22]:
# This cell tests collate()
def test_collate():
    pairs = [
        (torch.LongTensor([2, EOS_token]), 0, 0),
        (torch.LongTensor([6, 7, EOS_token]), 1, 1),
    ]
    src_seqs, src_mask, tgt_seqs, ids = collate(pairs)
    #src_seqs, src_mask, tgt_seqs = src_seqs[:, [1, 0]], src_mask[:, [1, 0]], tgt_seqs[:, [1, 0]]
    print('src_seqs:\n', src_seqs)
    print('src_mask:\n', src_mask)
    print('tgt_seqs:\n', tgt_seqs)
    expected_src_seqs = torch.tensor([
        [CLS_token, CLS_token],
        [2,         6],
        [EOS_token, 7],
        [0,         EOS_token]
    ])
    expected_src_mask = torch.tensor([
        [False, False],
        [False, False],
        [ True, False]
    ])
    expected_tgt_seqs = torch.tensor([0, 1])
    
    assert ((
        (src_seqs == expected_src_seqs).all()
         and (src_mask == expected_src_mask).all()
         and (tgt_seqs == expected_tgt_seqs).all()
        ) or (
        (src_seqs == expected_src_seqs[:, [1, 0]]).all()
         and (src_mask == expected_src_mask[:, [1, 0]]).all()
         and (tgt_seqs == expected_tgt_seqs[:, [1, 0]]).all()
        )
    ), "Wrong outputs of collate."
    print('Success')

test_collate()

src_seqs:
 tensor([[2, 2],
        [2, 6],
        [1, 7],
        [0, 1]])
src_mask:
 tensor([[False, False],
        [False, False],
        [ True, False]])
tgt_seqs:
 tensor([0, 1])
Success


# Encoder-only Model

In [24]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class EncoderOnly(nn.Module):
	def __init__(self, vocab_size, d_model, max_seq_length, nhead, num_layers, dim_feedforward, dropout):
		super(EncoderOnly, self).__init__()
		
		self.embedding = nn.Embedding(vocab_size, d_model)
		self.pos_embedding = nn.Embedding(max_seq_length, d_model)
		
		encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
		self.encoder = TransformerEncoder(encoder_layer, num_layers)
		
		fc_hidden = 128
		self.fc1 = nn.Linear(d_model, fc_hidden)
		self.dropout = nn.Dropout(dropout)
		self.fc2 = nn.Linear(fc_hidden, 1)
		
	def forward(self, src, src_mask):
		"""
		Args:
		src of shape (max_seq_length, batch_size): Input sequence.
		src_mask of shape (max_seq_length, batch_size): Boolean tensor indicating which elements should be ignored.
		
		Returns:
		probs of shape (batch_size,): Probabilities of the sequences being toxic.
		"""
		# src: (max_seq_length, batch_size)
		# permute to (batch_size, max_seq_length)
		src = src.permute(1, 0)
		
		seq_length = src.shape[1]	# src: (batch_size, max_seq_length)
		
		# Create positional indices
		positions = torch.arange(seq_length, dtype=torch.long, device=src.device).unsqueeze(0) # shape (1, max_seq_length)
		
		# Embed the input sequences and add positional embeddings
		embeddings = self.embedding(src) + self.pos_embedding(positions) # (batch_size, max_seq_length, d_model)
		
		# Transpose the embeddings for input to the encoder
		embeddings = embeddings.transpose(0, 1)  # (max_seq_length, batch_size, d_model)
		
		# Pass the embeddings through the encoder
		encoder_output = self.encoder(embeddings, src_key_padding_mask=src_mask) # (max_seq_length, batch_size, d_model)
		
		# Take the first output of the encoder (corresponding to the CLS token)
		cls_output = encoder_output[0] # (batch_size, d_model)
		
		# Pass the averaged output through the fully connected layer for classification
		out = self.fc1(cls_output)
		out = F.relu(out)
		out = self.dropout(out)
		out = self.fc2(out)
		probs = torch.sigmoid(out) # (batch_size, )
		
		return probs

# Training

In [29]:
def val_loss(model, val_loader):
	model.eval()
	total_loss = 0
	criterion = nn.BCELoss()
	with torch.no_grad():
		for i, (src_seqs, src_mask, tgt_labels, ids) in enumerate(val_loader):
			src_seqs, tgt_labels, src_mask = src_seqs.to(device), tgt_labels.to(device), src_mask.to(device)
			outputs = model(src_seqs, src_mask)
			loss = criterion(outputs.squeeze(), tgt_labels.float())
			total_loss += loss.item()
	return total_loss / len(val_loader)

## Encoder Only model

In [None]:
# Encoder config
encoder = EncoderOnly(vocab_size=trainset.vocab_size, d_model=512, max_seq_length=1000, nhead=8, num_layers=2, dim_feedforward=2048, dropout=0.1)

In [None]:
# training
criterion = nn.BCELoss()
optimizer = optim.Adam(encoder.parameters(), lr=0.001, weight_decay=1e-5)

n_epochs = 10
# prompt ask for continue training or not
cont = input('Continue training? yes or no')
if cont == 'no':
	print('Fresh training')
	train_losses = []
	val_losses = []
	best_model = None
	best_val_loss = float('inf')
else:
	print(f'Continue training')

for epoch in range(n_epochs):
	encoder.train()
	running_loss = 0.0
	epoch_start_time = time.time()
	print(f'Number of batches: {len(trainloader)}')
	print(f'batch_size: {trainloader.batch_size}')
	for i, (src_seqs, src_mask, tgt_labels, ids) in enumerate(trainloader):
		start_time = time.time()
		src_seqs, src_mask, tgt_labels = src_seqs.to(device), src_mask.to(device), tgt_labels.to(device)
		optimizer.zero_grad()
		outputs = encoder(src_seqs, src_mask)
		loss = criterion(outputs.squeeze(), tgt_labels.float())
		loss.backward()
		optimizer.step()
		running_loss += loss.item()
		if i % 10 == 9:
			print(f'Epoch {epoch + 1}, iter {i + 1}: avg. loss = {running_loss/(i + 1):.4f}, Time spent: {time.time()-start_time:.2f}s')
	train_losses.append(running_loss / len(trainloader))
	eval_loss = val_loss(encoder, valloader)

	if eval_loss < best_val_loss:
		best_val_loss = eval_loss
		best_model = encoder.state_dict()
		if best_model is not None:
			print(f'find new best model, save to models/lstm_glove.pth, eval_loss: {eval_loss:.4f}')
			torch.save(best_model, os.path.join('models', 'lstm_glove.pth'))

	# if early_stop.stop_criterion(val_losses):
	# 	print(f'Early stopping at epoch {epoch + 1}')
	# 	break
	
	val_losses.append(eval_loss)
	print(f'Epoch {epoch + 1}, val loss = {eval_loss:.4f}, train loss = {train_losses[-1]:.4f}; Time spent: {time.time()-epoch_start_time:.2f}s')
	running_loss = 0.0

In [27]:
# loop through the trainset to get the sequence length
pad_src_seqs, src_seq_lengths, pad_tgt_seqs, ids = next(iter(trainloader))
print(pad_src_seqs.shape)

torch.Size([206, 640])
