In [1]:
import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
import pandas as pd
import time


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# device = torch.device("cpu")

# Load data

In [3]:
df = pd.read_csv('train_2024.csv')
df.head()

Unnamed: 0,id,text,label
0,0,Except that Desmond played first base last nig...,0
1,1,What i find funny is the loyalty and blindness...,0
2,2,Read the article not just the headline & you ...,0
3,3,Speaking of a horses backside is that where y...,1
4,4,Michael Barone- gee are you dumb. No other wo...,1


In [4]:
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import unicodedata
import re

EOS_token = 1
class TranslationDataset(Dataset):
	def __init__(self, csv_path, dataset_type='train'):
		df = pd.read_csv(csv_path)
		self.text, self.labels = zip(*[(text, label) for text, label in zip(df['text'], df['label'])])
		self.dataset_type = dataset_type
		self._preprocess()

	def _preprocess(self):
		self.tokenizer = get_tokenizer('basic_english')
		self.vocab = build_vocab_from_iterator(self._yield_tokens(), specials=["<unk>"])
		self.vocab.set_default_index(self.vocab['<unk>'])
		self.vocab.insert_token('<eos>', EOS_token)  # Insert <eos> token with index 1
		self.vocab_size = len(self.vocab)
		
	def _yield_tokens(self):
		for text_sample in self.text:
			# preprocess text
			text_sample = normalizeString(text_sample)
			yield self.tokenizer(text_sample)

	def __len__(self):
		return len(self.text)

	def __getitem__(self, idx):
		input_seq = text_to_indices(self.tokenizer, self.vocab, self.text[idx])
		label = self.labels[idx]
		return input_seq, label

def unicodeToAscii(s):
	return ''.join(
		c for c in unicodedata.normalize('NFD', s)
		if unicodedata.category(c) != 'Mn'
	)

def normalizeString(s):
	s = unicodeToAscii(s.lower().strip())
	s = re.sub(r"([.!?])", r" \1", s)
	s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
	return s

def text_to_indices(tokenizer, vocab, text_sample):
	tokens = tokenizer(text_sample)
	indices = [vocab[token] for token in tokens]
	indices.append(EOS_token)
	return torch.tensor(indices, dtype=torch.long).view(-1)

def seq_to_tokens(seq, vocab):
    itos = vocab.get_itos()
    return [itos[idx] for idx in seq]

In [5]:
trainset = TranslationDataset('tmp.csv')

In [6]:
src_sentence, label = trainset[100]
print(src_sentence, label)
print(type(src_sentence), type(label))

tensor([ 65,   8,  29,  65,  80,   2, 105,  50,  52,  28,   6, 749, 457,  79,
         25, 536,  47,  23,  26,  16,   1]) 1
<class 'torch.Tensor'> <class 'int'>


# DataLoader

In [7]:
PADDING_VALUE = 0

In [8]:
from torch.nn.utils.rnn import pad_sequence

def collate(list_of_samples):
	"""Merges a list of samples to form a mini-batch.

	Args:
	list_of_samples is a list of tuples (src_seq, tgt_label):
		src_seq is of shape (src_seq_length,)
		tgt_label is of shape (1,)

	Returns:
	src_seqs of shape (max_src_seq_length, batch_size): Tensor of padded source sequences.
		The sequences should be sorted by length in a decreasing order, that is src_seqs[:,0] should be
		the longest sequence, and src_seqs[:,-1] should be the shortest.
	src_seq_lengths: List of lengths of source sequences.
	tgt_labels of shape (batch_size, 1): Tensor of labels for each sequence.
	"""
	# YOUR CODE HERE
	src_seqs = [s[0] for s in list_of_samples]
	tgt_labels = torch.LongTensor([s[1] for s in list_of_samples])
	src_seq_lengths = [len(s) for s in src_seqs]
	src_seqs = pad_sequence(src_seqs, padding_value=PADDING_VALUE)

	src_seq_lengths, indices = torch.sort(torch.tensor(src_seq_lengths), descending=True)
	src_seqs = src_seqs[:, indices]
	tgt_labels = tgt_labels[indices]

	return src_seqs, src_seq_lengths.tolist(), tgt_labels


In [9]:
def test_collate_shapes():
    pairs = [
        (torch.LongTensor([1, 2]), 1),
        (torch.LongTensor([6, 7, 8]), 0),
    ]
    pad_src_seqs, src_seq_lengths, pad_tgt_seqs = collate(pairs)
    assert type(src_seq_lengths) == list, "src_seq_lengths should be a list."
    assert pad_src_seqs.shape == torch.Size([3, 2]), f"Bad pad_src_seqs.shape: {pad_src_seqs.shape}"
    assert pad_src_seqs.dtype == torch.long
    assert pad_tgt_seqs.shape == torch.Size([2]), f"Bad pad_tgt_seqs.shape: {pad_tgt_seqs.shape}"
    assert pad_tgt_seqs.dtype == torch.long
    print('Success')

test_collate_shapes()

Success


In [10]:
# This cell tests collate() function

def test_collate_fn():
    pairs = [
        (torch.tensor([1, 2]), 0),
        (torch.tensor([6, 7, 8]), 1),
        (torch.tensor([11, 12, 13, 14]), 0),
    ]
    pad_src_seqs, src_seq_lengths, pad_tgt_seqs = collate(pairs)
    assert pad_src_seqs.shape == torch.Size([4, 3]), f"Bad pad_src_seqs.shape: {pad_src_seqs.shape}"
    assert pad_tgt_seqs.shape == torch.Size([3]), f"Bad pad_tgt_seqs.shape: {pad_tgt_seqs.shape}"
    print('Source sequences combined:')
    print(pad_src_seqs)
    expected = torch.tensor([
      [11, 6, 1],
      [12, 7, 2],
      [13, 8, 0],
      [14, 0, 0],
    ])
    assert (pad_src_seqs == expected).all(), "pad_src_seqs does not match expected values"

    print(src_seq_lengths)
    if isinstance(src_seq_lengths[0], torch.Size):
        src_seq_lengths = sum((list(l) for l in src_seq_lengths), [])
    else:
        src_seq_lengths = [int(l) for l in src_seq_lengths]
    assert src_seq_lengths == [4, 3, 2], f"Bad src_seq_lengths: {src_seq_lengths}"

    print('Target sequences combined:')
    print(pad_tgt_seqs)
    expected = torch.tensor([
      0, 1, 0
    ])
    assert (pad_tgt_seqs == expected).all(), "pad_tgt_seqs0 does not match expected values"
    print('Success')

test_collate_fn()


Source sequences combined:
tensor([[11,  6,  1],
        [12,  7,  2],
        [13,  8,  0],
        [14,  0,  0]])
[4, 3, 2]
Target sequences combined:
tensor([0, 1, 0])
Success


In [11]:
# We create custom DataLoader using the implemented collate function
# We are going to process 64 sequences at the same time (batch_size=64)
trainset = TranslationDataset('train_2024.csv')
trainloader = DataLoader(dataset=trainset, batch_size=640, shuffle=False, collate_fn=collate, pin_memory=True)

In [12]:
# Test data loader
for i, (src_seqs, src_seq_lengths, tgt_seqs) in enumerate(trainloader):
    print(f"Batch {i} src_seqs:")
    print(src_seqs)
    print(f'src_seqs.shape: {src_seqs.shape}')
    print(f"Batch {i} src_seq_lengths:")
    print(src_seq_lengths)
    print(f"Batch {i} tgt_seqs:")
    print(tgt_seqs)
    print(f'tgt_seqs.shape: {tgt_seqs.shape}')
    break

Batch 0 src_seqs:
tensor([[  238,    89,   168,  ..., 63951,   104,    89],
        [   38,   204,   161,  ...,     5,   168,  2852],
        [   10,    32,   839,  ..., 59655,     2,     2],
        ...,
        [  705,     0,     0,  ...,     0,     0,     0],
        [    2,     0,     0,  ...,     0,     0,     0],
        [    1,     0,     0,  ...,     0,     0,     0]])
src_seqs.shape: torch.Size([216, 640])
Batch 0 src_seq_lengths:
[216, 207, 203, 200, 196, 193, 193, 192, 190, 186, 183, 182, 182, 181, 180, 177, 175, 172, 172, 171, 170, 167, 166, 164, 156, 156, 154, 151, 144, 143, 139, 138, 138, 130, 129, 128, 128, 127, 126, 125, 125, 125, 121, 120, 117, 117, 117, 116, 115, 115, 114, 114, 111, 109, 109, 108, 108, 108, 107, 107, 107, 106, 106, 106, 106, 105, 105, 105, 104, 103, 102, 97, 97, 97, 96, 96, 95, 94, 94, 93, 93, 93, 91, 89, 89, 89, 89, 88, 88, 87, 87, 85, 84, 84, 83, 83, 83, 83, 82, 82, 81, 80, 80, 80, 80, 80, 79, 79, 79, 78, 77, 77, 76, 75, 74, 73, 72, 72, 72, 72, 72, 

# LSTM

In [13]:
class LSTM(nn.Module):
	def __init__(self, src_dictionary_size, embed_size, hidden_size, dropout=0.2):
		"""
		Args:
		src_dictionary_size: The number of words in the source dictionary.
		embed_size: The number of dimensions in the word embeddings.
		hidden_size: The number of features in the hidden state of GRU.
		"""
		super(LSTM, self).__init__()
		self.hidden_size = hidden_size
		self.embedding = nn.Embedding(src_dictionary_size, embed_size)
		self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=2, batch_first=False, dropout=dropout, bidirectional=False)
		self.fc1 = nn.Linear(hidden_size, hidden_size)
		self.fc2 = nn.Linear(hidden_size, 1)
		self.relu = nn.ReLU()
		self.sigmoid = nn.Sigmoid()
		
	def forward(self, pad_seqs, seq_lengths, hidden):
		"""
		Args:
		pad_seqs of shape (max_seq_length, batch_size): Padded source sequences.
		seq_lengths: List of sequence lengths.
		hidden of shape (1, batch_size, hidden_size): Initial states of the GRU.

		Returns:
		outputs of shape (max_seq_length, batch_size, hidden_size): Padded outputs of GRU at every step.
		hidden of shape (1, batch_size, hidden_size): Updated states of the GRU.
		"""
		# YOUR CODE HERE
		embedded = self.embedding(pad_seqs)
		packed = pack_padded_sequence(embedded, seq_lengths)
		outputs, hidden = self.lstm(packed, hidden)
		outputs, output_lengths = pad_packed_sequence(outputs, batch_first=False)
		last_timesteps = torch.stack([outputs[length-1, i] for i, length in enumerate(output_lengths)]) # shape: (batch_size, hidden_size)
		# feed through the fully connected layer
		outputs = self.fc1(last_timesteps)
		outputs = self.relu(outputs)
		outputs = self.fc2(outputs)
		outputs = self.sigmoid(outputs)
		return outputs

	def init_hidden(self, batch_size=1, device='cpu'):
		num_directions = 1
		return (
            torch.zeros(self.lstm.num_layers * num_directions, batch_size, self.hidden_size).to(device),
            torch.zeros(self.lstm.num_layers * num_directions, batch_size, self.hidden_size).to(device),
        )

In [14]:
def test_LSTM_shapes():
    hidden_size = 3
    lstm = LSTM(src_dictionary_size=5, embed_size=10, hidden_size=hidden_size)

    max_seq_length = 4
    batch_size = 2
    hidden = lstm.init_hidden(batch_size=batch_size)
    pad_seqs = torch.tensor([
        [        1,             2],
        [        2,     EOS_token],
        [        3, PADDING_VALUE],
        [EOS_token, PADDING_VALUE]
    ])

    outputs = lstm.forward(pad_seqs=pad_seqs, seq_lengths=[4, 2], hidden=hidden)
    assert outputs.shape == torch.Size([batch_size, 1]), f"Bad outputs.shape: {outputs.shape}"
    print('Success')

test_LSTM_shapes()

Success


# Training

In [15]:
# Create the LSTM model
hidden_size = embed_size = 256
lstm = LSTM(trainset.vocab_size, embed_size, hidden_size).to(device)

In [16]:
def val_loss(model, val_loader):
	model.eval()
	total_loss = 0
	criterion = nn.BCELoss()
	with torch.no_grad():
		for i, (src_seqs, src_seq_lengths, tgt_labels) in enumerate(val_loader):
			src_seqs, tgt_labels = src_seqs.to(device), tgt_labels.to(device)
			hidden = model.init_hidden(src_seqs.shape[1], device=device)
			outputs = model(src_seqs, src_seq_lengths, hidden)
			loss = criterion(outputs.squeeze(), tgt_labels.float())
			total_loss += loss.item()
	return total_loss / len(val_loader)

In [17]:
# We create custom DataLoader using the implemented collate function
# We are going to process 64 sequences at the same time (batch_size=64)
valset = TranslationDataset('dev_2024.csv')
valloader = DataLoader(dataset=trainset, batch_size=256, shuffle=False, collate_fn=collate, pin_memory=True)

In [18]:
# training
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm.parameters(), lr=0.001)

n_epochs = 30
train_losses = []
val_losses = []
for epoch in range(n_epochs):
	lstm.train()
	running_loss = 0.0
	epoch_start_time = time.time()
	print(f'Number of batches: {len(trainloader)}')
	print(f'batch_size: {trainloader.batch_size}')
	for i, (src_seqs, src_seq_lengths, tgt_labels) in enumerate(trainloader):
		start_time = time.time()
		src_seqs, tgt_labels = src_seqs.to(device), tgt_labels.to(device)
		hidden = lstm.init_hidden(src_seqs.shape[1], device=device)
		optimizer.zero_grad()
		outputs = lstm(src_seqs, src_seq_lengths, hidden)
		loss = criterion(outputs.squeeze(), tgt_labels.float())
		loss.backward()
		optimizer.step()
		running_loss += loss.item()
		if i % 1 == 0:
			print(f'Epoch {epoch + 1}, iter {i + 1}: avg. loss = {running_loss/(i + 1):.4f}, Time spent: {time.time()-start_time:.2f}s')
	train_losses.append(running_loss / len(trainloader))
	eval_loss = val_loss(lstm, valloader)
	val_losses.append(eval_loss)
	print(f'Epoch {epoch + 1}, val loss = {eval_loss:.4f}, train loss = {train_losses[-1]:.4f}; Time spent: {time.time()-epoch_start_time:.2f}s')
	running_loss = 0.0

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


Number of batches: 155
batch_size: 640
Epoch 1, iter 1: avg. loss = 0.6904, Time spent: 15.41s


# Inference

In [233]:
def classify(lstm, pad_src_seqs, src_seq_lengths):
    """Translate sequences from the source language to the target language using the trained model.

    Args:
    lstm (LSTM): Trained lstm.
    pad_src_seqs of shape (max_src_seq_length, batch_size): Padded source sequences.
    src_seq_lengths: List of source sequence lengths.

    Returns:
    out_seqs of shape (batch_size, 1): LongTensor of word indices of the output sequences.
    """
    # YOUR CODE HERE
    with torch.no_grad():
        pad_src_seqs = pad_src_seqs.to(device)
        lstm_hidden = lstm.init_hidden(pad_src_seqs.shape[1], device)
        outputs = lstm(pad_src_seqs, src_seq_lengths, lstm_hidden)
        out_seqs = outputs > 0.5
        return out_seqs

In [234]:
def test_translate_shapes():
    pad_src_seqs = torch.tensor([
        [1, 2],
        [2, 3],
        [3, 0],
        [4, 0]
    ])

    out_seqs = classify(lstm, pad_src_seqs, src_seq_lengths=[4, 2])
    assert out_seqs.shape == torch.Size([2, 1]), f"Wrong out_seqs.shape: {out_seqs.shape}"
    print('Success')

test_translate_shapes()

Success


In [239]:
# Translate a few sentences from the training set
print('Classify training data:')
print('-----------------------------')
pad_src_seqs, src_seq_lengths, pad_tgt_seqs = next(iter(trainloader))
out_seqs = classify(lstm, pad_src_seqs, src_seq_lengths)

for i in range(5):
    print('SRC:', seq_to_tokens(pad_src_seqs[:,i], trainset.vocab))
    print('TGT:', pad_tgt_seqs[i].item())
    print('OUT:', out_seqs[i].item())
    print('')

Classify training data:
-----------------------------
SRC: ['when', 'you', 'come', 'from', 'a', 'totalitarian', 'society', 'that', 'is', 'very', 'tribal', 'and', 'that', 'tribalism', 'is', 'expressed', 'through', 'the', 'glue', 'of', 'religion', 'a', 'free', 'secular', 'society', 'like', 'that', 'in', 'the', 'west', 'is', 'viewed', 'quite', 'differently', '.', 'that', 'is', 'especially', 'so', 'when', 'the', 'prescripts', 'of', 'that', 'religion', 'deny', 'many', 'of', 'the', 'basic', 'human', 'rights', 'that', 'we', 'in', 'the', 'west', 'take', 'for', 'granted', '.', 'hence', 'we', 'are', 'subject', 'to', 'these', 'incomprehensible', 'terrorist', 'attacks', 'both', 'thru', 'new', 'immigrants', 'from', 'those', 'cultures', 'and', 'from', 'homegrown', 'terrorists', 'who', 'romantizes', 'the', 'whiff', 'of', '<unk>', 'century', 'cultures', '.', 'yet', 'here', 'in', 'the', 'west', 'we', 'are', 'being', 'brainwashed', 'by', 'those', 'who', 'profess', 'the', 'benefits', 'of', 'open', 'borde