In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# %cd "/content/drive/MyDrive/Colab Notebooks/SNLP/aalto-snlp-project-spring-2024"

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange, tqdm
import random
import os

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
if torch.cuda.is_available():
	print(f'Using GPU: {torch.cuda.get_device_name(0)}')
	device = torch.device("cuda")
elif torch.backends.mps.is_available():
	print('Using MPS')
	device = torch.device("mps")
else:
	print('Using CPU')
	device = torch.device("cpu")

Using MPS


# Data Preparation

In [5]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

In [6]:
df_train = pd.read_csv('train_2024.csv')
train_text = df_train['text'].values

def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(train_text)-1)
  table = np.array([tokenizer.tokenize(train_text[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ f        │        1042 │
├──────────┼─────────────┤
│ 8        │        1022 │
├──────────┼─────────────┤
│ 8        │        1022 │
├──────────┼─────────────┤
│ king     │        2332 │
├──────────┼─────────────┤
│ h        │        1044 │
├──────────┼─────────────┤
│ *        │        1008 │
├──────────┼─────────────┤
│ ll       │        2222 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ ain      │        7110 │
├──────────┼─────────────┤
│ '        │        1005 │
├──────────┼─────────────┤
│ t        │        1056 │
├──────────┼─────────────┤
│ this     │        2023 │
├──────────┼─────────────┤
│ grand    │        2882 │
├──────────┼─────────────┤
│ ?        │        1029 │
├──────────┼─────────────┤
│

In [7]:
class CustomDataset(TensorDataset):
	def __init__(self, tokenizer, csv_path, dataset_type, max_len=300):
		df = pd.read_csv(csv_path, quoting=3)
		self.text = df['text'].values
		self.ids = df['id'].tolist()
		if dataset_type == 'train' or dataset_type == 'val':
			labels = df['label'].values
			self.labels = torch.tensor(labels)
		else:
			self.labels = None
		self.dataset_type = dataset_type

		# preprocess the data
		self.token_id = []
		self.attention_masks = []

		for sample in tqdm(self.text):
			encoding_dict = preprocessing(sample, tokenizer)
			self.token_id.append(encoding_dict['input_ids'])
			self.attention_masks.append(encoding_dict['attention_mask'])

		self.token_id = torch.cat(self.token_id, dim = 0)
		self.attention_masks = torch.cat(self.attention_masks, dim = 0)

	def __getitem__(self, index):
		if self.dataset_type == 'train' or self.dataset_type == 'val':
			return self.token_id[index], self.attention_masks[index], self.labels[index], self.ids[index]
		else:
			return self.token_id[index], self.attention_masks[index], self.ids[index]

	def __len__(self):
		return len(self.text)

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 300,
						truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [8]:
# trainset = CustomDataset(tokenizer, 'train_2024.csv', dataset_type='train')
# valset = CustomDataset(tokenizer, 'dev_2024.csv', dataset_type='val')
# testset = CustomDataset(tokenizer, 'test_2024.csv', dataset_type='test')
# # Save the datasets
# torch.save(trainset, 'datasets/bert/trainset.pth')
# torch.save(valset, 'datasets/bert/valset.pth')
# torch.save(testset, 'datasets/bert/testset.pth')

In [9]:
# Load the datasets
trainset = torch.load('datasets/bert/trainset.pth')
valset = torch.load('datasets/bert/valset.pth')
testset = torch.load('datasets/bert/testset.pth')

In [10]:
# Prepare DataLoader
trainloader = DataLoader(
            trainset,
            sampler = RandomSampler(trainset),
            batch_size = 1
        )

valloader = DataLoader(
            valset,
            sampler = SequentialSampler(valset),
            batch_size = 1
        )

testloader = DataLoader(
			testset,
			sampler = SequentialSampler(testset),
			batch_size = 1
		)

# Training

In [11]:
# import precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Compute Precision, Recall, and F1 Score of the imported predicted csv and the validation df
def compute_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy, precision, recall, f1

## BERT

**RECOMMENDED config from the authors:**
```
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4
```

In [14]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Load finetuned model (after training)
print(f'Loading finetuned model...')
model.load_state_dict(torch.load('models/bert/bert_base_uncased_finetuned.pth', map_location=device))
print(f'Finetuned model loaded.')

# Run on GPU if available
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading finetuned model...
Finetuned model loaded.


In [15]:
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

total_steps = len(trainloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
										num_warmup_steps = 0, # Default value in run_glue.py
										num_training_steps = total_steps)

train_loss = []
val_loss = []
for i in trange(epochs, desc = 'Epoch'):
	print(f'\n-------Epoch {i}-------')
	# ========== Training ==========

	# Set model to training mode
	model.train()

	# Tracking variables
	tr_loss = 0
	nb_tr_examples, nb_tr_steps = 0, 0

	print('Training...')
	for step, batch in enumerate(trainloader):
		if step % 50 == 49:
			print(f'Batch: {step}/{len(trainloader)}')
		batch = tuple(t.to(device) for t in batch)
		b_input_ids, b_input_mask, b_labels, b_ids = batch
		optimizer.zero_grad()
		# Forward pass
		train_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								labels = b_labels)
		# Backward pass
		train_output.loss.backward()
		optimizer.step()

		# Update the learning rate.
		scheduler.step()

		# Update tracking variables
		tr_loss += train_output.loss.item()
		nb_tr_examples += b_input_ids.size(0)
		nb_tr_steps += 1

	train_loss.append(tr_loss / nb_tr_steps)

	# Save the model
	# Create output directory if it doesn't exist
	if not os.path.exists('models/bert'):
		os.makedirs('models/bert')

	print(f'Saving model')
	torch.save(model.state_dict(), 'models/bert/bert_base_uncased_finetuned.pth')
	print(f'Model saved in path models/bert/bert_base_uncased_finetuned.pth')

	# ========== Validation ==========

	# Set model to evaluation mode
	model.eval()

	# Tracking variables
	val_accuracy = []
	val_precision = []
	val_recall = []
	val_f1 = []
	v_loss = 0
	nb_v_steps = 0
	print(f'Evaluation...')
	for batch_id, batch in enumerate(valloader):
		if batch_id % 25 == 24:
			print(f'Batch: {batch_id}/{len(valloader)}')
		batch = tuple(t.to(device) for t in batch)
		b_input_ids, b_input_mask, b_labels, b_ids = batch
		with torch.no_grad():
			# Forward pass
			eval_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								labels=b_labels)
		v_loss += eval_output.loss.item()
		nb_v_steps += 1

		# Move logits and labels to CPU
		logits = eval_output.logits.detach().cpu().numpy()
		preds = np.argmax(logits, axis=1).flatten()

		label_ids = b_labels.to('cpu').numpy()
		# Calculate validation metrics
		b_accuracy, b_precision, b_recall, b_f1 = compute_metrics(label_ids, preds)
		val_accuracy.append(b_accuracy)
		val_precision.append(b_precision)
		val_recall.append(b_recall)
		val_f1.append(b_f1)

	val_loss.append(v_loss / nb_v_steps)
	print('\n\t - Train loss: {:.4f}'.format(train_loss[-1]))
	print('\t - Val loss: {:.4f}'.format(val_loss[-1]))
	print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
	print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) )
	print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]


-------Epoch 0-------
Training...
Batch: 49/99000
Batch: 99/99000
Batch: 149/99000
Batch: 199/99000
Batch: 249/99000
Batch: 299/99000
Batch: 349/99000
Batch: 399/99000
Batch: 449/99000
Batch: 499/99000
Batch: 549/99000
Batch: 599/99000
Batch: 649/99000
Batch: 699/99000
Batch: 749/99000
Batch: 799/99000
Batch: 849/99000
Batch: 899/99000
Batch: 949/99000
Batch: 999/99000
Batch: 1049/99000
Batch: 1099/99000
Batch: 1149/99000
Batch: 1199/99000
Batch: 1249/99000
Batch: 1299/99000
Batch: 1349/99000
Batch: 1399/99000
Batch: 1449/99000
Batch: 1499/99000
Batch: 1549/99000
Batch: 1599/99000
Batch: 1649/99000
Batch: 1699/99000
Batch: 1749/99000
Batch: 1799/99000
Batch: 1849/99000
Batch: 1899/99000
Batch: 1949/99000
Batch: 1999/99000
Batch: 2049/99000
Batch: 2099/99000
Batch: 2149/99000
Batch: 2199/99000
Batch: 2249/99000
Batch: 2299/99000
Batch: 2349/99000
Batch: 2399/99000
Batch: 2449/99000
Batch: 2499/99000
Batch: 2549/99000
Batch: 2599/99000
Batch: 2649/99000
Batch: 2699/99000
Batch: 2749/990

In [None]:
# plot learning curve
import matplotlib.pyplot as plt
plt.plot(train_loss, label='train loss')
plt.plot(val_loss, label='val loss')
plt.legend()
plt.show()

# Inference

## Batch Inference

In [None]:
def inference(model, output_filename, testloader):
	out = []
	indices = []
	for batch_id, batch in enumerate(testloader):
		if batch_id % 25 == 24:
			print(f'Batch: {batch_id}/{len(valloader)}')
		batch = tuple(t.to(device) for t in batch)
		b_input_ids, b_input_mask, b_ids = batch
		with torch.no_grad():
			# Forward pass
			eval_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								)
			preds = np.argmax(eval_output.logits.detach().cpu().numpy(), axis=1).flatten()
		try:
			out.extend(preds)
		except:
			out.append(preds)
		indices.extend(b_ids.cpu().numpy())

	df = pd.DataFrame({'id': indices, 'label': out})
	# convert label to int
	df['label'] = df['label'].astype(int)
	# sort by id
	df = df.sort_values(by='id')
	df.to_csv(output_filename, index=False)
	return df

# inference(model, output_filename='dev_inference.csv', testloader=valloader)
inference(model, output_filename='submission.csv', testloader=testloader)

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,0
3,3,1
4,4,1
5,5,1
6,6,0
7,7,0
8,8,1


## Ad-hoc inference

In [None]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Predicted Class:  Spam


