In [1]:
# Mount Google Drive at /content/drive so the project folder used by the next cell is reachable.
# This import only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the project folder: the relative paths used in later cells
# (CSV files, datasets/, models/, outputs/) are resolved against it.
%cd "/content/drive/MyDrive/Colab Notebooks/SNLP/aalto-snlp-project-spring-2024"

/content/drive/.shortcut-targets-by-id/1B0EzOSjRZNGQDfkh7FcoUSO2itFQjxtV/aalto-snlp-project-spring-2024


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange, tqdm
import random
import os

In [4]:
# Choose the compute device once; later cells move the model and the batches to `device`.
# Preference order: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _backend, _banner = "cuda", f'Using GPU: {torch.cuda.get_device_name(0)}'
elif torch.backends.mps.is_available():
    _backend, _banner = "mps", 'Using MPS'
else:
    _backend, _banner = "cpu", 'Using CPU'

print(_banner)
device = torch.device(_backend)

Using GPU: Tesla T4


# Data Preparation

In [5]:
# Tokenizer belonging to the `bert-base-uncased` checkpoint (its files are downloaded on first use).
# Lower-casing of the input text is requested explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Raw training texts, used by the tokenizer demo below.
df_train = pd.read_csv('train_2024.csv')
train_text = df_train['text'].values


def print_rand_sentence():
    '''Print a two-column table (token, token ID) for one randomly chosen training text.'''
    sample = train_text[random.randint(0, len(train_text) - 1)]
    # Tokenize once and reuse the tokens for the ID lookup.
    tokens = tokenizer.tokenize(sample)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    rows = np.array([tokens, token_ids]).T
    print(tabulate(rows, headers=['Tokens', 'Token IDs'], tablefmt='fancy_grid'))


print_rand_sentence()

╒══════════════╤═════════════╕
│ Tokens       │   Token IDs │
╞══════════════╪═════════════╡
│ obama        │        8112 │
├──────────────┼─────────────┤
│ was          │        2001 │
├──────────────┼─────────────┤
│ an           │        2019 │
├──────────────┼─────────────┤
│ ass          │        4632 │
├──────────────┼─────────────┤
│ to           │        2000 │
├──────────────┼─────────────┤
│ romney       │       19615 │
├──────────────┼─────────────┤
│ with         │        2007 │
├──────────────┼─────────────┤
│ the          │        1996 │
├──────────────┼─────────────┤
│ whole        │        2878 │
├──────────────┼─────────────┤
│ candy        │        9485 │
├──────────────┼─────────────┤
│ crowley      │       20748 │
├──────────────┼─────────────┤
│ fia          │       19807 │
├──────────────┼─────────────┤
│ ##sco        │        9363 │
├──────────────┼─────────────┤
│ .            │        1012 │
├──────────────┼─────────────┤
│ christie     │       13144 │
├───────

In [7]:
class CustomDataset(TensorDataset):
    """Tokenized text-classification dataset built from a CSV file.

    The CSV must provide `text` and `id` columns, plus a `label` column when
    `dataset_type` is 'train' or 'val'. Every text is tokenized once in the
    constructor and padded/truncated to `max_len` tokens.

    Args:
        tokenizer: object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
        csv_path: CSV file to read; parsed with `quoting=3`, as in the rest of the notebook.
        dataset_type: 'train' or 'val' for labelled data; any other value means unlabelled.
        max_len: length every sample is padded or truncated to.

    Items returned by indexing:
        labelled:   (token_ids, attention_mask, label, id)
        unlabelled: (token_ids, attention_mask, id)

    Note: `TensorDataset.__init__` is never called here; only `__getitem__` and
    `__len__` defined below are used.
    """

    def __init__(self, tokenizer, csv_path, dataset_type, max_len=300):
        df = pd.read_csv(csv_path, quoting=3)
        self.text = df['text'].values
        self.ids = df['id'].tolist()
        self.dataset_type = dataset_type
        if dataset_type in ('train', 'val'):
            self.labels = torch.tensor(df['label'].values)
        else:
            self.labels = None

        # Tokenize here (same settings as the `preprocessing` helper) so that `max_len`
        # is actually honoured instead of the helper's fixed length of 300.
        token_id = []
        attention_masks = []
        for sample in tqdm(self.text):
            encoding_dict = tokenizer.encode_plus(
                sample,
                add_special_tokens=True,
                max_length=max_len,
                truncation=True,
                padding='max_length',  # replacement for the deprecated pad_to_max_length=True
                return_attention_mask=True,
                return_tensors='pt',
            )
            token_id.append(encoding_dict['input_ids'])
            attention_masks.append(encoding_dict['attention_mask'])

        if token_id:
            # Each encoding has a leading batch dimension of 1; stack them along it.
            self.token_id = torch.cat(token_id, dim=0)
            self.attention_masks = torch.cat(attention_masks, dim=0)
        else:
            # Empty CSV: keep well-formed (0, max_len) tensors instead of failing in torch.cat.
            self.token_id = torch.empty((0, max_len), dtype=torch.long)
            self.attention_masks = torch.empty((0, max_len), dtype=torch.long)

    def __getitem__(self, index):
        if self.dataset_type in ('train', 'val'):
            return self.token_id[index], self.attention_masks[index], self.labels[index], self.ids[index]
        return self.token_id[index], self.attention_masks[index], self.ids[index]

    def __len__(self):
        return len(self.text)

def preprocessing(input_text, tokenizer, max_len=300):
    '''
    Tokenize one text and pad/truncate it to a fixed length.

    Args:
      - input_text: raw text to encode.
      - tokenizer: object exposing `encode_plus` (e.g. BertTokenizer).
      - max_len: target sequence length; defaults to 300, the value that used to be hard-coded.

    Returns <class transformers.tokenization_utils_base.BatchEncoding> holding PyTorch tensors
    (return_tensors = 'pt'). The fields read in this notebook are:
      - input_ids: token ids
      - attention_mask: 0/1 mask specifying which tokens should be considered by the model
        (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        # `pad_to_max_length=True` is deprecated; `padding='max_length'` is its documented replacement.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

In [8]:
# One-off dataset build, kept for reference. It is disabled because the next cell loads the
# cached result; un-comment and run it once to tokenize the CSV files and (re)create the cache.
# trainset = CustomDataset(tokenizer, 'train_2024.csv', dataset_type='train')
# valset = CustomDataset(tokenizer, 'dev_2024.csv', dataset_type='val')
# testset = CustomDataset(tokenizer, 'test_2024.csv', dataset_type='test')
# # Save the datasets
# torch.save(trainset, 'datasets/bert/trainset.pth')
# torch.save(valset, 'datasets/bert/valset.pth')
# torch.save(testset, 'datasets/bert/testset.pth')

In [9]:
# Load the datasets
# The build cell above saves whole dataset objects with torch.save, i.e. these files are
# pickles of `CustomDataset` instances rather than plain tensors. Consequences:
#   * the `CustomDataset` class must already be defined in this session;
#   * `weights_only=False` has to be passed explicitly, because recent PyTorch releases
#     default to `weights_only=True`, which refuses to unpickle custom classes.
# SECURITY: full unpickling can execute arbitrary code - only load files you created yourself.
trainset = torch.load('datasets/bert/trainset.pth', weights_only=False)
valset = torch.load('datasets/bert/valset.pth', weights_only=False)
testset = torch.load('datasets/bert/testset.pth', weights_only=False)

In [14]:
# Prepare DataLoader
# Training batches are drawn in random order; validation and test are read sequentially.
trainloader = DataLoader(trainset, sampler=RandomSampler(trainset), batch_size=32)
valloader = DataLoader(valset, sampler=SequentialSampler(valset), batch_size=16)
testloader = DataLoader(testset, sampler=SequentialSampler(testset), batch_size=1)

# Training

In [11]:
# import precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Compute accuracy plus macro-averaged precision, recall and F1 for a pair of label sequences
def compute_metrics(y_true, y_pred):
    """Score predictions against reference labels.

    Args:
        y_true: reference labels.
        y_pred: predicted labels, in the same order as `y_true`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1 are macro-averaged.
    """
    precision, recall, f1 = (
        metric(y_true, y_pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy, precision, recall, f1

## BERT

**RECOMMENDED config from the authors:**
```
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4
```

In [12]:
# Load the BertForSequenceClassification model
# Two output labels; attention maps and hidden states are not returned by the forward pass.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False
)

# Load finetuned model (after training)
# The checkpoint file must already exist; its weights replace the ones loaded above.
print('Loading finetuned model...')
model.load_state_dict(
    torch.load('models/bert/bert_base_uncased_finetuned.pth', map_location=device)
)
print('Finetuned model loaded.')

# Run on GPU if available
model = model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-08)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading finetuned model...
Finetuned model loaded.


## RoBERTa


In [None]:
# The import cell at the top of the notebook does not bring in the RoBERTa classes, so import
# them here; without this the cell raises NameError on a fresh kernel.
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
# NOTE(review): the datasets loaded earlier come from datasets/bert/ and appear to have been
# tokenized with the BERT tokenizer. RoBERTa uses a different vocabulary, so they would have
# to be rebuilt with this tokenizer before fine-tuning or scoring this model - confirm.
# Run on GPU if available
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08,
                              weight_decay=0.01
                              )

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Finetuning


In [None]:
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

# The scheduler is stepped once per training batch, so it needs the total batch count.
total_steps = len(trainloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Per-epoch history: one entry is appended for every completed epoch.
train_loss = []
val_loss = []
val_accuracy = []
val_precision = []
val_recall = []
val_f1 = []

for i in trange(epochs, desc = 'Epoch'):
    print(f'\n-------Epoch {i}-------')
    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_steps = 0

    print('Training...')
    for step, batch in enumerate(trainloader):
        if step % 50 == 49:
            print(f'Batch: {step}/{len(trainloader)}')
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_ids = batch
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_steps += 1

    # Mean loss per training batch for this epoch
    train_loss.append(tr_loss / nb_tr_steps)

    # Save the model after every epoch (each save overwrites the previous one).
    # NOTE(review): the file name says "roberta" even when `model` is the BERT model, and it
    # differs from the checkpoint the BERT cell loads - confirm the intended path.
    os.makedirs('models/bert', exist_ok=True)

    print(f'Saving model')
    torch.save(model.state_dict(), 'models/bert/roberta_finetuned.pth')
    print(f'Model saved in path models/bert/roberta_finetuned.pth')

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    v_loss = 0
    nb_v_steps = 0
    epoch_preds = []
    epoch_labels = []
    print(f'Evaluation...')
    for batch_id, batch in enumerate(valloader):
        if batch_id % 25 == 24:
            print(f'Batch: {batch_id}/{len(valloader)}')
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_ids = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids = None,
                                attention_mask = b_input_mask,
                                labels=b_labels)
        v_loss += eval_output.loss.item()
        nb_v_steps += 1

        # Keep predictions and labels of every batch; metrics are computed once per epoch below.
        epoch_preds.append(eval_output.logits.argmax(dim=1).cpu().numpy())
        epoch_labels.append(b_labels.cpu().numpy())

    val_loss.append(v_loss / nb_v_steps)

    # Score the whole validation set at once. Averaging per-batch macro precision/recall/F1
    # is not the same quantity: a small batch may miss a class entirely, which distorts the
    # macro average for that batch.
    ep_accuracy, ep_precision, ep_recall, ep_f1 = compute_metrics(
        np.concatenate(epoch_labels), np.concatenate(epoch_preds)
    )
    val_accuracy.append(ep_accuracy)
    val_precision.append(ep_precision)
    val_recall.append(ep_recall)
    val_f1.append(ep_f1)

    print('\n\t - Train loss: {:.4f}'.format(train_loss[-1]))
    print('\t - Val loss: {:.4f}'.format(val_loss[-1]))
    print('\t - Validation Accuracy: {:.4f}'.format(ep_accuracy))
    print('\t - Validation Precision: {:.4f}'.format(ep_precision))
    print('\t - Validation Recall: {:.4f}'.format(ep_recall))
    print('\t - Validation F1: {:.4f}'.format(ep_f1))

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]


-------Epoch 0-------
Training...
Batch: 49/3094
Batch: 99/3094
Batch: 149/3094
Batch: 199/3094
Batch: 249/3094
Batch: 299/3094
Batch: 349/3094
Batch: 399/3094
Batch: 449/3094


Epoch:   0%|          | 0/2 [12:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# plot learning curve
import matplotlib.pyplot as plt

# `train_loss` / `val_loss` hold one mean batch loss per completed epoch, so the x axis is
# the 0-based epoch index used in the training log.
fig, ax = plt.subplots()
ax.plot(train_loss, label='train loss')
ax.plot(val_loss, label='val loss')
ax.set(title='Learning curve', xlabel='Epoch', ylabel='Mean loss per batch')
ax.legend()
plt.show()

# Inference

## Batch Inference

In [None]:
# Rebuild the loaders for inference: validation is scored in batches of 320 samples,
# the test set one sample at a time. Both are read sequentially.
valloader = DataLoader(valset, sampler=SequentialSampler(valset), batch_size=320)
testloader = DataLoader(testset, sampler=SequentialSampler(testset), batch_size=1)

In [15]:
def inference(model, output_filename, testloader):
    """Predict a class for every sample in `testloader` and write the result to a CSV file.

    Args:
        model: `torch.nn.Module` whose forward pass accepts
            `(input_ids, token_type_ids=None, attention_mask=...)` and returns an object
            with a `.logits` attribute.
        output_filename: path of the CSV to write (columns `id`, `label`); a missing parent
            directory is created.
        testloader: sized iterable of batches, each either `(input_ids, attention_mask, ids)`
            or `(input_ids, attention_mask, labels, ids)`. Labels, when present, are ignored.

    Returns:
        pandas.DataFrame with columns `id` and `label` (int), sorted by `id`.

    Raises:
        ValueError: if a batch has neither 3 nor 4 elements.
    """
    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Dropout must be off for prediction. The model can still be in training mode here
    # (e.g. after an interrupted training run), so switch explicitly and restore afterwards.
    was_training = model.training
    model.eval()

    out = []
    indices = []
    try:
        with torch.no_grad():
            for batch_id, batch in enumerate(testloader):
                if batch_id % 25 == 24:
                    print(f'Batch: {batch_id}/{len(testloader)}')
                if len(batch) not in (3, 4):
                    raise ValueError(
                        f'Expected 3 (unlabelled) or 4 (labelled) tensors per batch, got {len(batch)}'
                    )
                # The ids are always the last element, with or without labels in between.
                b_input_ids = batch[0].to(target_device)
                b_input_mask = batch[1].to(target_device)
                b_ids = batch[-1]

                # Forward pass
                eval_output = model(b_input_ids,
                                    token_type_ids = None,
                                    attention_mask = b_input_mask,
                                    )
                out.extend(eval_output.logits.argmax(dim=1).cpu().tolist())
                indices.extend(torch.as_tensor(b_ids).cpu().tolist())
    finally:
        model.train(was_training)

    df = pd.DataFrame({'id': indices, 'label': out})
    # convert label to int
    df['label'] = df['label'].astype(int)
    # sort by id
    df = df.sort_values(by='id')

    out_dir = os.path.dirname(output_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_filename, index=False)
    return df

# Score the validation split; the predictions are written to outputs/dev_inference.csv and
# the returned DataFrame is displayed as the cell output.
inference(model, output_filename='outputs/dev_inference.csv', testloader=valloader)
# inference(model, output_filename='outputs/submission.csv', testloader=testloader)

Batch: 24/86
Batch: 49/86
Batch: 74/86


Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,1
10998,10998,1


In [17]:
!mv dev_inference.csv outputs/dev_inference_bert_based_uncased.csv

In [19]:
# Evaluate the saved validation predictions against the gold labels.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


def compute_metrics(y_true, y_pred):
    """Return `(accuracy, precision, recall, f1)`; precision, recall and F1 are macro-averaged.

    This intentionally has the same return shape as the `compute_metrics` defined in the
    Training section: running this cell rebinds the name, and the training loop unpacks
    four values from it.
    """
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy, precision, recall, f1


# Load the predicted csv (columns: id, label)
pred_df = pd.read_csv('outputs/dev_inference_bert_based_uncased.csv')

# Load the validation df
gold_df = pd.read_csv('dev_2024.csv', quoting=3)

# Pair predictions with gold labels by id instead of by row position, so the comparison
# stays correct even if the two files are not in the same order.
aligned = gold_df[['id', 'label']].merge(
    pred_df[['id', 'label']],
    on='id',
    suffixes=('_true', '_pred'),
    validate='one_to_one',
)
if len(aligned) != len(gold_df):
    raise ValueError(
        f'{len(gold_df) - len(aligned)} validation ids have no prediction in the predicted csv'
    )
y_true = aligned['label_true'].tolist()
y_pred = aligned['label_pred'].tolist()

# Compute the metrics
accuracy, precision, recall, f1 = compute_metrics(y_true, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Precision: 0.9476102362100504
Recall: 0.9538437654367578
F1: 0.9505537974683544


## Ad-hoc inference

In [None]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# Apply the tokenizer. With return_tensors='pt' the encoding already carries a leading batch
# dimension of 1, so the tensors can be fed to the model directly.
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids = encoding['input_ids']
test_attention_mask = encoding['attention_mask']

# Make sure dropout is disabled; the model may still be in training mode
# (e.g. after an interrupted training run).
model.eval()

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

# NOTE(review): 'Spam'/'Ham' look like leftovers from a spam-detection example; confirm what
# classes 1 and 0 mean for this dataset and rename the strings accordingly.
prediction = 'Spam' if np.argmax(output.logits.cpu().numpy(), axis=1).item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

# RoBERTa


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# load tokenizer and model weights
# Note: this rebinds the notebook-wide `tokenizer` and `model` names to the RoBERTa versions.
tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model.to(device)

# prepare the input
# Pass the sentence as a plain string: `encode` documents a list of strings as an already
# tokenized sequence, so wrapping the sentence in a list would look it up as one single token.
batch = tokenizer.encode('you are sonofapollo', return_tensors='pt').to(device)

# inference (no gradients are needed for a prediction)
with torch.no_grad():
    output = model(batch)
np.argmax(output.logits.detach().cpu().numpy(), axis=1).flatten()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: parijatrai/s-nlp-roberta_toxicity_classifier is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# Predict
def inference(model, output_filename, testloader):
    """Predict a class for every sample in `testloader` and write the result to a CSV file.

    NOTE(review): a function with this name is also defined in the "Batch Inference" section;
    whichever cell ran last wins - consider keeping a single definition.

    Args:
        model: `torch.nn.Module` whose forward pass accepts
            `(input_ids, token_type_ids=None, attention_mask=...)` and returns an object
            with a `.logits` attribute.
        output_filename: path of the CSV to write (columns `id`, `label`); a missing parent
            directory is created.
        testloader: sized iterable of batches, each either `(input_ids, attention_mask, ids)`
            or `(input_ids, attention_mask, labels, ids)`. Labels, when present, are ignored.

    Returns:
        pandas.DataFrame with columns `id` and `label` (int), sorted by `id`.

    Raises:
        ValueError: if a batch has neither 3 nor 4 elements.
    """
    # Run on whatever device the model already lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Dropout must be off for prediction. The model can still be in training mode here
    # (e.g. after an interrupted training run), so switch explicitly and restore afterwards.
    was_training = model.training
    model.eval()

    out = []
    indices = []
    try:
        with torch.no_grad():
            for batch_id, batch in enumerate(testloader):
                if batch_id % 25 == 24:
                    print(f'Batch: {batch_id}/{len(testloader)}')
                if len(batch) not in (3, 4):
                    raise ValueError(
                        f'Expected 3 (unlabelled) or 4 (labelled) tensors per batch, got {len(batch)}'
                    )
                # The ids are always the last element, with or without labels in between.
                b_input_ids = batch[0].to(target_device)
                b_input_mask = batch[1].to(target_device)
                b_ids = batch[-1]

                # Forward pass
                eval_output = model(b_input_ids,
                                    token_type_ids = None,
                                    attention_mask = b_input_mask,
                                    )
                out.extend(eval_output.logits.argmax(dim=1).cpu().tolist())
                indices.extend(torch.as_tensor(b_ids).cpu().tolist())
    finally:
        model.train(was_training)

    df = pd.DataFrame({'id': indices, 'label': out})
    # convert label to int
    df['label'] = df['label'].astype(int)
    # sort by id
    df = df.sort_values(by='id')

    out_dir = os.path.dirname(output_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_filename, index=False)
    return df

# Score the validation split with the current `model`. Note that this call writes
# dev_inference.csv into the working directory, not into outputs/ like the call in the
# "Batch Inference" section.
inference(model, output_filename='dev_inference.csv', testloader=valloader)
# inference(model, output_filename='submission.csv', testloader=testloader)

Batch: 24/688
Batch: 49/688
Batch: 74/688
Batch: 99/688
Batch: 124/688
Batch: 149/688
Batch: 174/688
Batch: 199/688
Batch: 224/688
Batch: 249/688
Batch: 274/688
Batch: 299/688
Batch: 324/688
Batch: 349/688
Batch: 374/688
Batch: 399/688
Batch: 424/688
Batch: 449/688
Batch: 474/688
Batch: 499/688
Batch: 524/688
Batch: 549/688
Batch: 574/688
Batch: 599/688
Batch: 624/688
Batch: 649/688
Batch: 674/688


Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,0
10998,10998,0
