In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange, tqdm
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then CPU.
if not torch.cuda.is_available():
    _has_mps = torch.backends.mps.is_available()
    print('Using MPS' if _has_mps else 'Using CPU')
    _backend = "mps" if _has_mps else "cpu"
else:
    print(f'Using GPU: {torch.cuda.get_device_name(0)}')
    _backend = "cuda"

# All tensors and the model are moved to this device later on.
device = torch.device(_backend)

Using MPS


# Data Preparation

In [3]:
# WordPiece tokenizer matching the uncased BERT checkpoint; lower-casing is requested explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [4]:
# Raw training texts, read straight from the CSV for the tokenization preview.
df_train = pd.read_csv('train_2024.csv')
train_text = df_train['text'].values


def print_rand_sentence():
    """Print a grid of the tokens and token IDs of one randomly chosen training text."""
    sample = train_text[random.randint(0, len(train_text) - 1)]
    tokens = tokenizer.tokenize(sample)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # One row per token: (token, id). numpy stores the mixed columns as strings.
    rows = np.array([tokens, token_ids]).T
    print(tabulate(rows, headers=['Tokens', 'Token IDs'], tablefmt='fancy_grid'))


print_rand_sentence()

╒═══════════════╤═════════════╕
│ Tokens        │   Token IDs │
╞═══════════════╪═════════════╡
│ who           │        2040 │
├───────────────┼─────────────┤
│ are           │        2024 │
├───────────────┼─────────────┤
│ you           │        2017 │
├───────────────┼─────────────┤
│ to            │        2000 │
├───────────────┼─────────────┤
│ say           │        2360 │
├───────────────┼─────────────┤
│ they          │        2027 │
├───────────────┼─────────────┤
│ are           │        2024 │
├───────────────┼─────────────┤
│ a             │        1037 │
├───────────────┼─────────────┤
│ hate          │        5223 │
├───────────────┼─────────────┤
│ site          │        2609 │
├───────────────┼─────────────┤
│ ?             │        1029 │
├───────────────┼─────────────┤
│ ex            │        4654 │
├───────────────┼─────────────┤
│ ##tre         │        7913 │
├───────────────┼─────────────┤
│ ##mism        │       26725 │
├───────────────┼─────────────┤
│ ?     

In [5]:
class CustomDataset(TensorDataset):
    """Tokenized text-classification dataset built from a CSV file.

    The CSV must contain a ``text`` column and, for the ``'train'`` and
    ``'val'`` splits, a ``label`` column. Every text is tokenized once in the
    constructor and padded/truncated to ``max_len`` tokens.

    Args:
        tokenizer: tokenizer exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        csv_path: path (or file-like object) passed to ``pd.read_csv``.
        dataset_type: ``'train'`` or ``'val'`` for labelled data; any other
            value is treated as an unlabelled split.
        max_len: sequence length every sample is padded/truncated to.

    Items are ``(input_ids, attention_mask, label)`` for labelled splits and
    ``(input_ids, attention_mask)`` otherwise.
    """

    # Split names for which a `label` column is read and returned.
    LABELLED_TYPES = ('train', 'val')

    def __init__(self, tokenizer, csv_path, dataset_type, max_len=300):
        # TensorDataset.__init__ is not called: __getitem__ and __len__ are
        # overridden below, so `self.tensors` is never read.
        df = pd.read_csv(csv_path)
        texts = df['text'].values
        self.dataset_type = dataset_type
        if self._has_labels():
            self.labels = torch.tensor(df['label'].values)
        else:
            self.labels = None

        # Tokenize every sample; each encoding carries a leading batch dim of 1.
        token_id = []
        attention_masks = []
        for sample in tqdm(texts):
            encoding_dict = preprocessing(sample, tokenizer, max_len)
            token_id.append(encoding_dict['input_ids'])
            attention_masks.append(encoding_dict['attention_mask'])

        if token_id:
            self.token_id = torch.cat(token_id, dim=0)
            self.attention_masks = torch.cat(attention_masks, dim=0)
        else:
            # torch.cat rejects an empty list, so build empty tensors explicitly.
            self.token_id = torch.empty((0, max_len), dtype=torch.long)
            self.attention_masks = torch.empty((0, max_len), dtype=torch.long)

    def _has_labels(self):
        """Return True when this split carries labels."""
        return self.dataset_type in self.LABELLED_TYPES

    def __getitem__(self, index):
        if self._has_labels():
            return self.token_id[index], self.attention_masks[index], self.labels[index]
        return self.token_id[index], self.attention_masks[index]

    def __len__(self):
        # Count samples from the token tensor: `labels` is None for unlabelled
        # splits, so len(self.labels) would raise TypeError there.
        return self.token_id.size(0)


def preprocessing(input_text, tokenizer, max_len=300):
    '''
    Tokenize one text and pad/truncate it to exactly `max_len` tokens.

    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
      - input_ids: tensor of token ids
      - token_type_ids: tensor of token type ids
      - attention_mask: tensor of 0/1 flags specifying which tokens should be considered by the model (return_attention_mask = True).

    The values are PyTorch tensors (return_tensors = 'pt'); CustomDataset
    concatenates them along dim 0, i.e. it expects a leading batch dimension.
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

In [6]:
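# One-off dataset build, kept for reference: tokenizes the three CSV splits and saves them as .pth files.
# trainset = CustomDataset(tokenizer, 'train_2024.csv', dataset_type='train')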
# valset = CustomDataset(tokenizer, 'dev_2024.csv', dataset_type='val')
# testset = CustomDataset(tokenizer, 'test_2024.csv', dataset_type='test')
# # Save the datasets
# torch.save(trainset, 'datasets/bert/trainset.pth')
# torch.save(valset, 'datasets/bert/valset.pth')
# torch.save(testset, 'datasets/bert/testset.pth')

In [7]:
# Load the tokenized datasets from the on-disk cache, building the cache on first use
# so the notebook runs top to bottom on a fresh checkout.
def _load_or_build_dataset(csv_path, dataset_type, cache_path):
    """Return the CustomDataset for one split, using `cache_path` as a cache.

    If `cache_path` exists it is loaded with torch.load; otherwise the split is
    tokenized from `csv_path` and saved to `cache_path` with torch.save.
    """
    if os.path.exists(cache_path):
        # The cache is a pickled CustomDataset object, not a plain tensor/state
        # dict, so it has to be loaded with weights_only=False.
        # NOTE: unpickling can execute arbitrary code; only load files you created.
        return torch.load(cache_path, weights_only=False)
    dataset = CustomDataset(tokenizer, csv_path, dataset_type=dataset_type)
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    torch.save(dataset, cache_path)
    return dataset


trainset = _load_or_build_dataset('train_2024.csv', 'train', 'datasets/bert/trainset.pth')
valset = _load_or_build_dataset('dev_2024.csv', 'val', 'datasets/bert/valset.pth')
testset = _load_or_build_dataset('test_2024.csv', 'test', 'datasets/bert/testset.pth')

In [8]:
# Wrap each split in a DataLoader: random order for training,
# fixed order for validation and test.
BATCH_SIZE = 32

trainloader = DataLoader(trainset, sampler=RandomSampler(trainset), batch_size=BATCH_SIZE)
valloader = DataLoader(valset, sampler=SequentialSampler(valset), batch_size=BATCH_SIZE)
testloader = DataLoader(testset, sampler=SequentialSampler(testset), batch_size=BATCH_SIZE)

# Training

In [9]:
# import precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Accuracy plus macro-averaged precision, recall and F1 for a set of predictions
def compute_metrics(y_true, y_pred):
    """Score predicted labels against ground-truth labels.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned element-wise with `y_true`.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged over the classes.
    """
    # The local must not be named `accuracy_score`: that would shadow the
    # imported function inside this scope and raise UnboundLocalError.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    return accuracy, precision, recall, f1

## BERT

**RECOMMENDED config from the authors:**
```
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4
```

In [10]:
# Pretrained uncased BERT with a 2-class sequence-classification head
# (attention maps and hidden states are not returned).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
# Move the model to the selected device (GPU if available)
model.to(device)

# AdamW optimizer over all model parameters.
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-08)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc='Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(trainloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Forward pass. The model returns an output object, not a tuple:
        # unpacking it would yield its string keys, so read fields by name.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        # Backward pass
        loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Predictions and labels are collected over the whole validation set so the
    # macro metrics are computed once; averaging per-batch macro scores is not
    # equivalent, and a batch may lack one of the classes.
    val_preds = []
    val_labels = []

    for batch in valloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        with torch.no_grad():
            # No labels are passed, so the output carries logits only.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Move predictions and labels to CPU
        val_preds.append(outputs.logits.argmax(dim=1).cpu().numpy())
        val_labels.append(b_labels.cpu().numpy())

    # compute_metrics expects (y_true, y_pred)
    val_accuracy, val_precision, val_recall, val_f1 = compute_metrics(
        np.concatenate(val_labels), np.concatenate(val_preds))

    # max(..., 1) avoids a ZeroDivisionError if the training loader is empty
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / max(nb_tr_steps, 1)))
    print('\t - Validation Accuracy: {:.4f}'.format(val_accuracy))
    print('\t - Validation Precision: {:.4f}'.format(val_precision))
    print('\t - Validation Recall: {:.4f}'.format(val_recall))
    print('\t - Validation F1: {:.4f}'.format(val_f1))

Epoch:   0%|          | 0/2 [00:03<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 17.88 GB, other allocations: 236.70 MB, max allowed: 18.13 GB). Tried to allocate 28.12 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).