In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd "/content/drive/MyDrive/Colab Notebooks/SNLP/aalto-snlp-project-spring-2024"

/content/drive/.shortcut-targets-by-id/1B0EzOSjRZNGQDfkh7FcoUSO2itFQjxtV/aalto-snlp-project-spring-2024


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange, tqdm
import random
import os

In [4]:
if torch.cuda.is_available():
	print(f'Using GPU: {torch.cuda.get_device_name(0)}')
	device = torch.device("cuda")
elif torch.backends.mps.is_available():
	print('Using MPS')
	device = torch.device("mps")
else:
	print('Using CPU')
	device = torch.device("cpu")

Using GPU: Tesla T4


# Data Preparation

In [7]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
df_train = pd.read_csv('train_2024.csv')
train_text = df_train['text'].values

def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(train_text)-1)
  table = np.array([tokenizer.tokenize(train_text[index]),
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒═══════════════╤═════════════╕
│ Tokens        │   Token IDs │
╞═══════════════╪═════════════╡
│ jennifer      │        7673 │
├───────────────┼─────────────┤
│ rubin         │       20524 │
├───────────────┼─────────────┤
│ conservative  │        4603 │
├───────────────┼─────────────┤
│ washington    │        2899 │
├───────────────┼─────────────┤
│ post          │        2695 │
├───────────────┼─────────────┤
│ columnist     │       13317 │
├───────────────┼─────────────┤
│ :             │        1024 │
├───────────────┼─────────────┤
│ "             │        1000 │
├───────────────┼─────────────┤
│ the           │        1996 │
├───────────────┼─────────────┤
│ trump         │        8398 │
├───────────────┼─────────────┤
│ obsession     │       17418 │
├───────────────┼─────────────┤
│ with          │        2007 │
├───────────────┼─────────────┤
│ “             │        1523 │
├───────────────┼─────────────┤
│ un            │        4895 │
├───────────────┼─────────────┤
│ ##mas 

In [9]:
class CustomDataset(TensorDataset):
	def __init__(self, tokenizer, csv_path, dataset_type, max_len=300):
		df = pd.read_csv(csv_path, quoting=3)
		self.text = df['text'].values
		self.ids = df['id'].tolist()
		if dataset_type == 'train' or dataset_type == 'val':
			labels = df['label'].values
			self.labels = torch.tensor(labels)
		else:
			self.labels = None
		self.dataset_type = dataset_type

		# preprocess the data
		self.token_id = []
		self.attention_masks = []

		for sample in tqdm(self.text):
			encoding_dict = preprocessing(sample, tokenizer)
			self.token_id.append(encoding_dict['input_ids'])
			self.attention_masks.append(encoding_dict['attention_mask'])

		self.token_id = torch.cat(self.token_id, dim = 0)
		self.attention_masks = torch.cat(self.attention_masks, dim = 0)

	def __getitem__(self, index):
		if self.dataset_type == 'train' or self.dataset_type == 'val':
			return self.token_id[index], self.attention_masks[index], self.labels[index], self.ids[index]
		else:
			return self.token_id[index], self.attention_masks[index], self.ids[index]

	def __len__(self):
		return len(self.text)

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 300,
						truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [10]:
# trainset = CustomDataset(tokenizer, 'train_2024.csv', dataset_type='train')
# valset = CustomDataset(tokenizer, 'dev_2024.csv', dataset_type='val')
# testset = CustomDataset(tokenizer, 'test_2024.csv', dataset_type='test')
# # Save the datasets
# torch.save(trainset, 'datasets/bert/trainset.pth')
# torch.save(valset, 'datasets/bert/valset.pth')
# torch.save(testset, 'datasets/bert/testset.pth')

In [11]:
# Load the datasets
trainset = torch.load('datasets/bert/trainset.pth')
valset = torch.load('datasets/bert/valset.pth')
testset = torch.load('datasets/bert/testset.pth')

In [12]:
# Prepare DataLoader
trainloader = DataLoader(
            trainset,
            sampler = RandomSampler(trainset),
            batch_size = 32
        )

valloader = DataLoader(
            valset,
            sampler = SequentialSampler(valset),
            batch_size = 16
        )

testloader = DataLoader(
			testset,
			sampler = SequentialSampler(testset),
			batch_size = 1
		)

# Training

In [13]:
# import precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Compute Precision, Recall, and F1 Score of the imported predicted csv and the validation df
def compute_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy, precision, recall, f1

## Focal Loss


In [14]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self,
                 gamma: float = 2,
                 reduction: str = "mean",
                 pos_weight: torch.Tensor = None):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs: torch.Tensor,
                targets: torch.Tensor):
        # Convert the targets to one-hot encoding
        targets_one_hot = torch.zeros_like(inputs)
        targets_one_hot.scatter_(1, targets.unsqueeze(1).to(torch.int64), 1)

        # Compute the binary cross-entropy loss
        ce_loss = F.binary_cross_entropy_with_logits(
            inputs, targets_one_hot, reduction="none", pos_weight=self.pos_weight
        )

        # Compute the focal loss
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == "mean":
            focal_loss = focal_loss.mean()
        elif self.reduction == "sum":
            focal_loss = focal_loss.sum()

        return focal_loss

## BERT

**RECOMMENDED config from the authors:**
```
The optimal hyperparameter values are task-specific, but we found the following range of possible values to work well across all tasks:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4
```

In [15]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Load finetuned model (after training)
print(f'Loading finetuned model...')
model.load_state_dict(torch.load('models/bert/bert_base_uncased_finetuned.pth', map_location=device))
print(f'Finetuned model loaded.')

# Run on GPU if available
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

epochs = 2

total_steps = len(trainloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
										num_warmup_steps = total_steps * 0.1, # Default value in run_glue.py
										num_training_steps = total_steps)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading finetuned model...
Finetuned model loaded.


## RoBERTa


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
# Run on GPU if available
model.to(device)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08,
                              weight_decay=0.01
                              )

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## XLM-RoBERTa


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Load the XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base')

# Run on GPU if available
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,
                              eps = 1e-08,
                              weight_decay=0.01
                              )

epochs = 2
total_steps = len(trainloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=total_steps * 0.1, # 10% of train steps for warm-up
                                            num_training_steps=total_steps)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Finetuning


In [None]:
# # Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
# epochs = 2

# total_steps = len(trainloader) * epochs

# # Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(optimizer,
# 										num_warmup_steps = 0, # Default value in run_glue.py
# 										num_training_steps = total_steps)

# Use focal loss
pos_weight = torch.tensor([1.722248]).to(device)  # Adjust the weight as needed
criterion = FocalLoss(gamma=2.0, reduction='mean', pos_weight=pos_weight)

train_loss = []
val_loss = []

for i in trange(epochs, desc = 'Epoch'):
	print(f'\n-------Epoch {i}-------')
	# ========== Training ==========

	# Set model to training mode
	model.train()

	# Tracking variables
	tr_loss = 0
	nb_tr_examples, nb_tr_steps = 0, 0

	print('Training...')
	for step, batch in enumerate(trainloader):
		if step % 50 == 49:
			print(f'Batch: {step}/{len(trainloader)}')
		batch = tuple(t.to(device) for t in batch)
		b_input_ids, b_input_mask, b_labels, b_ids = batch
		optimizer.zero_grad()
		# Forward pass
		train_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								labels = b_labels)

		# # Use default loss
		# loss = train_output.loss
		# Use custom loss
		loss = criterion(train_output.logits, b_labels)

		# Backward pass
		loss.backward()
		optimizer.step()

		# Update the learning rate.
		scheduler.step()

		# Update tracking variables
		tr_loss += train_output.loss.item()
		nb_tr_examples += b_input_ids.size(0)
		nb_tr_steps += 1

	train_loss.append(tr_loss / nb_tr_steps)

	# Save the model
	# Create output directory if it doesn't exist
	if not os.path.exists('models/bert'):
		os.makedirs('models/bert')

	print(f'Saving model')
	torch.save(model.state_dict(), '/content/roberta_finetuned.pth')
	print(f'Model saved in path /content/roberta_finetuned.pth')

	# ========== Validation ==========

	# Set model to evaluation mode
	model.eval()

	# Tracking variables
	val_accuracy = []
	val_precision = []
	val_recall = []
	val_f1 = []
	v_loss = 0
	nb_v_steps = 0
	print(f'Evaluation...')
	for batch_id, batch in enumerate(valloader):
		if batch_id % 25 == 24:
			print(f'Batch: {batch_id}/{len(valloader)}')
		batch = tuple(t.to(device) for t in batch)
		b_input_ids, b_input_mask, b_labels, b_ids = batch
		with torch.no_grad():
			# Forward pass
			eval_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								labels=b_labels)

		# Use default loss
		# loss = eval_output.loss
		# Use custom loss
		loss = criterion(eval_output.logits, b_labels)

		v_loss += loss.item()
		nb_v_steps += 1

		# Move logits and labels to CPU
		logits = eval_output.logits.detach().cpu().numpy()
		preds = np.argmax(logits, axis=1).flatten()

		label_ids = b_labels.to('cpu').numpy()
		# Calculate validation metrics
		b_accuracy, b_precision, b_recall, b_f1 = compute_metrics(label_ids, preds)
		val_accuracy.append(b_accuracy)
		val_precision.append(b_precision)
		val_recall.append(b_recall)
		val_f1.append(b_f1)

	val_loss.append(v_loss / nb_v_steps)
	print('\n\t - Train loss: {:.4f}'.format(train_loss[-1]))
	print('\t - Val loss: {:.4f}'.format(val_loss[-1]))
	print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
	print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) )
	print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]


-------Epoch 0-------
Training...
Batch: 49/3094
Batch: 99/3094
Batch: 149/3094
Batch: 199/3094


In [None]:
# plot learning curve
import matplotlib.pyplot as plt
plt.plot(train_loss, label='train loss')
plt.plot(val_loss, label='val loss')
plt.legend()
plt.show()

# Inference

## Batch Inference

In [None]:
valloader = DataLoader(
            valset,
            sampler = SequentialSampler(valset),
            batch_size = 320
        )

testloader = DataLoader(
			testset,
			sampler = SequentialSampler(testset),
			batch_size = 1
		)

In [None]:
def inference(model, output_filename, testloader):
	out = []
	indices = []
	for batch_id, batch in enumerate(testloader):
		if batch_id % 25 == 24:
			print(f'Batch: {batch_id}/{len(testloader)}')
		batch = tuple(t.to(device) for t in batch)
		try:
			b_input_ids, b_input_mask, b_ids = batch
		except:
			b_input_ids, b_input_mask, b_labels, b_ids = batch
		with torch.no_grad():
			# Forward pass
			eval_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								)
			preds = np.argmax(eval_output.logits.detach().cpu().numpy(), axis=1).flatten()
		try:
			out.extend(preds)
		except:
			out.append(preds)
		indices.extend(b_ids.cpu().numpy())

	df = pd.DataFrame({'id': indices, 'label': out})
	# convert label to int
	df['label'] = df['label'].astype(int)
	# sort by id
	df = df.sort_values(by='id')
	df.to_csv(output_filename, index=False)
	return df

inference(model, output_filename='outputs/dev_inference.csv', testloader=valloader)
# inference(model, output_filename='outputs/submission.csv', testloader=testloader)

Batch: 24/86
Batch: 49/86
Batch: 74/86


Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,1
10998,10998,1


In [None]:
!mv dev_inference.csv outputs/dev_inference_bert_based_uncased.csv

In [None]:
# import precision_score, recall_score, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score
# Compute Precision, Recall, and F1 Score of the imported predicted csv and the validation df
def compute_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    return precision, recall, f1

# Load the predicted csv
y_pred = pd.read_csv('outputs/dev_inference_bert_based_uncased.csv', index_col=0)
y_pred = y_pred['label'].tolist()

# Load the validation df
y_true = pd.read_csv('dev_2024.csv', quoting=3)
y_true = y_true['label'].tolist()

# Compute the metrics
precision, recall, f1 = compute_metrics(y_true, y_pred)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Precision: 0.9476102362100504
Recall: 0.9538437654367578
F1: 0.9505537974683544


## Ad-hoc inference

In [None]:
new_sentence = 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

# RoBERTa


In [None]:
unitary/unbiased-toxic-roberta

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# load tokenizer and model weights
tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
model.to(device)

# prepare the input
batch = tokenizer.encode(['you are sunshine'], return_tensors='pt').to(device)

# inference
output = model(batch)
print(output)
np.argmax(output.logits.detach().cpu().numpy(), axis=1).flatten()


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SequenceClassifierOutput(loss=None, logits=tensor([[ 4.8771, -5.2565]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


array([0])

In [None]:
# Predict
def inference(model, output_filename, testloader):
	out = []
	indices = []
	for batch_id, batch in enumerate(testloader):
		if batch_id % 25 == 24:
			print(f'Batch: {batch_id}/{len(testloader)}')
		batch = tuple(t.to(device) for t in batch)
		try:
			b_input_ids, b_input_mask, b_ids = batch
		except:
			b_input_ids, b_input_mask, b_labels, b_ids = batch
		with torch.no_grad():
			# Forward pass
			eval_output = model(b_input_ids,
								token_type_ids = None,
								attention_mask = b_input_mask,
								)
			preds = np.argmax(eval_output.logits.detach().cpu().numpy(), axis=1).flatten()
		try:
			out.extend(preds)
		except:
			out.append(preds)
		indices.extend(b_ids.cpu().numpy())

	df = pd.DataFrame({'id': indices, 'label': out})
	# convert label to int
	df['label'] = df['label'].astype(int)
	# sort by id
	df = df.sort_values(by='id')
	df.to_csv(output_filename, index=False)
	return df

inference(model, output_filename='dev_inference.csv', testloader=valloader)
# inference(model, output_filename='submission.csv', testloader=testloader)

Batch: 24/688
Batch: 49/688
Batch: 74/688
Batch: 99/688
Batch: 124/688
Batch: 149/688
Batch: 174/688
Batch: 199/688
Batch: 224/688
Batch: 249/688
Batch: 274/688
Batch: 299/688
Batch: 324/688
Batch: 349/688
Batch: 374/688
Batch: 399/688
Batch: 424/688
Batch: 449/688
Batch: 474/688
Batch: 499/688
Batch: 524/688
Batch: 549/688
Batch: 574/688
Batch: 599/688
Batch: 624/688
Batch: 649/688
Batch: 674/688


Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,0
10998,10998,0


# Mistral


In [None]:
!pip install -q accelerate==0.21.0 \
                peft==0.4.0 \
                bitsandbytes==0.40.2 \
                transformers==4.31.0 \
                trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m143.4/244.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [None]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
Successfully installed tokenizers-0.15.2 transformers-4.39.3


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = "cuda" # the device to load the model onto

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    trust_remote_code=True,
    device_map="auto",
    load_in_8bit=True,# For 8 bit quantization,
    max_memory={0:"15GB"})
model.eval()
model = torch.compile(model, mode = "max-autotune", backend="inductor")

In [None]:
import re

def create_message(text):
  prompt = f"""
  Classify the following text into toxic or non toxic.
  Output in this format:
  ```
  {{'label': 0}}
  ```
   with 0 for non toxic and 1 for toxic
  Sentence: '{text}'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
  """

  messages = [
    {"role": "user", "content": prompt},
  ]
  return messages

def inference(text):
  messages = create_message(text)
  encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

  model_inputs = encodeds.to(device)
  model.to(device)

  generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
  decoded = tokenizer.batch_decode(generated_ids)

  # Postprocessing
  text = decoded[0]
  print(f'raw text: {text}')
  answer = text.split('[/INST]')[-1]
  answer = answer.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
  answer = re_transform(answer)
  try:
    print(f'answer: {answer}')
    post_process_response = eval(answer)
    print(f'post process: {post_process_response}')
  except Exception as e:
    print(f'Exception: {e}')
    post_process_response = {
        'text': text,
        'label': 0
    }
  return post_process_response

def re_transform(text):
  # Define the pattern
  pattern = r"\{'label': .*}"

  # Find the first match
  match = re.search(pattern, text)

  # Extract the match
  if match:
    first_match = match.group()
    return first_match
  else:
    return None


In [None]:
output = inference("was an hour before kicked to death by the savage and brutal mob.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: 'was an hour before kicked to death by the savage and brutal mob.'
  Bind strictly to the output format, do not generate any text apart from that.
  Be sensible, if something touches sensible topics like death, someone's life was taken or anything related.
   [/INST] {'text': 'was an hour before kicked to death by the savage and brutal mob.', 'label': 1}</s>
answer: {'text': 'was an hour before kicked to death by the savage and brutal mob.', 'label': 1}
post process: {'text': 'was an hour before kicked to death by the savage and brutal mob.', 'label': 1}


In [None]:
print(output['text'])
print(output['label'])

was an hour before kicked to death by the savage and brutal mob.
1


In [None]:
def inference_batch(df, output_filename):
  texts = df['text'].tolist()
  ids = df['id'].tolist()
  preds = []
  indices = []
  for id, text in tqdm(zip(ids, texts)):
    output = inference(text)
    preds.append(output['label'])
    indices.append(id)
    if len(indices) > 5:
      break

  df = pd.DataFrame({'id': indices, 'label': preds})
  # convert label to int
  df['label'] = df['label'].astype(int)
  # sort by id
  df = df.sort_values(by='id')
  df.to_csv(output_filename, index=False)
  return df

In [None]:
val_df = pd.read_csv('failed_indices.csv', quoting=3)
inference_batch(val_df, 'outputs/mistral_dev_inference.csv')

0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
1it [00:15, 15.20s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: '���I ain���t drunk.��� ���Then you���re mad; that���s worse'.'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': "'I ain't drunk. Then you're mad; that's worse.'", 'label': 0}</s>
answer: {'text': "'I ain't drunk. Then you're mad; that's worse.'", 'label': 0}
post process: {'text': "'I ain't drunk. Then you're mad; that's worse.'", 'label': 0}


2it [00:22, 10.76s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: 'Mm. That tastes like balls.'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': "'Mm. That tastes like balls.'", 'label': 0}</s>
answer: {'text': "'Mm. That tastes like balls.'", 'label': 0}
post process: {'text': "'Mm. That tastes like balls.'", 'label': 0}


3it [00:37, 12.53s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: 'Do you fell better now  ready to put the pacifier back in your mouth?'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': 'Do you fell better now  ready to put the pacifier back in your mouth?', 'label': 0}</s>
answer: {'text': 'Do you fell better now  ready to put the pacifier back in your mouth?', 'label': 0}
post process: {'text': 'Do you fell better now  ready to put the pacifier back in your mouth?', 'label': 0}


4it [00:51, 13.04s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: 'He does that for the little fat boy who orders these movies off of Netflix.  That way he knows who is boss!  Make my day jman! lol!'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': 'He does that for the little fat boy who orders these movies off of Netflix. That way he knows who is boss! Make my day jman! lol!', 'label': 0}</s>
answer: {'text': 'He does that for the little fat boy who orders these movies off of Netflix. That way he knows who is boss! Make my day jman! lol!', 'label': 0}
post process: {'text': 'He does that for the little fat boy who orders these movies off of Netflix. That way he knows who is boss! Make my day jman! lol!', 'label': 0}


5it [01:11, 15.53s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: '"Wow. What a hateful person you seem to be. I thought ""young"" people didn't like labeling people. Someday your so called status symbols will be ""tired""."'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': '"Wow. What a hateful person you seem to be. I thought ""young""" people didn't like labeling people. Someday your so called status symbols will be ""tired""."', 'label': 1}</s>
answer: {'text': '"Wow. What a hateful person you seem to be. I thought ""young""" people didn't like labeling people. Someday your so called status symbols will be ""tired""."', 'label': 1}
Exception: unterminated string literal (detected at line 1) (<string>, line 1)


5it [01:20, 16.03s/it]

raw text: <s> [INST] 
  Classify the following text into toxic or non toxic.
  Output in this format: {'text': <text>, 'label': 0} with 0 for non toxic and 1 for toxic
  Sentence: 'The Trump base: deluded and delusional.'
  Bind strictly to the output format, do not generate any text apart from that.
  Imagine you are a sensitive and vulnerable person that could spot out toxicity even it's not too clear.
   [/INST] {'text': 'The Trump base: deluded and delusional.', 'label': 1}</s>
answer: {'text': 'The Trump base: deluded and delusional.', 'label': 1}
post process: {'text': 'The Trump base: deluded and delusional.', 'label': 1}





Unnamed: 0,id,label
0,31,0
1,49,0
2,51,0
3,57,0
4,61,0
5,63,1
