In [1]:
!pip install --upgrade urllib3

Requirement already up-to-date: urllib3 in /home/omar/.local/lib/python3.8/site-packages (2.1.0)


In [2]:
!pip install -q peft transformers datasets evaluate

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Replace "0" with the index of the GPU you want to use
# os.environ["CUDA_LAUNCH_BLOCKING"]= "1"

In [34]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
from datasets import load_dataset
import evaluate
import torch

features = ['Non-Hate','Hate Speech']

model_name_or_path = "roberta-base"
num_epochs = 5
lr = 1e-3
batch_size = 4

In [35]:
# dataset = load_dataset("glue", task)
# dataset["train"][0]
# {
#     "sentence1": 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
#     "sentence2": 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
#     "label": 1,
#     "idx": 0,
# }

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset,DatasetDict, load_dataset

df = pd.read_csv('../data/MHS/mhs_preprocessed_data.csv')
df = df.dropna()
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
ds_dict = {'train' : train,
           'test' : test}
dataset = DatasetDict(ds_dict)

In [37]:
dataset = dataset.map(
    lambda x: {"text_label": [features[label] for label in x["HS"]]},
    batched=True,
    num_proc=1,
)

Map:   0%|          | 0/108444 [00:00<?, ? examples/s]

Map:   0%|          | 0/27111 [00:00<?, ? examples/s]

In [38]:
dataset["train"]


Dataset({
    features: ['text', 'HS', '__index_level_0__', 'text_label'],
    num_rows: 108444
})

In [39]:
metric = evaluate.load('accuracy')

Using the latest cached version of the module from /home/omar/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Jan 15 21:28:09 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [40]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [41]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    # print( examples)
    outputs = tokenizer(examples["text"], truncation=True, max_length=128)
    return outputs

In [42]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "text_label", "__index_level_0__"],
)

tokenized_datasets = tokenized_datasets.rename_column("HS", "labels")

Map:   0%|          | 0/108444 [00:00<?, ? examples/s]

Map:   0%|          | 0/27111 [00:00<?, ? examples/s]

In [43]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [44]:
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)

In [45]:
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 1351938 || all params: 355662082 || trainable%: 0.38011867680626127"

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 821,506 || all params: 125,468,676 || trainable%: 0.654749875578507


'trainable params: 1351938 || all params: 355662082 || trainable%: 0.38011867680626127'

In [46]:
training_args = TrainingArguments(
    output_dir="your-name/roberta-large-peft-p-tuning",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3925,0.356788,0.872598
2,0.3346,0.33754,0.875696


TrainOutput(global_step=54222, training_loss=0.40081109128581055, metrics={'train_runtime': 2705.4781, 'train_samples_per_second': 80.166, 'train_steps_per_second': 20.042, 'total_flos': 6557946703595520.0, 'train_loss': 0.40081109128581055, 'epoch': 2.0})

In [53]:
# classes = ["not equivalent", "equivalent"]

sentence1 = "these niggers straight bitches"
sentence2 = ""

inputs = tokenizer(sentence1, truncation=False, return_tensors="pt")

inputs.to('cuda')

{'input_ids': tensor([[    0, 29902, 44831,  7188,  1359,   828,  5559,     2]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [54]:
with torch.no_grad():
    outputs = model(**inputs).logits
    print(outputs)

paraphrased_text = torch.softmax(outputs, dim=1).tolist()[0]
for i in range(len(features)):
    print(f"{features[i]}: {int(round(paraphrased_text[i] * 100))}%")
"not equivalent: 4%"
"equivalent: 96%"

tensor([[ 0.7312, -0.7071]], device='cuda:0')
Non-Hate: 81%
Hate Speech: 19%


'equivalent: 96%'