In [50]:
from datasets import load_dataset

ds =  load_dataset('csv', data_files={'train': "augmented.csv",
                                             'test': 'testing_data.csv'})

label_names = sorted(set(labels for labels in ds["train"]["text_label"]))
#classes = [k.replace("_", " ") for k in ds["train"].features["label"].names]

classes = label_names
ds = ds.map(
    batched=True,
    num_proc=1,
)
#classes

In [37]:
ds = load_dataset("ought/raft", "twitter_complaints")
ds["train"].column_names

['Tweet text', 'ID', 'Label']

In [51]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-large")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
#classes[33]
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)


6


In [70]:
import torch

max_length = 518

def preprocess_function(examples, text_column="sentence", label_column="text_label"):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    #model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    #labels = tokenizer(targets, max_length=4, padding="max_length", truncation=True, return_tensors="pt")
    model_inputs = tokenizer(inputs,  padding='max_length', truncation=True, max_length=max_length)
    labels = tokenizer(targets,  padding='target_max_length', truncation=True, max_length=target_max_length)
    #classes = [k.replace("_", " ") for k in ds["train"].features["label"].names]
    classes = sorted(set(labels for labels in ds["train"]["text_label"]))

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [71]:
processed_ds = ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=['label','text_label',"sentence"],
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|                                                                                                                   | 0/1654 [00:00<?, ? examples/s]


ValueError: target_max_length is not a valid PaddingStrategy, please select one of ['longest', 'max_length', 'do_not_pad']

In [72]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_ds = processed_ds["train"]
eval_ds = processed_ds["test"]

batch_size = 16

train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [73]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("roberta-large")

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [67]:
from peft import PrefixTuningConfig, get_peft_model

peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014"

trainable params: 983,040 || all params: 356,395,097 || trainable%: 0.2758287104045093


'trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014'

In [68]:
from transformers import get_linear_schedule_with_warmup

lr = 3e-2
num_epochs = 50

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [69]:
from tqdm import tqdm

device = "cuda"
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|                                                                                                                                                         | 0/104 [00:01<?, ?it/s]


RuntimeError: The expanded size of the tensor (518) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [16, 518].  Tensor sizes: [1, 514]