In [7]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset

In [9]:
# Checkpoint to fine-tune. AutoModelForSeq2SeqLM picks the matching
# encoder-decoder architecture for it.
model_name = "Salesforce/codet5-base-multi-sum"

# Keep the tokenizer's default padding side. Forcing padding_side='left' also
# left-pads the target sequences that are used as labels, which puts a run of
# padding between the decoder start token and the first real target token.
# Left padding is a decoder-only generation convention and is not wanted for
# seq2seq teacher-forced training.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [10]:
# Only fall back to EOS-as-padding when the tokenizer ships without a pad
# token. Overwriting an existing pad token unconditionally makes padding
# indistinguishable from the real end-of-sequence token and can leave the
# tokenizer's pad id out of sync with the id stored in the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Read the raw examples from the CSV in the working directory; later cells
# use its "prompt" and "completion" columns.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [12]:
def preprocess_data(example, max_source_length=2048, max_target_length=2048):
    """Tokenize prompt/completion text into seq2seq model inputs.

    Works on a batch (as passed by ``Dataset.map(..., batched=True)``, where
    each column is a list of strings) and on a single example (where each
    column is one string).

    Args:
        example: Mapping with a ``"prompt"`` and a ``"completion"`` entry.
        max_source_length: Length the tokenized prompt is truncated/padded to.
        max_target_length: Length the tokenized completion is truncated/padded to.
            Both default to the previously hard-coded 2048.
            NOTE(review): 2048 may exceed the input length the checkpoint was
            trained with, and padding every row to it is memory hungry - confirm.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` for the prompt, and
        ``"labels"`` holding the completion token ids with every padding
        position replaced by -100.
    """
    source = example["prompt"]
    target = example["completion"]

    source_tokenized = tokenizer(
        source, truncation=True, padding='max_length', max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, truncation=True, padding='max_length', max_length=max_target_length
    )

    # A single string yields flat lists; a batch yields one list per row.
    # Normalise to rows so the masking below has one code path.
    is_single = isinstance(target, str)
    id_rows = target_tokenized["input_ids"]
    mask_rows = target_tokenized["attention_mask"]
    if is_single:
        id_rows, mask_rows = [id_rows], [mask_rows]

    # Replace padding with -100 so the loss ignores it. The attention mask is
    # used instead of comparing against the pad token id, so a genuine EOS token
    # is still learned even when the pad token is the same as the EOS token.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(id_rows, mask_rows)
    ]
    if is_single:
        labels = labels[0]

    return {
        "input_ids": source_tokenized["input_ids"],
        "attention_mask": source_tokenized["attention_mask"],
        "labels": labels,
    }

In [14]:
# Wrap the pandas frame in a Hugging Face Dataset so it can be mapped and split.
hf_dataset = Dataset.from_pandas(df=df)

In [15]:
# Tokenize the whole dataset; batched=True hands preprocess_data lists of
# strings per column rather than one row at a time.
preprocessed_dataset = hf_dataset.map(function=preprocess_data, batched=True)


Map:   0%|                                       | 0/500 [00:00<?, ? examples/s][A
Map: 100%|████████████████████████████| 500/500 [00:00<00:00, 942.99 examples/s][A
                                                                                [A

In [16]:
# Split exactly once. Calling train_test_split separately for "train" and
# "test" shuffles the data twice, so the two subsets come from different
# partitions and validation rows can also appear in the training set.
# A fixed seed keeps the split reproducible across kernel restarts.
dataset_splits = preprocessed_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
val_dataset = dataset_splits["test"]

In [17]:
# Training configuration.
# The run has only 339 optimization steps in total (450 examples, batch size 4,
# 3 epochs), so the former 1000-step eval/save/logging intervals were never
# reached: no validation loss, no checkpoint and no training-loss log were
# ever produced. Evaluate and checkpoint once per epoch instead, and log often
# enough to see the loss curve.
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_codegen_350M_multi",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [18]:
# Assemble the trainer from the pieces built in the previous cells.
trainer = Seq2SeqTrainer(
    # what to train and how
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # data: 90% for optimization, 10% held out for evaluation
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [19]:
# Run fine-tuning with the model, datasets and arguments held by `trainer`.
# The call's return value is the cell's displayed output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: prompt, completion, Unnamed: 0. If prompt, completion, Unnamed: 0 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 450
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 339
  Number of trainable parameters = 222882048
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Downloading pytorch_model.bin:  24%|█▉      | 189M/797M [25:01<1:20:42, 126kB/s]


KeyboardInterrupt: 