In [2]:
import os
# Redirect the Hugging Face cache to the shared datastore. The variable is
# assigned before the transformers import below -- presumably because the
# library reads it at import time, so keep this ordering (TODO confirm).
os.environ["TRANSFORMERS_CACHE"] = "/mnt/swordfish-datastore/wl2787/huggingface_cache/"
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd

[2023-11-28 23:40:20,450] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
class Seq2SeqDataset(Dataset):
    """Map-style dataset that tokenizes parallel input/target strings on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    every one a 1-D tensor padded/truncated to ``max_length``. Padding
    positions in ``labels`` are replaced by ``IGNORE_INDEX`` so the training
    loss is not computed on padding.

    Args:
        input_texts: Indexable sequence of source strings.
        target_texts: Indexable sequence of target strings, aligned by index
            with ``input_texts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length that both inputs and targets are padded/truncated to.
    """

    # Label value skipped by the cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, input_texts, target_texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (driven by ``input_texts``)."""
        return len(self.input_texts)

    def _encode(self, text):
        """Tokenize one string to fixed-length ``pt`` tensors of shape (1, max_length)."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Tokenize and return the example at position ``idx``."""
        input_encoding = self._encode(self.input_texts[idx])
        target_encoding = self._encode(self.target_texts[idx])

        labels = target_encoding['input_ids'].flatten()
        # Without this mask every padded target position would count as a
        # real token the model must predict, which skews the loss towards
        # emitting padding. masked_fill returns a new tensor, so the
        # tokenizer's own output is left untouched.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }


In [4]:
# Checkpoint identifier; the tokenizer and the seq2seq model are both loaded
# from this same identifier so they stay paired.
checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [5]:
# CSV locations for the two splits (each file provides "input" and "output" columns).
train_data_dir = "../data/sentiment_explanation/english/train_cleaned.csv"
test_data_dir = "../data/sentiment_explanation/english/test_cleaned.csv"
# train_data_dir = "../data/train.csv"
# test_data_dir = "../data/train.csv"

# Training split: frame -> parallel text lists -> dataset.
train = pd.read_csv(train_data_dir)
train_input_texts = list(train["input"])
train_target_texts = list(train["output"])
train_dataset = Seq2SeqDataset(
    input_texts=train_input_texts,
    target_texts=train_target_texts,
    tokenizer=tokenizer,
    max_length=512,
)

# Validation split (read from the test CSV): same pipeline.
valid = pd.read_csv(test_data_dir)
valid_input_texts = list(valid["input"])
valid_target_texts = list(valid["output"])
valid_dataset = Seq2SeqDataset(
    input_texts=valid_input_texts,
    target_texts=valid_target_texts,
    tokenizer=tokenizer,
    max_length=512,
)

In [13]:
from torch.utils.data import DataLoader

# Shuffled loader over train_dataset (built in an earlier cell), 2 examples per batch.
# NOTE(review): no other visible cell references train_loader -- the Trainer is
# handed train_dataset directly -- so confirm whether this loader is still needed.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="../results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_dir='../results/logs',
    # Use the string "none" to switch off reporting integrations. Python None
    # is not "off": in Transformers 4.x it falls back to the default of
    # reporting to every installed integration (wandb, tensorboard, ...).
    report_to="none",
    logging_steps=10,
)

# The Trainer receives the datasets directly and builds its own batches from
# the per-device batch sizes configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Run training with the Trainer configured in the preceding cell.
trainer.train()

In [None]:
# Save `model` to ./fine_tuned_model (relative to the notebook's working directory).
# NOTE(review): only the model is saved here; no visible cell saves the tokenizer
# into this directory -- confirm whether it should be stored alongside.
model.save_pretrained("./fine_tuned_model")
