In [1]:
import pandas as pd
import numpy as np
from torch import nn
import torch

from transformers import (
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

from torch.utils.data import Dataset, DataLoader

In [2]:
data_path = 'data/'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = 'cpu'

# Load data

In [3]:
train = pd.read_csv(data_path + 'train.csv')
val = pd.read_csv(data_path + 'val.csv')
test = pd.read_csv(data_path + 'test.csv')

train = train[train['label'] == 0]
val = val[val['label'] == 0]
test = test[test['label'] == 0]

In [4]:
sample_train = pd.read_csv(data_path + 'sample_train.csv')

# Model

In [68]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
import torch
import os

from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

device = "cuda"
model_name_or_path = "thanathorn/mt5-cpe-kmutt-thai-sentence-sum"
tokenizer_name_or_path = "thanathorn/mt5-cpe-kmutt-thai-sentence-sum"

max_length = 128
lr = 5e-3
epoch_count =  15

peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

trainable params: 368640 || all params: 582769920 || trainable%: 0.06325652497644353




# New Dataset

In [69]:
class Impolite2PoliteDatasetWithPaddingLeft(Dataset):
    def __init__(self, X, y, tokenizer, max_length=128, device='cuda', padding_left=False):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device
        self.padding_left = padding_left

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        input_ids = []
        attention_mask = []
        labels = []
        if self.padding_left:
            tokenized_x = self.tokenizer(self.X[idx], return_tensors='pt', truncation=True, max_length=self.max_length)
            tokenized_y = self.tokenizer(self.y[idx], return_tensors='pt', truncation=True, max_length=self.max_length)
            for i in range(len(tokenized_x['input_ids'])):
                concat_input = torch.cat((torch.zeros(128), tokenized_x['input_ids'][i]))
                input_ids.append(concat_input[-128:])
                concat_attention_mask = torch.cat((torch.zeros(128), tokenized_x['attention_mask'][i]))
                attention_mask.append(concat_attention_mask[-128:])
                concat_labels = torch.cat((torch.zeros(128), tokenized_y['input_ids'][i]))
                labels.append(concat_labels[-128:])
            input_ids = torch.stack(input_ids)
            attention_mask = torch.stack(attention_mask)
            labels = torch.stack(labels)
        else:
            tokenized_x = self.tokenizer(self.X[idx], return_tensors='pt', truncation=True, max_length=self.max_length, padding='max_length')
            tokenized_y = self.tokenizer(self.y[idx], return_tensors='pt', truncation=True, max_length=self.max_length, padding='max_length')
            input_ids = tokenized_x['input_ids']
            attention_mask = tokenized_x['attention_mask']
            labels = tokenized_y['input_ids']
            labels[labels == tokenizer.pad_token_id] = -100
        return {'input_ids': input_ids[0].to(device), 'attention_mask': attention_mask[0].to(device), 'labels': labels[0].to(device)}
        # return {'text': f'formalize: {self.X[idx]}', 'label': self.y[idx]}

In [70]:
impolite2polite_dataset = Impolite2PoliteDatasetWithPaddingLeft(sample_train['text'].tolist(), sample_train['clean'].tolist(), tokenizer, max_length=max_length, device=device)

In [71]:
impolite2polite_dataloader = DataLoader(impolite2polite_dataset, batch_size=1, shuffle=True)

In [64]:
next(iter(impolite2polite_dataloader))

{'input_ids': tensor([[   259,  47820, 219802, 198423,  28456,  36233,   7428,   4388,      1,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
   

# Training

In [72]:
# generation_model = GenerationModel(text_generation_model, text_generation_tokenizer)
# style_transfer_model = StyleTransferModel(len(tokenized_prefix[0]), 10, tokenized_prefix, generation_model.generate_text)
# style_transfer_model = style_transfer_model.to(device)
# style_transfer_model.prefix_model = style_transfer_model.prefix_model.to(device)

# for param in style_transfer_model.prefix_model.parameters():
#     param.requires_grad_()

from evaluate import load

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(impolite2polite_dataloader) * epoch_count),
)
# classification_criterion = nn.CrossEntropyLoss()
# bert_score = load("bertscore")
# bert_score_model = "bert-base-multilingual-cased"

In [74]:
from tqdm import tqdm

epochs = epoch_count

train_losses = []
val_losses = []
for epoch in range(epochs):
    train_loss = []
    print(f'Epoch {epoch}')
    model.train()
    for batch in tqdm(impolite2polite_dataloader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        inp = {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}
        output = model(**inp)
        loss = output.loss
        loss.backward()
        # check if gradients are non-zero
        # for name, param in style_transfer_model.named_parameters():
        #     if param.grad is not None:
        #         print(name, param.grad)
        #     else:
        #         print(name, 'None')
        optimizer.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())
    # style_transfer_model.eval()
    # val_loss = []
    # for batch in tqdm(val_dataloader):
    #     encoding = text_generation_tokenizer(batch, return_tensors='pt', padding='max_length', truncation=True, max_length=256).input_ids
    #     encoding = encoding.to(device).float()
    #     output = style_transfer_model(encoding)
    #     loss = classification_loss(classification_model, classification_tokenizer, classification_criterion, output) + content_loss(bert_score, output, batch, bert_score_model)
    #     val_loss.append(loss.item())
    # # style_transfer_model.save_pretrained(f'./checkpoints/model/{epoch}')
    # torch.save(style_transfer_model.state_dict(), f'./checkpoints/model/{epoch}.pt')
    print(f'Epoch {epoch} train loss: {np.mean(train_loss)}')
    train_losses.append(np.mean(train_loss))

Epoch 0


100%|██████████| 100/100 [00:05<00:00, 18.27it/s]


Epoch 0 train loss: 3.456629946231842
Epoch 1


100%|██████████| 100/100 [00:05<00:00, 19.08it/s]


Epoch 1 train loss: 3.3461527466773986
Epoch 2


100%|██████████| 100/100 [00:05<00:00, 19.34it/s]


Epoch 2 train loss: 3.1647611433267593
Epoch 3


100%|██████████| 100/100 [00:05<00:00, 19.24it/s]


Epoch 3 train loss: 3.2160088127851485
Epoch 4


100%|██████████| 100/100 [00:05<00:00, 19.21it/s]


Epoch 4 train loss: 3.120749422311783
Epoch 5


100%|██████████| 100/100 [00:05<00:00, 19.32it/s]


Epoch 5 train loss: 3.037451884150505
Epoch 6


100%|██████████| 100/100 [00:05<00:00, 19.71it/s]


Epoch 6 train loss: 2.9584812819957733
Epoch 7


100%|██████████| 100/100 [00:05<00:00, 19.39it/s]


Epoch 7 train loss: 2.8917870575189593
Epoch 8


100%|██████████| 100/100 [00:05<00:00, 19.01it/s]


Epoch 8 train loss: 2.9222706896066666
Epoch 9


100%|██████████| 100/100 [00:05<00:00, 19.55it/s]


Epoch 9 train loss: 2.8867884743213654
Epoch 10


100%|██████████| 100/100 [00:05<00:00, 19.21it/s]


Epoch 10 train loss: 2.758167324066162
Epoch 11


100%|██████████| 100/100 [00:05<00:00, 19.35it/s]


Epoch 11 train loss: 2.8021338403224947
Epoch 12


100%|██████████| 100/100 [00:05<00:00, 19.55it/s]


Epoch 12 train loss: 2.748575673699379
Epoch 13


100%|██████████| 100/100 [00:05<00:00, 19.32it/s]


Epoch 13 train loss: 2.706773039996624
Epoch 14


100%|██████████| 100/100 [00:05<00:00, 19.74it/s]

Epoch 14 train loss: 2.7550100407004354





In [75]:
model.save_pretrained('./checkpoints/model/summarize_2')

In [76]:
from peft import PeftModel, PeftConfig

peft_model_id = './checkpoints/model/summarize_2'

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [84]:
model.to(device)
model.eval()
i = 4
word = ['แอคกูปะ', 'ตังออกวันไหน', 'พาไปหน่อยสิ อยากกิน']
for w in word:
    with torch.no_grad():
        inputs = tokenizer(w, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=50, early_stopping=True, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
        print(outputs)
        print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

tensor([[     0,    259,  98179,   4215, 212384, 167595,      1]],
       device='cuda:0')
['แมคกูปะ']
tensor([[    0,   259, 23045, 23248,  4682,     1]], device='cuda:0')
['ออกวันนี้']
tensor([[     0, 129967,  39350,  67641,  52638,  14166,      1]],
       device='cuda:0')
['อยากกินข้าวบ้างครับ']
