In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Directory holding the train/val/test CSV splits. The trailing slash is
# required because file names are appended by plain string concatenation.
data_path = 'data/'

### Load data

In [None]:
# Read the three dataset splits from the CSV files located under `data_path`.
train, val, test = (
    pd.read_csv(data_path + filename)
    for filename in ('train.csv', 'val.csv', 'test.csv')
)

### Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class PoliteDataset(Dataset):
    """Map-style dataset that pairs raw texts with their labels.

    No tokenization happens here: every item is handed back as a plain
    ``(str, int)`` tuple.
    """

    def __init__(self, X, y):
        # X: indexable collection of texts; y: indexable collection of labels.
        self.text, self.targets = X, y

    def __len__(self):
        """Number of samples, taken from the text collection."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the ``(text, target)`` pair at ``idx``, coerced to ``str`` / ``int``."""
        return str(self.text[idx]), int(self.targets[idx])

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Place the model on the selected device.
model.to(device)

### Dataloaders (second version)

In [None]:
# Wrap the text/label columns of each split in a PoliteDataset ...
polite_train_dataset = PoliteDataset(
    train['text'].tolist(),
    train['label'].tolist(),
)
polite_val_dataset = PoliteDataset(
    val['text'].tolist(),
    val['label'].tolist(),
)
polite_test_dataset = PoliteDataset(
    test['text'].tolist(),
    test['label'].tolist(),
)

# ... and build one loader per split. Train and validation batches are
# shuffled; the test loader keeps the DataLoader default (no shuffling).
train_loader = DataLoader(polite_train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(polite_val_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(polite_test_dataset, batch_size=8)

### Train

In [None]:
# `transformers.AdamW` is deprecated in favour of PyTorch's implementation,
# so the optimizer class is imported from torch instead.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the old transformers
# implementation (1e-6 and 0.0); torch's own defaults differ (1e-8 and 0.01)
# and would otherwise silently change the training hyperparameters.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from tqdm import tqdm

In [None]:
epochs = 10

for epoch in range(epochs):
    print(f'Epoch {epoch}')

    # ---- training pass ----
    # Re-enter training mode at the start of every epoch: the evaluation pass
    # below switches the model to eval mode, and without this call every epoch
    # after the first would train with dropout disabled.
    model.train()
    train_acc = 0          # number of correctly classified training samples
    train_loss_sum = 0.0   # sum of per-batch training losses
    for batch in tqdm(train_loader):
        text, target = batch
        # Tokenize the raw strings of this batch; pad to the longest item and
        # truncate anything beyond 256 tokens.
        encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
        encoding = encoding.to(device)
        target = target.to(device)

        # Clear gradients first so leftovers from an interrupted run cannot leak in.
        optimizer.zero_grad()
        output = model(**encoding, labels=target)
        loss = output.loss
        loss.backward()
        optimizer.step()

        # .item() keeps the running totals as plain Python numbers instead of
        # device tensors.
        train_loss_sum += loss.item()
        train_acc += (output.logits.argmax(dim=1) == target).sum().item()

    # ---- evaluation pass ----
    # NOTE(review): `val_loader` is built in an earlier cell but never used;
    # per-epoch monitoring and checkpoint selection would normally use the
    # validation split rather than the test split — confirm intent.
    model.eval()
    test_acc = 0
    with torch.no_grad():  # no gradients needed while scoring
        for batch in tqdm(test_loader):
            text, target = batch
            encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
            encoding = encoding.to(device)
            target = target.to(device)
            output = model(**encoding)
            test_acc += (output.logits.argmax(dim=1) == target).sum().item()

    # One checkpoint per epoch.
    model.save_pretrained(f'./checkpoints/classifier/new_{epoch}')

    # Report the mean training loss of the epoch (previously the loss of the
    # last evaluation batch was printed, which says little about the epoch).
    train_loss = train_loss_sum / max(len(train_loader), 1)
    print(
        f'Epoch {epoch} train loss: {train_loss:.4f} '
        f'train accuracy: {train_acc / len(polite_train_dataset):.4f} '
        f'test accuracy: {test_acc / len(polite_test_dataset):.4f}'
    )

In [None]:
from sklearn.metrics import classification_report

def evaluate(model, test_loader):
    """Score `model` on `test_loader` and return a classification report.

    Args:
        model: sequence-classification model whose forward output exposes
            `.logits`.
        test_loader: iterable yielding `(texts, targets)` batches, where
            `texts` is a sequence of strings and `targets` a tensor of labels.

    Returns:
        The report produced by `sklearn.metrics.classification_report`,
        computed with the true labels first and the predictions second.

    Note:
        Tokenization uses the module-level `tokenizer`.
    """
    y_true = []
    y_pred = []

    # Send inputs to wherever the model currently lives, so the function works
    # whether the model is on the CPU or on a GPU.
    model_device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        for text, target in tqdm(test_loader):
            encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
            encoding = encoding.to(model_device)
            # Only the logits are needed, so no labels are passed to the model.
            output = model(**encoding)

            y_pred += output.logits.argmax(dim=1).tolist()
            y_true += target.tolist()

    # classification_report expects (y_true, y_pred); swapping them exchanges
    # precision and recall in the report.
    report = classification_report(y_true, y_pred)
    return report

In [None]:
# Reload the classifier from the checkpoint the training loop saved for epoch
# index 8, rebinding `model` to the freshly loaded instance.
# NOTE(review): the epoch index is hard-coded — presumably chosen from the
# per-epoch accuracies printed during training; confirm.
model = AutoModelForSequenceClassification.from_pretrained('./checkpoints/classifier/new_8')

In [None]:
# Evaluation is done on the CPU: move the model there, score the test split
# and print the resulting report.
report = evaluate(model.cpu(), test_loader)
print(report)

In [None]:
# Classify a single sentence with the current classifier.
model.eval()
# Put the encoded inputs on the same device as the model to avoid a device mismatch.
inputs = tokenizer("มีเวลาก็หัดศึกษาเองบ้างสิ อีโง่", return_tensors="pt").to(model.device)
# Pure inference: no autograd graph is needed.
with torch.no_grad():
    output = model(**inputs)
print(output.logits.argmax(dim=1))

In [None]:
# Classify a second sentence.
# Put the encoded inputs on the same device as the model to avoid a device mismatch.
inputs = tokenizer("มีเวลาก็ลองศึกษาด้วยตัวเองดูนะคะ คุณผู้ไม่ฉลาด", return_tensors="pt").to(model.device)
# Pure inference: no autograd graph is needed.
with torch.no_grad():
    output = model(**inputs)
print(output.logits.argmax(dim=1))

# Scratch experiments: question answering and causal-LM generation

In [None]:
# Extractive question-answering example: encode a (question, context) pair,
# take the argmax of the start/end logits and decode the tokens in between.
# NOTE(review): this cell reads `start_logits`/`end_logits` and passes
# `start_positions`/`end_positions`, but the only `model` bound in the cells
# above is an AutoModelForSequenceClassification. Presumably a
# question-answering model was loaded in a cell that is no longer here — confirm.
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start and end token positions of the answer span.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# Slice the answer tokens out of the first (only) sequence; the end index is inclusive.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# This value is neither assigned nor printed, and it is not the cell's last
# expression, so the decoded answer is not displayed.
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

# target is "nice puppet"
# NOTE(review): positions 14 and 15 are hard-coded; whether they match
# "nice puppet" depends on the tokenizer in use — verify.
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

# Second forward pass with the target span supplied, then read the loss.
outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)

In [None]:
# Tokenize one Thai sentence into PyTorch tensors; the result is fed to a model in the next cell.
text_token = tokenizer("สวัสดีค่ะ วันนี้เป็นวันจันทร์", return_tensors="pt")

In [None]:
# Forward pass of `model2` on the tokenized sentence.
# NOTE(review): `model2` is not defined in any cell visible in this notebook —
# presumably it was created in a removed cell; on a fresh kernel this raises
# NameError unless it is defined first.
out = model2(**text_token)

In [None]:
# Span boundaries: the most likely start position, and one past the most
# likely end position so the pair can be used as a half-open slice.
answer_start = out.start_logits.argmax()
answer_end = out.end_logits.argmax() + 1

In [None]:
# Decode the predicted span from the same encoding that produced `out`.
# A bare `input_ids` is only assigned in a later cell (for a different text,
# as a 2-D tensor), so the ids are taken from `text_token` instead: first
# sequence of the batch, positions answer_start..answer_end (exclusive).
answer_ids = text_token["input_ids"][0, answer_start:answer_end].tolist()
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the checkpoint named by `model_name` through AutoModelForCausalLM,
# passing is_decoder=True as an extra keyword argument.
# NOTE(review): `model3` is not used by any later cell shown here.
model3 = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)

In [None]:
# Tokenize a prompt into PyTorch tensors, rebinding `inputs`.
# NOTE(review): no later cell shown here reads `inputs`.
inputs = tokenizer("วันนี้เป็นวันจันทร์ พรุ่งนี้เป็นวันอะไร", return_tensors="pt")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the pre-trained checkpoint and its tokenizer for text generation.
# NOTE(review): this rebinds `model_name`, `tokenizer` and `model`, replacing
# the classifier that earlier cells bound to `model`.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# is_decoder=True configures the model for stand-alone causal-LM use, matching
# how the same checkpoint is loaded for `model3` in an earlier cell.
model = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)
model.eval()

# Define input and output texts
input_text = "วันนี้วันจันทร์ พรุ่งนี้เป็นวันอะไร"
output_text = "วันพรุ่งนี้เป็นวันอังคาร"

# Tokenize inputs
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output_ids = tokenizer.encode(output_text, return_tensors="pt")

# Generate output
# The config of this decoder-only model has no `decoder` sub-config, so the
# previous `model.config.decoder.pad_token_id` lookup raised AttributeError.
# `decoder_start_token_id` only applies to encoder-decoder models and is
# dropped; the tokenizer's pad token is passed for padding finished beams.
generated_ids = model.generate(
    input_ids=input_ids,
    pad_token_id=tokenizer.pad_token_id,
    max_length=128,
    num_beams=4,
    early_stopping=True,
)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Input text:", input_text)
print("Expected output text:", output_text)
print("Generated output text:", generated_text)
