In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
data_path = 'data/classification_data/'

### load data

In [27]:
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

### Dataset

In [28]:
from torch.utils.data import Dataset, DataLoader

class PoliteDataset(Dataset):
    def __init__(self, X, y):
        self.text = X
        self.targets = y

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        target = int(self.targets[idx])

        return text, target

In [29]:
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.to(device)

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(25005, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

### second dataloaders

In [31]:
polite_train_dataset = PoliteDataset(train['text'].tolist(), train['label'].tolist())
train_loader = DataLoader(polite_train_dataset, batch_size=8, shuffle=True)
polite_test_dataset = PoliteDataset(test['text'].tolist(), test['label'].tolist())
test_loader = DataLoader(polite_test_dataset, batch_size=8, shuffle=True)

### train

In [30]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)



In [26]:
from tqdm import tqdm

epochs = 10

for epoch in range(epochs):
    train_acc = 0
    print(f'Epoch {epoch}')
    for batch in tqdm(train_loader):
        text, target = batch
        encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
        encoding = encoding.to(device)
        target = target.to(device)
        output = model(**encoding, labels=target)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_acc += (output.logits.argmax(dim=1) == target).sum()
    model.eval()
    test_acc = 0
    for batch in tqdm(test_loader):
        text, target = batch
        encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
        encoding = encoding.to(device)
        target = target.to(device)
        output = model(**encoding, labels=target)
        loss = output.loss
        test_acc += (output.logits.argmax(dim=1) == target).sum()
    model.save_pretrained(f'./checkpoints/classifier/new_{epoch}')
    print(f'Epoch {epoch} loss: {loss.item()} train accuracy: {train_acc/len(polite_train_dataset)} test accuracy: {test_acc/len(polite_test_dataset)}')

Epoch 0


100%|██████████| 800/800 [01:28<00:00,  9.06it/s]
100%|██████████| 200/200 [00:06<00:00, 28.63it/s]


Epoch 0 loss: 0.057998351752758026 train accuracy: 0.8840624690055847 test accuracy: 0.9006249904632568
Epoch 1


100%|██████████| 800/800 [01:30<00:00,  8.84it/s]
100%|██████████| 200/200 [00:08<00:00, 24.32it/s]


Epoch 1 loss: 0.18015776574611664 train accuracy: 0.9267187118530273 test accuracy: 0.9118749499320984
Epoch 2


100%|██████████| 800/800 [01:35<00:00,  8.35it/s]
100%|██████████| 200/200 [00:08<00:00, 24.64it/s]


Epoch 2 loss: 0.14777548611164093 train accuracy: 0.9614062309265137 test accuracy: 0.9131249785423279
Epoch 3


100%|██████████| 800/800 [01:41<00:00,  7.85it/s]
100%|██████████| 200/200 [00:08<00:00, 24.04it/s]


Epoch 3 loss: 0.6844654083251953 train accuracy: 0.9764062166213989 test accuracy: 0.90625
Epoch 4


100%|██████████| 800/800 [01:40<00:00,  7.94it/s]
100%|██████████| 200/200 [00:08<00:00, 23.88it/s]


Epoch 4 loss: 0.0003045824996661395 train accuracy: 0.9640624523162842 test accuracy: 0.9099999666213989
Epoch 5


100%|██████████| 800/800 [01:44<00:00,  7.64it/s]
100%|██████████| 200/200 [00:08<00:00, 24.56it/s]


Epoch 5 loss: 0.3572331368923187 train accuracy: 0.9762499928474426 test accuracy: 0.9024999737739563
Epoch 6


100%|██████████| 800/800 [01:47<00:00,  7.46it/s]
100%|██████████| 200/200 [00:08<00:00, 24.00it/s]


Epoch 6 loss: 1.0296587944030762 train accuracy: 0.9760937094688416 test accuracy: 0.90687495470047
Epoch 7


100%|██████████| 800/800 [01:47<00:00,  7.45it/s]
100%|██████████| 200/200 [00:08<00:00, 23.79it/s]


Epoch 7 loss: 0.2748199701309204 train accuracy: 0.9842187166213989 test accuracy: 0.9024999737739563
Epoch 8


100%|██████████| 800/800 [01:47<00:00,  7.43it/s]
100%|██████████| 200/200 [00:08<00:00, 24.32it/s]


Epoch 8 loss: 0.008516130037605762 train accuracy: 0.9710937142372131 test accuracy: 0.90625
Epoch 9


100%|██████████| 800/800 [01:47<00:00,  7.45it/s]
100%|██████████| 200/200 [00:08<00:00, 23.69it/s]


Epoch 9 loss: 0.0021716293413192034 train accuracy: 0.9876562356948853 test accuracy: 0.9143750071525574


In [27]:
model = AutoModelForSequenceClassification.from_pretrained('./checkpoints/classifier/new_5')

In [28]:
model.eval()
inputs = tokenizer("มีเวลาก็หัดศึกษาเองบ้างสิ อีโง่", return_tensors="pt")
output = model(**inputs)
print(output.logits.argmax(dim=1))

tensor([0])


In [29]:
inputs = tokenizer("มีเวลาก็ลองศึกษาด้วยตัวเองดูนะคะ คุณผู้ไม่ฉลาด", return_tensors="pt")
output = model(**inputs)
print(output.logits.argmax(dim=1))

tensor([1])


# something

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

# target is "nice puppet"
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)

In [31]:
text_token = tokenizer("สวัสดีค่ะ วันนี้เป็นวันจันทร์", return_tensors="pt")

In [None]:
out = model2(**text_token)

In [65]:
answer_start = torch.argmax(out.start_logits)
answer_end = torch.argmax(out.end_logits) + 1

In [None]:
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

In [76]:
from transformers import AutoModelForCausalLM

In [81]:
model3 = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)

In [82]:
inputs = tokenizer("วันนี้เป็นวันจันทร์ พรุ่งนี้เป็นวันอะไร", return_tensors="pt")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load pre-trained RoBERTa model and tokenizer
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define input and output texts
input_text = "วันนี้วันจันทร์ พรุ่งนี้เป็นวันอะไร"
output_text = "วันพรุ่งนี้เป็นวันอังคาร"

# Tokenize inputs
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output_ids = tokenizer.encode(output_text, return_tensors="pt")

# Generate output
generated_ids = model.generate(input_ids=input_ids, decoder_start_token_id=model.config.decoder.pad_token_id, max_length=128, num_beams=4, early_stopping=True)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Input text:", input_text)
print("Expected output text:", output_text)
print("Generated output text:", generated_text)
