In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
data_path = 'data/'

### first label

In [4]:
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

In [5]:
train['num_labels'] = train['labels'].apply(lambda x: 1 if x == 'สุภาพ' else 0)
test['num_labels'] = test['labels'].apply(lambda x: 1 if x == 'สุภาพ' else 0)

### second label

In [2]:
from sklearn.model_selection import train_test_split

# data = pd.read_csv('data_path + label.csv')
# data.drop_duplicates(subset=['text'], inplace=True)
# data.drop(data[data['label']== 2].index, inplace=True)
# data.to_csv('data_path + label_clean_with_index.csv')
# data['label'].value_counts()

1.0    12449
0.0     7923
Name: label, dtype: int64

### sample 4000 data

In [29]:
# polite_data = data[data['label']== 1]
# impolite_data = data[data['label']== 0]

# sample_polite = polite_data.sample(n=4000)
# sample_impolite = impolite_data.sample(n=4000)

# sample_polite.to_csv('data_path + sample_polite.csv')
# sample_impolite.to_csv('data_path + sample_impolite.csv')

### load data

In [3]:
polite_data = pd.read_csv('sample_polite.csv').drop(columns=['Unnamed: 0'])
impolite_data = pd.read_csv('sample_impolite.csv').drop(columns=['Unnamed: 0'])

data = pd.concat([polite_data, impolite_data])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['text'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42, stratify=data['label'])

### Dataset

In [5]:
from torch.utils.data import Dataset, DataLoader

class PoliteDataset(Dataset):
    def __init__(self, X, y):
        self.text = X
        self.targets = y

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        target = int(self.targets[idx])

        return text, target

In [6]:
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.to(device)

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(25005, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0): CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
     

### first dataloaders

In [11]:
polite_train_dataset = PoliteDataset(train['text'], train['num_labels'])
train_loader = DataLoader(polite_train_dataset, batch_size=8, shuffle=True)
polite_test_dataset = PoliteDataset(test['text'], test['num_labels'])
test_loader = DataLoader(polite_test_dataset, batch_size=8, shuffle=True)

### second dataloaders

In [7]:
polite_train_dataset = PoliteDataset(X_train, y_train)
train_loader = DataLoader(polite_train_dataset, batch_size=8, shuffle=True)
polite_test_dataset = PoliteDataset(X_test, y_test)
test_loader = DataLoader(polite_test_dataset, batch_size=8, shuffle=True)

### train

In [8]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)



In [9]:
from tqdm import tqdm

epochs = 10

for epoch in range(epochs):
    train_acc = 0
    print(f'Epoch {epoch}')
    for batch in tqdm(train_loader):
        text, target = batch
        encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
        encoding = encoding.to(device)
        target = target.to(device)
        output = model(**encoding, labels=target)
        loss = output.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_acc += (output.logits.argmax(dim=1) == target).sum()
    model.eval()
    test_acc = 0
    for batch in tqdm(test_loader):
        text, target = batch
        encoding = tokenizer(list(text), return_tensors='pt', padding=True, truncation=True, max_length=256)
        encoding = encoding.to(device)
        target = target.to(device)
        output = model(**encoding, labels=target)
        loss = output.loss
        test_acc += (output.logits.argmax(dim=1) == target).sum()
    model.save_pretrained(f'./checkpoints/classifier/new_{epoch}')
    print(f'Epoch {epoch} loss: {loss.item()} train accuracy: {train_acc/len(polite_train_dataset)} test accuracy: {test_acc/len(polite_test_dataset)}')

Epoch 0


100%|██████████| 800/800 [01:33<00:00,  8.57it/s]
100%|██████████| 200/200 [00:06<00:00, 31.50it/s]


Epoch 0 loss: 0.4007788896560669 train accuracy: 0.8709374666213989 test accuracy: 0.887499988079071
Epoch 1


100%|██████████| 800/800 [01:35<00:00,  8.38it/s]
100%|██████████| 200/200 [00:06<00:00, 32.82it/s]


Epoch 1 loss: 0.5014211535453796 train accuracy: 0.9142187237739563 test accuracy: 0.8956249952316284
Epoch 2


100%|██████████| 800/800 [01:44<00:00,  7.66it/s]
100%|██████████| 200/200 [00:06<00:00, 32.97it/s]


Epoch 2 loss: 0.0621986985206604 train accuracy: 0.9437499642372131 test accuracy: 0.8999999761581421
Epoch 3


100%|██████████| 800/800 [01:44<00:00,  7.67it/s]
100%|██████████| 200/200 [00:07<00:00, 27.07it/s]


Epoch 3 loss: 0.0052405293099582195 train accuracy: 0.9718749523162842 test accuracy: 0.9018749594688416
Epoch 4


100%|██████████| 800/800 [01:48<00:00,  7.38it/s]
100%|██████████| 200/200 [00:06<00:00, 31.25it/s]


Epoch 4 loss: 0.46858322620391846 train accuracy: 0.97718745470047 test accuracy: 0.9081249833106995
Epoch 5


100%|██████████| 800/800 [01:46<00:00,  7.50it/s]
100%|██████████| 200/200 [00:06<00:00, 31.34it/s]


Epoch 5 loss: 0.002541386289522052 train accuracy: 0.9885937571525574 test accuracy: 0.9118749499320984
Epoch 6


100%|██████████| 800/800 [01:40<00:00,  7.96it/s]
100%|██████████| 200/200 [00:07<00:00, 28.27it/s]


Epoch 6 loss: 0.0012215058086439967 train accuracy: 0.9792187213897705 test accuracy: 0.8918749690055847
Epoch 7


100%|██████████| 800/800 [01:35<00:00,  8.40it/s]
100%|██████████| 200/200 [00:06<00:00, 33.03it/s]


Epoch 7 loss: 0.8616598844528198 train accuracy: 0.9870312213897705 test accuracy: 0.9118749499320984
Epoch 8


100%|██████████| 800/800 [01:36<00:00,  8.30it/s]
100%|██████████| 200/200 [00:05<00:00, 33.82it/s]


Epoch 8 loss: 1.3001759052276611 train accuracy: 0.9940624833106995 test accuracy: 0.9118749499320984
Epoch 9


100%|██████████| 800/800 [01:43<00:00,  7.71it/s]
100%|██████████| 200/200 [00:06<00:00, 30.38it/s]


Epoch 9 loss: 2.307856559753418 train accuracy: 0.9926562309265137 test accuracy: 0.9118749499320984


In [10]:
model = AutoModelForSequenceClassification.from_pretrained('./checkpoints/classifier/new_5')

In [11]:
model.eval()
inputs = tokenizer("มีเวลาก็หัดศึกษาเองบ้างสิ อีโง่", return_tensors="pt")
output = model(**inputs)
print(output.logits.argmax(dim=1))

tensor([0])


In [12]:
inputs = tokenizer("มีเวลาก็ลองศึกษาด้วยตัวเองดูนะคะ คุณผู้ไม่ฉลาด", return_tensors="pt")
output = model(**inputs)
print(output.logits.argmax(dim=1))

tensor([1])


# something

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

# target is "nice puppet"
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)

In [50]:
text_token = tokenizer("สวัสดีค่ะ วันนี้เป็นวันจันทร์", return_tensors="pt")

In [62]:
out = model2(**text_token)

In [65]:
answer_start = torch.argmax(out.start_logits)
answer_end = torch.argmax(out.end_logits) + 1

In [59]:
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

TypeError: argmax(): argument 'input' (position 1) must be Tensor, not str

In [76]:
from transformers import AutoModelForCausalLM

In [81]:
model3 = AutoModelForCausalLM.from_pretrained(model_name, is_decoder=True)

In [82]:
inputs = tokenizer("วันนี้เป็นวันจันทร์ พรุ่งนี้เป็นวันอะไร", return_tensors="pt")

In [95]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load pre-trained RoBERTa model and tokenizer
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define input and output texts
input_text = "วันนี้วันจันทร์ พรุ่งนี้เป็นวันอะไร"
output_text = "วันพรุ่งนี้เป็นวันอังคาร"

# Tokenize inputs
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output_ids = tokenizer.encode(output_text, return_tensors="pt")

# Generate output
generated_ids = model.generate(input_ids=input_ids, decoder_start_token_id=model.config.decoder.pad_token_id, max_length=128, num_beams=4, early_stopping=True)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Input text:", input_text)
print("Expected output text:", output_text)
print("Generated output text:", generated_text)


If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`


AttributeError: 'CamembertConfig' object has no attribute 'decoder'