## Setting up

In [1]:
# Set the WANDB_PROJECT environment variable for this kernel, before any
# Weights & Biases run starts (presumably read by the wandb integration to
# pick the project runs are logged under - confirm against the W&B docs).
%env WANDB_PROJECT=asr-project-nlp-part

env: WANDB_PROJECT=asr-project-nlp-part


In [2]:
# !pip install transformers datasets huggingface_hub evaluate wandb pythainlp

In [3]:
# !wandb login

In [4]:
# from huggingface_hub import notebook_login
# notebook_login()

In [5]:
import random

import numpy as np
import torch

seed = 42  # The answer to everything

# Seed every random number generator this notebook touches: Python's
# `random`, NumPy's legacy global RNG, and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask PyTorch to use deterministic algorithms only.
torch.use_deterministic_algorithms(True)


def seed_worker(worker_id):
    """Re-seed Python's `random` and NumPy from the current torch seed.

    Presumably intended as a DataLoader ``worker_init_fn`` (it has the
    matching one-argument signature); no DataLoader in this part of the
    notebook uses it yet.

    Args:
        worker_id: Unused; accepted only to match the callback signature.
    """
    # Fold the torch seed into 32 bits before handing it to the other RNGs.
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

import transformers

# Explicitly seeded torch generator. Nothing in this part of the notebook
# consumes it yet (presumably meant for a DataLoader's `generator=` argument).
g = torch.Generator().manual_seed(seed)

# Let transformers apply its own determinism setup with the same seed.
transformers.enable_full_determinism(seed)

In [6]:
import re
import pandas as pd

import torchtext

from datasets import load_dataset

import evaluate

In [35]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

## Datasets

In [31]:
# Hub identifier of the pretrained checkpoint; the tokenizer and the
# classification model in this notebook are both loaded from it.
model_checkpoint = 'airesearch/wangchanberta-base-att-spm-uncased'

In [9]:
# Load the tokenizer that ships with the checkpoint, pinned to the `main`
# revision.
# NOTE(review): 416 is a magic number - presumably the maximum sequence
# length the checkpoint supports; confirm and consider a named constant.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    revision='main',
    model_max_length=416,
)

# Override the tokenizer's additional special tokens.
# NOTE(review): presumably '<_>' is the checkpoint's space marker - confirm
# against the model card.
tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']

Downloading (…)okenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a ``"text"`` entry; when used through
            ``Dataset.map(..., batched=True)`` this is a batch of rows.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [24]:
# Placeholder dataset: only the `train` split of IMDB is loaded, so `ds` is a
# single split with no held-out data.
# NOTE(review): IMDB is English, while the checkpoint name (WangchanBERTa)
# suggests a Thai model - swap in the real project dataset before drawing
# conclusions from any metric.
ds = load_dataset("imdb", split='train')



In [26]:
# Tokenize the whole dataset with preprocess_function. batched=True hands
# the function batches of rows rather than one row at a time.
tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [27]:
# Collator passed to the Trainer; it is built from the same tokenizer so
# that batches are padded at collation time rather than during tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Model

In [28]:
import evaluate
# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...)
# on it to score predictions.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [29]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level accuracy metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predictions are
            reduced with an argmax over axis 1 (assumed to be per-class
            scores - TODO confirm).

    Returns:
        The result of ``accuracy.compute`` for the arg-maxed predictions
        against ``labels``.
    """
    logits, gold_labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=gold_labels)

In [30]:
# Class index -> label name, plus the inverse lookup derived from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}

In [33]:
# Load the pretrained checkpoint with a sequence-classification head.
# The number of labels is derived from id2label rather than hard-coded, so
# the head size and the label maps cannot drift apart when the label set
# changes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

In [36]:
# Training configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="asr-project-nlp-part",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    push_to_hub=False,
    # Pass the notebook-level seed explicitly: the Trainer reseeds from
    # `args.seed`, which otherwise falls back to the library default and only
    # matches `seed` by coincidence.
    seed=seed,
)

In [38]:
# `tokenized_ds` is a single split, but `training_args` asks for an evaluation
# every epoch, which cannot run without an eval_dataset. Hold out 10% of the
# data (seeded, so the split is reproducible) and train on the remainder
# instead of on the full set.
split_ds = tokenized_ds.train_test_split(test_size=0.1, seed=seed)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnoppakorn[0m ([33mmeen[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
