In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from data_utils import TextClassificationDataset, TextClassificationDataLoader
from forward_sequence import forward_sequence_classification
from metrics import text_classification_metrics_fn

In [3]:
# def seed_everything(seed):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True

# seed_everything(21092022)

LOAD MODEL FROM INDOBERT

In [4]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = TextClassificationDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Prepare Dataset

In [6]:
train_dataset_path = "dataset/data_worthcheck/train.csv"
dev_dataset_path = "dataset/data_worthcheck/dev.csv"
test_dataset_path = "dataset/data_worthcheck/test.csv"

In [7]:
train_dataset = TextClassificationDataset(train_dataset_path, tokenizer)
dev_dataset = TextClassificationDataset(dev_dataset_path, tokenizer)
test_dataset = TextClassificationDataset(test_dataset_path, tokenizer)

train_loader = TextClassificationDataLoader(train_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=True)
dev_loader = TextClassificationDataLoader(dev_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=False)
test_loader = TextClassificationDataLoader(test_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=False)
# train_dataset.__getitem__(0)

  cpuset_checked))


In [8]:
w2i, i2w = TextClassificationDataset.LABEL2INDEX, TextClassificationDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'no': 0, 'yes': 1}
{0: 'no', 1: 'yes'}


TESTING MODEL ON SENTENCE IN DATASET

In [9]:
text = train_dataset.__getitem__(0)[0] 
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label: {i2w[label]} ({F.softmax(logits, dim=1).squeeze()[label]*100:.2f}%)')

Text: [2, 1709, 22224, 3121, 4684, 8725, 16360, 16811, 4961, 2866, 2140, 3021, 2524, 11777, 25880, 21785, 9244, 3121, 30378, 2866, 741, 844, 2412, 1465, 89, 17754, 15467, 117, 2866, 21785, 448, 1686, 21785, 906, 30378, 2174, 3] | Label: no (56.49%)


In [10]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [11]:
#TRAINING MODEL

def train(model, train_loader, dev_loader, optimizer, device, epochs=5, save_path=None):
    model.to(device)
    model.train()
    best_acc = 0
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        print('-' * 10)
        train_loss = 0
        train_acc = 0
        train_steps = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss, _, _ = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_steps += 1
        train_loss /= train_steps
        train_acc = evaluate(model, train_loader, device)
        dev_acc = evaluate(model, dev_loader, device)
        if dev_acc > best_acc:
            best_acc = dev_acc
            if save_path:
                torch.save(model.state_dict(), save_path)
        print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Dev Acc: {dev_acc*100:.2f}%')
        print()

def evaluate(model, data_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in data_loader:
            _, y_true_batch, y_pred_batch = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            y_true.extend(y_true_batch)
            y_pred.extend(y_pred_batch)
    model.train()
    return text_classification_metrics_fn(y_pred, y_true)["accuracy"]

def forward_sequence_classification(model, batch, i2w, device):
    (subwords, attention_mask, token_type_ids, labels) = batch
    subwords = torch.IntTensor(subwords).to(device)
    attention_mask = torch.IntTensor(attention_mask).to(device)
    token_type_ids = torch.IntTensor(token_type_ids).to(device)
    labels = torch.LongTensor(labels).to(device)
    logits = model(subwords, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
    loss = F.cross_entropy(logits, labels)
    y_true = labels.detach().cpu().numpy().tolist()
    y_pred = torch.topk(logits, k=1, dim=-1)[1].squeeze().detach().cpu().numpy().tolist()
    y_pred = [i2w[label] for label in y_pred]
    return loss, y_true, y_pred

model_result = train(model, train_loader, dev_loader, optimizer, device="cuda", epochs=5, save_path="model/indobert-base-p1.bin")
model_result

Epoch 1/5
----------


100%|█████████▉| 1350/1351 [04:55<00:00,  4.58it/s]


TypeError: ignored