In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from data_utils import TextClassificationDataset, TextClassificationDataLoader
from forward_sequence import forward_sequence_classification
from metrics import text_classification_metrics_fn

In [3]:
# def seed_everything(seed):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True

# seed_everything(21092022)

LOAD MODEL FROM INDOBERT

In [4]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = TextClassificationDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Prepare Dataset

In [6]:
train_dataset_path = "dataset/data_worthcheck/train.csv"
dev_dataset_path = "dataset/data_worthcheck/dev.csv"
test_dataset_path = "dataset/data_worthcheck/test.csv"

In [7]:
train_dataset = TextClassificationDataset(train_dataset_path, tokenizer)
dev_dataset = TextClassificationDataset(dev_dataset_path, tokenizer)
test_dataset = TextClassificationDataset(test_dataset_path, tokenizer)

train_loader = TextClassificationDataLoader(train_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=True)
dev_loader = TextClassificationDataLoader(dev_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=False)
test_loader = TextClassificationDataLoader(test_dataset, max_len=512, batch_size=16, num_workers=16, shuffle=False)
# train_dataset.__getitem__(0)

  cpuset_checked))


In [8]:
w2i, i2w = TextClassificationDataset.LABEL2INDEX, TextClassificationDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'no': 0, 'yes': 1}
{0: 'no', 1: 'yes'}


TESTING MODEL ON SENTENCE IN DATASET

In [18]:
text = train_dataset.__getitem__(3)[2] 
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label: {i2w[label]} ({F.softmax(logits, dim=1).squeeze()[label]*100:.2f}%)')

Text: neng solo wes ono terduga corona cobo neng ati mu neng conora | Label: no (99.75%)


In [10]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [21]:
#TRAINING MODEL

def train(model, train_loader, dev_loader, optimizer, device, epochs=5):
    model.to(device)
    model.train()
    best_acc = 0
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        print('-' * 10)
        train_loss = 0
        train_acc = 0
        train_steps = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss, _, _ = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_steps += 1
        train_loss /= train_steps
        train_acc = evaluate(model, train_loader, device)
        dev_acc = evaluate(model, dev_loader, device)
        if dev_acc > best_acc:
            best_acc = dev_acc
        print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Dev Acc: {dev_acc*100:.2f}%')
        print()

def evaluate(model, data_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in data_loader:
            _, y_true_batch, y_pred_batch = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            y_true.extend(y_true_batch)
            y_pred.extend(y_pred_batch)
    model.train()
    return text_classification_metrics_fn(y_pred, y_true)["accuracy"]

def forward_sequence_classification(model, batch, i2w, device):
    input_ids, attention_mask, labels = batch
    input_ids = torch.IntTensor(input_ids).to(device)
    attention_mask = torch.IntTensor(attention_mask).to(device)
    labels = torch.LongTensor(labels).to(device)

    output = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss, logits = output[:2]
    
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, k=1, dim=-1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[labels[j][0].item()])
    
    return loss, list_label, list_hyp

model_result = train(model, train_loader, dev_loader, optimizer, device="cuda", epochs=5)
model_result

Epoch 1/5
----------


  cpuset_checked))
100%|██████████| 1351/1351 [04:39<00:00,  4.83it/s]
  cpuset_checked))


Train Loss: 0.1163 | Train Acc: 97.56% | Dev Acc: 86.50%

Epoch 2/5
----------


  cpuset_checked))
100%|██████████| 1351/1351 [04:40<00:00,  4.81it/s]
  cpuset_checked))


Train Loss: 0.0894 | Train Acc: 98.17% | Dev Acc: 86.86%

Epoch 3/5
----------


  cpuset_checked))
100%|██████████| 1351/1351 [04:39<00:00,  4.84it/s]
  cpuset_checked))


Train Loss: 0.0699 | Train Acc: 98.27% | Dev Acc: 85.46%

Epoch 4/5
----------


  cpuset_checked))
100%|██████████| 1351/1351 [04:40<00:00,  4.82it/s]
  cpuset_checked))


Train Loss: 0.0593 | Train Acc: 98.83% | Dev Acc: 87.14%

Epoch 5/5
----------


  cpuset_checked))
100%|██████████| 1351/1351 [04:40<00:00,  4.82it/s]
  cpuset_checked))


Train Loss: 0.0520 | Train Acc: 98.63% | Dev Acc: 88.36%



In [24]:
#Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader)
for batch in pbar:
    loss, y_true, y_pred = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
    total_loss += loss.item()
    list_hyp.extend(y_pred)
    list_label.extend(y_true)

#Save Prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('result.txt', index=False)
print(df)

100%|██████████| 175/175 [00:10<00:00, 16.20it/s]

      index label
0         0    no
1         1    no
2         2    no
3         3    no
4         4   yes
...     ...   ...
2795   2795    no
2796   2796    no
2797   2797   yes
2798   2798    no
2799   2799   yes

[2800 rows x 2 columns]





Test fine-tuned model on sample sentences using Test Sample

In [36]:
text = test_dataset.__getitem__(4)[2]
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: doakan indonesia selamat virus corona pkb depok gelar nusantara bershalawat | Label : yes (93.424%)


Test fine-tuned model on sample sentences using Dev Sample

In [35]:
text = dev_dataset.__getitem__(0)[2]
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: jek dajal ga depok bang | Label : no (99.975%)
