In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from data_utils import TextClassificationDataset, TextClassificationDataLoader
from metrics import text_classification_metrics_fn

LOAD MODEL FROM INDOBERT

In [3]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = TextClassificationDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [5]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (50000, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

Prepare Dataset

In [6]:
train_dataset_path = "dataset/data_worthcheck/train.csv"
dev_dataset_path = "dataset/data_worthcheck/dev.csv"
test_dataset_path = "dataset/data_worthcheck/test.csv"

In [7]:
train_dataset = TextClassificationDataset(train_dataset_path, tokenizer)
dev_dataset = TextClassificationDataset(dev_dataset_path, tokenizer)
test_dataset = TextClassificationDataset(test_dataset_path, tokenizer)

train_loader = TextClassificationDataLoader(train_dataset, max_len=512, batch_size=16, num_workers=2, shuffle=True)
dev_loader = TextClassificationDataLoader(dev_dataset, max_len=512, batch_size=16, num_workers=2, shuffle=False)
test_loader = TextClassificationDataLoader(test_dataset, max_len=512, batch_size=16, num_workers=2, shuffle=False)

In [8]:
w2i, i2w = TextClassificationDataset.LABEL2INDEX, TextClassificationDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'no': 0, 'yes': 1}
{0: 'no', 1: 'yes'}


TESTING MODEL ON SENTENCE IN DATASET

In [9]:
text = train_dataset.__getitem__(3)[2] 
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label: {i2w[label]} ({F.softmax(logits, dim=1).squeeze()[label]*100:.2f}%)')

Text: neng solo wes ono terduga corona cobo neng ati mu neng conora | Label: yes (68.77%)


In [10]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [11]:
#TRAINING MODEL

def train(model, train_loader, dev_loader, optimizer, device, epochs=5):
    model.to(device)
    model.train()
    best_acc = 0
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        print('-' * 10)
        train_loss = 0
        train_acc = 0
        train_steps = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            loss, _, _ = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_steps += 1
        train_loss /= train_steps
        train_metrics = evaluate(model, train_loader, device)
        dev_metrics = evaluate(model, dev_loader, device)
        train_acc = train_metrics['accuracy']
        train_f1 = train_metrics['f1']
        train_precision = train_metrics['precision']
        train_recall = train_metrics['recall']
        print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train F1: {train_f1*100:.2f}% | Train Precision: {train_precision*100:.2f}% | Train Recall: {train_recall*100:.2f}%')
      
def evaluate(model, data_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in data_loader:
            _, y_true_batch, y_pred_batch = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
            y_true.extend(y_true_batch)
            y_pred.extend(y_pred_batch)
    # model.train()
    return text_classification_metrics_fn(y_pred, y_true)

def forward_sequence_classification(model, batch, i2w, device):
    input_ids, attention_mask, labels = batch
    input_ids = torch.IntTensor(input_ids).to(device)
    attention_mask = torch.IntTensor(attention_mask).to(device)
    labels = torch.LongTensor(labels).to(device)

    output = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss, logits = output[:2]
    
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, k=1, dim=-1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[labels[j][0].item()])
    
    return loss, list_label, list_hyp

model_result = train(model, train_loader, dev_loader, optimizer, device="cuda", epochs=5)
model_result

Epoch 1/5
----------


100%|██████████| 1351/1351 [04:45<00:00,  4.74it/s]


Train Loss: 0.300 | Train Acc: 92.31% | Train F1: 90.47% | Train Precision: 90.61% | Train Recall: 90.34%
Epoch 2/5
----------


100%|██████████| 1351/1351 [04:42<00:00,  4.78it/s]


Train Loss: 0.200 | Train Acc: 95.29% | Train F1: 94.10% | Train Precision: 94.76% | Train Recall: 93.50%
Epoch 3/5
----------


100%|██████████| 1351/1351 [04:43<00:00,  4.77it/s]


Train Loss: 0.133 | Train Acc: 97.59% | Train F1: 97.06% | Train Precision: 96.54% | Train Recall: 97.63%
Epoch 4/5
----------


100%|██████████| 1351/1351 [04:44<00:00,  4.75it/s]


Train Loss: 0.085 | Train Acc: 98.50% | Train F1: 98.15% | Train Precision: 97.97% | Train Recall: 98.33%
Epoch 5/5
----------


100%|██████████| 1351/1351 [04:43<00:00,  4.76it/s]


Train Loss: 0.061 | Train Acc: 98.56% | Train F1: 98.23% | Train Precision: 97.77% | Train Recall: 98.73%


In [12]:
#Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader)
for batch in pbar:
    loss, y_true, y_pred = forward_sequence_classification(model, batch[:-1], i2w=i2w, device="cuda")
    total_loss += loss.item()
    list_hyp.extend(y_pred)
    list_label.extend(y_true)

#Save Prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('result.txt', index=False)
print(df)

100%|██████████| 175/175 [00:10<00:00, 17.10it/s]

      index label
0         0    no
1         1    no
2         2    no
3         3    no
4         4    no
...     ...   ...
2795   2795    no
2796   2796    no
2797   2797   yes
2798   2798    no
2799   2799   yes

[2800 rows x 2 columns]





Test fine-tuned model on sample sentences using Test Sample

In [13]:
#Get metrics for test data
test_metrics = text_classification_metrics_fn(list_hyp, list_label)

print(f'Test Loss: {total_loss/len(test_loader):.3f} | Test Acc: {test_metrics["accuracy"]*100:.2f}% | Test F1: {test_metrics["f1"]*100:.2f}% | Test Precision: {test_metrics["precision"]*100:.2f}% | Test Recall: {test_metrics["recall"]*100:.2f}%')

Test Loss: 0.400 | Test Acc: 86.32% | Test F1: 82.92% | Test Precision: 81.45% | Test Recall: 85.09%


In [14]:
text = test_dataset.__getitem__(2796)[2]
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: guru hati hati ya virus corona uda indonesia temen yauda liburin aja bu sekolah nya guru loh ngaruh kbm temen kalo diliburin rumah doang risiko tertular virus orang berkurang bu mendingan diliburin | Label : no (85.076%)


Test fine-tuned model on sample sentences using Dev Sample

In [15]:
text = dev_dataset.__getitem__(0)[2]
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: jek dajal ga depok bang | Label : no (99.937%)
