## Mendefinisikan hyperparameter

In [32]:
import sys

# Pretrained checkpoint name; used below for both the tokenizer and the model.
MODEL_NAME = "indolem/indobert-base-uncased"
# Quick-run settings: a single epoch over the first SAMPLE rows of each split.
EPOCH = 1
SAMPLE = 25
# Alternative values, presumably for the full run: with sys.maxsize the
# `[:SAMPLE]` slices further down keep every row. Swap with the pair above.
# EPOCH = 16
# SAMPLE = sys.maxsize

SEED = 42  # forwarded to TrainingArguments(seed=...)
BATCH_SIZE = 16  # per-device training batch size
GRADIENT_ACCUMULATION = 4  # gradient accumulation steps passed to TrainingArguments
LEARNING_RATE = 1e-5
MAX_LENGTH = 400  # tokenizer truncation limit (max_length)
STRIDE = 100  # NOTE(review): not referenced in any visible cell -- confirm it is still needed
LOGGING_STEPS = 50
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01

## Instalasi setiap module yang digunakan

In [33]:
!pip install -r requirements.txt

Collecting nusacrowd@ git+https://github.com/IndoNLP/nusa-crowd.git@7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Cloning https://github.com/IndoNLP/nusa-crowd.git (to revision 7748513d20331e72f9969f94f5d43c7f2d4a59a5) to /tmp/pip-install-88wfvpqe/nusacrowd_e0e75e3eb6574602ab327056ee04cc93
  Running command git clone --filter=blob:none -q https://github.com/IndoNLP/nusa-crowd.git /tmp/pip-install-88wfvpqe/nusacrowd_e0e75e3eb6574602ab327056ee04cc93
  Running command git rev-parse -q --verify 'sha^7748513d20331e72f9969f94f5d43c7f2d4a59a5'
  Running command git fetch -q https://github.com/IndoNLP/nusa-crowd.git 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Running command git checkout -q 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Resolved https://github.com/IndoNLP/nusa-crowd.git to commit 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [34]:
# Show the GPUs on this machine and their current memory usage.
!nvidia-smi

Thu Feb 23 13:18:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   41C    P0    58W / 300W |   1372MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   42C    P0    58W / 300W |   2864MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [35]:
import os
# Expose only GPU index 2 to this process (hard-coded; the nvidia-smi output
# above shows memory already in use on GPUs 0 and 1).
# NOTE(review): this cell runs before the cell that imports torch, presumably
# so the setting is in place when CUDA is initialised -- keep that order.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# Turn off tokenizers' internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silence transformers advisory warnings.
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

## Import setiap library yang digunakan

In [36]:
import transformers
import evaluate
import torch
import operator
import ast
import json
import re
import sys

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from torch.utils.data import DataLoader
from datetime import datetime
from huggingface_hub import notebook_login

from datasets import (
  load_dataset, 
  load_from_disk,
  Dataset
)
from transformers import (
  BigBirdTokenizerFast,
  BigBirdForSequenceClassification,
  DataCollatorWithPadding,
  TrainingArguments,
  Trainer,
  BertForSequenceClassification,
  BertForQuestionAnswering,
  AutoModel, 
  BertTokenizerFast,
  AutoTokenizer, 
  AutoModel, 
  BertTokenizer, 
  BertForPreTraining,
  AutoModelForSequenceClassification,
  AutoModelForQuestionAnswering
)

In [37]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Gunakan tokenizer yang sudah pre-trained

In [38]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--indolem--indobert-base-uncased/snapshots/b6663c19a819c04798e7a93d681f9bc34ed57b4a/config.json
Model config BertConfig {
  "_name_or_path": "indolem/indobert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31923
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--indolem--indobert-base-uncased/snapsho

## Import dataset IndoNLI

In [39]:
# Load the IndoNLI dataset; the cells below use its "train", "validation",
# "test_lay" and "test_expert" splits.
data_indonli = load_dataset("indonli")

Reusing dataset indo_nli (/root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)


  0%|          | 0/4 [00:00<?, ?it/s]

## Fungsi utilitas untuk pre-process data IndoNLI

In [40]:
def preprocess_function_indonli(examples, tokenizer, MAX_LENGTH):
    """Tokenize premise/hypothesis pairs from a batch of IndoNLI examples.

    Args:
        examples: Mapping with 'premise' and 'hypothesis' entries.
        tokenizer: Callable tokenizer; receives the premises as the first
            sequence and the hypotheses as the second.
        MAX_LENGTH: Value forwarded as ``max_length``; truncation is enabled.

    Returns:
        Whatever the tokenizer returns for the pair. Token type ids are
        requested via ``return_token_type_ids=True``.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    encoded = tokenizer(
        premises,
        hypotheses,
        max_length=MAX_LENGTH,
        truncation=True,
        return_token_type_ids=True,
    )
    return encoded

## Melakukan tokenisasi data IndoNLI

In [41]:
# Tokenize every split in batches. The raw text columns are dropped afterwards;
# the tokenizer and length limit reach the mapped function through fn_kwargs.
tokenized_data_indonli = data_indonli.map(
    preprocess_function_indonli,
    batched=True,
    load_from_cache_file=True,  # reuse cached results (see the "Loading cached" log lines)
    num_proc=1,
    remove_columns=['premise', 'hypothesis'],
    fn_kwargs={'tokenizer': tokenizer, 'MAX_LENGTH': MAX_LENGTH}
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62/cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62/cache-bdd640fb06671ad1.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62/cache-3eb13b9046685257.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62/cache-23b8c1e9392456de.arrow


In [42]:
# Return input_ids / token_type_ids as torch tensors on `device`;
# output_all_columns=True is requested so the other columns are still returned.
tokenized_data_indonli.set_format("torch", columns=["input_ids", "token_type_ids"], output_all_columns=True, device=device)

In [43]:
# Keep only the first SAMPLE rows of each split, rebuilt as standalone Dataset
# objects from the sliced column dicts.
tokenized_data_indonli_train = Dataset.from_dict(tokenized_data_indonli["train"][:SAMPLE])
tokenized_data_indonli_validation = Dataset.from_dict(tokenized_data_indonli["validation"][:SAMPLE])
# NOTE(review): the two test subsets are not used by any later visible cell.
tokenized_data_indonli_test_lay = Dataset.from_dict(tokenized_data_indonli["test_lay"][:SAMPLE])
tokenized_data_indonli_test_expert = Dataset.from_dict(tokenized_data_indonli["test_expert"][:SAMPLE])

# Tahapan fine-tune IndoNLI di atas IndoBERT

## Fungsi utilitas untuk komputasi metrik

In [44]:
def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the global ``accuracy`` metric.

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``. The predicted
            class is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    scores = accuracy.compute(predictions=predicted_ids, references=references)
    return scores

## Dictionary untuk mapping label

In [45]:
# Class index -> label name for the three NLI classes.
id2label = {
    0: 'entailment',
    1: 'neutral',
    2: 'contradiction',
}
# Inverse mapping, derived so the two dictionaries cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}
# Accuracy metric object; compute_metrics and compute_accuracy read this global.
accuracy = evaluate.load('accuracy')

## Gunakan model Sequence Classification yang sudah pre-trained

In [46]:
# Load the MODEL_NAME checkpoint as a 3-class sequence classifier, with the
# label mappings stored in the model config.
model_sc = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3, 
    id2label=id2label, label2id=label2id)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--indolem--indobert-base-uncased/snapshots/b6663c19a819c04798e7a93d681f9bc34ed57b4a/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3

In [47]:
# Move the model to the selected device.
model_sc = model_sc.to(device)

## Melakukan pengumpulan data dengan padding

In [48]:
# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Mendefinisikan argumen (dataops) untuk training nanti

In [58]:
# Timestamp with a fixed layout: YYYY-MM-DD_HH-MM-SS_microseconds.
# strftime always emits the microsecond field; str(datetime.now()) drops it
# when it is zero, which made the directory name format inconsistent.
TIME_NOW = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")
NAME = 'IndoNLI-data_train-with_IndoLEM'
# Root directory for everything this run produces.
SC = f'./results/{NAME}-{TIME_NOW}'

# Per-artifact sub-directories (kept with a trailing slash, as before).
CHECKPOINT_DIR = f'{SC}/checkpoint/'
MODEL_DIR = f'{SC}/model/'
OUTPUT_DIR = f'{SC}/output/'
ACCURACY_DIR = f'{SC}/accuracy/'

# Hub repository id passed to TrainingArguments(hub_model_id=...).
REPO_NAME = f'fine-tuned-{NAME}'

In [59]:
# Training configuration, built from the constants in the hyperparameter cell.
training_args_sc = TrainingArguments(
    
    # Checkpointing: save once per epoch into CHECKPOINT_DIR.
    output_dir=CHECKPOINT_DIR,
    overwrite_output_dir=True,
    save_strategy='epoch',
    save_total_limit=EPOCH,  # limit equals the number of epochs
    
    # Logging: TensorBoard, every LOGGING_STEPS steps plus the first step.
    report_to='tensorboard',
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=LOGGING_STEPS,
    
    # Training. No eval batch size is set here; the evaluation log reports 8.
    num_train_epochs=EPOCH,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    bf16=False,
    dataloader_num_workers=cpu_count(),  # one data-loading worker per CPU core reported
    
    # Miscellaneous: evaluate every epoch and push results to the Hub repo.
    evaluation_strategy='epoch',
    seed=SEED,
    push_to_hub=True,  # NOTE(review): pushes even for the SAMPLE=25 quick run -- confirm intended
    hub_model_id=REPO_NAME
)

PyTorch: setting up devices


## Mulai training untuk fine-tune IndoNLI di atas IndoBERT

In [60]:
# Build the Trainer from the model, arguments, sampled datasets, padding
# collator and accuracy callback. With push_to_hub=True, constructing it also
# clones the Hub repository (see the "Cloning ..." output below).
trainer_sc = Trainer(
    model=model_sc,
    args=training_args_sc,
    train_dataset=tokenized_data_indonli_train,
    eval_dataset=tokenized_data_indonli_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/muhammadravi251001/fine-tuned-IndoNLI-data_train-with_IndoLEM into local empty directory.


Download file pytorch_model.bin:   0%|          | 7.37k/422M [00:00<?, ?B/s]

Download file runs/Feb22_14-46-36_muhammad-ravi-tensorrt-pod/1677077220.4803088/events.out.tfevents.1677077220…

Download file runs/Feb23_10-02-03_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677146596.muhammad-ravi-tens…

Download file runs/Feb22_14-46-36_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677077220.muhammad-ravi-tens…

Download file runs/Feb23_10-02-03_muhammad-ravi-tensorrt-pod/1677146596.596313/events.out.tfevents.1677146596.…

Clean file runs/Feb22_14-46-36_muhammad-ravi-tensorrt-pod/1677077220.4803088/events.out.tfevents.1677077220.mu…

Clean file runs/Feb23_10-02-03_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677146596.muhammad-ravi-tensorr…

Clean file runs/Feb22_14-46-36_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677077220.muhammad-ravi-tensorr…

Clean file runs/Feb23_10-02-03_muhammad-ravi-tensorrt-pod/1677146596.596313/events.out.tfevents.1677146596.muh…

Download file training_args.bin: 100%|##########| 3.62k/3.62k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##7       | 1.00k/3.62k [00:00<?, ?B/s]

Download file runs/Feb23_13-18-53_muhammad-ravi-tensorrt-pod/1677158397.3992813/events.out.tfevents.1677158397…

Clean file runs/Feb23_13-18-53_muhammad-ravi-tensorrt-pod/1677158397.3992813/events.out.tfevents.1677158397.mu…

Download file runs/Feb23_13-18-53_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677158397.muhammad-ravi-tens…

Clean file runs/Feb23_13-18-53_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677158397.muhammad-ravi-tensorr…

Clean file pytorch_model.bin:   0%|          | 1.00k/422M [00:00<?, ?B/s]

In [61]:
# Run fine-tuning; evaluation and checkpointing happen at the end of each epoch.
trainer_sc.train()

***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 1
  Number of trainable parameters = 110560515


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5588,1.23974,0.08


***** Running Evaluation *****
  Num examples = 25
  Batch size = 8
Saving model checkpoint to ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/checkpoint-1
Configuration saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/checkpoint-1/config.json
Model weights saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/checkpoint-1/pytorch_model.bin
tokenizer config file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/checkpoint-1/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/checkpoint-1/special_tokens_map.json
tokenizer config file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/special_tokens_m

TrainOutput(global_step=1, training_loss=0.5588150024414062, metrics={'train_runtime': 28.448, 'train_samples_per_second': 0.879, 'train_steps_per_second': 0.035, 'total_flos': 956869499628.0, 'train_loss': 0.5588150024414062, 'epoch': 1.0})

## Simpan model Sequence Classification

In [62]:
# Save the fine-tuned model to MODEL_DIR. The log below shows this call also
# writes to the checkpoint directory and uploads to the Hub repository.
trainer_sc.save_model(MODEL_DIR)

Saving model checkpoint to ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/model/
Configuration saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/model/config.json
Model weights saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/model/pytorch_model.bin
tokenizer config file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/model/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/model/special_tokens_map.json
Saving model checkpoint to ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/
Configuration saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/config.json
Model weights saved in ./results/IndoNLI-data_train-with_IndoLEM-2023-02-23_13-28-04_584765/checkpoint/pytorch_model.bin
tokenizer config file saved in ./results/IndoNLI-data_train-with_IndoLE

Upload file runs/Feb23_13-28-05_muhammad-ravi-tensorrt-pod/1677158967.0945685/events.out.tfevents.1677158967.m…

Upload file runs/Feb23_13-28-05_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677158967.muhammad-ravi-tensor…

Upload file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/muhammadravi251001/fine-tuned-IndoNLI-data_train-with_IndoLEM
   1b50ef4..062bcd9  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.08}]}


# Melakukan prediksi dari model

In [54]:
# Run inference on the SAMPLE-row validation subset.
# NOTE(review): the test_lay / test_expert subsets built earlier are never
# evaluated in the visible cells -- confirm whether they should be used here.
predict_result = trainer_sc.predict(tokenized_data_indonli_validation)

***** Running Prediction *****
  Num examples = 25
  Batch size = 8


In [65]:
# Persist the raw prediction output as text for later inspection.
# Create OUTPUT_DIR directly instead of relying on its trailing slash to make
# os.path.dirname() return the directory itself.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Lift NumPy's print threshold while formatting: by default large arrays are
# summarised with "...", which would silently drop predictions on a full run.
# The `with` statement closes the file, so no explicit close() is needed.
with open(os.path.join(OUTPUT_DIR, 'output.txt'), "w") as f, \
        np.printoptions(threshold=sys.maxsize):
    f.write(str(predict_result))

# Melakukan evaluasi dari prediksi

In [56]:
def compute_accuracy(eval_pred):
    """Compute accuracy for a prediction result object.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``
            attributes. The predicted class is the argmax of ``predictions``
            along axis 1.

    Returns:
        The result of the global ``accuracy`` metric's ``compute`` call.
    """
    logits, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(logits, axis=1)
    scores = accuracy.compute(predictions=predicted_ids, references=references)
    return scores

In [57]:
# Accuracy of the predictions produced by trainer_sc.predict above.
accuracy_result = compute_accuracy(predict_result)

In [64]:
# Persist the accuracy result as text.
# Create ACCURACY_DIR directly instead of relying on its trailing slash to
# make os.path.dirname() return the directory itself.
os.makedirs(ACCURACY_DIR, exist_ok=True)
# os.path.join avoids the doubled slash of f'{ACCURACY_DIR}/accuracy.txt';
# the `with` statement closes the file, so no explicit close() is needed.
with open(os.path.join(ACCURACY_DIR, 'accuracy.txt'), "w") as f:
    f.write(str(accuracy_result))