In [1]:
import os.path
import wget

def download_if_missing(url, filename):
    """Download `url` into the working directory unless `filename` already exists.

    Args:
        url: Remote location of the file to fetch.
        filename: Name of the file expected in the current working directory.
    """
    if os.path.exists(f'./{filename}'):
        print(f"File {filename} sudah ada")
        return
    wget.download(url)
    # The leading space separates the message from wget's progress output.
    print(f" Selesai download {filename}")


# (url, local file name) pairs for the translated and augmented data files.
# NOTE(review): the dev files are fetched through `raw/` and the train files
# through `resolve/`; confirm `raw/` still serves the actual data (not a Git
# LFS pointer) if the dev files ever grow.
DATASET_FILES = [
    ("https://huggingface.co/datasets/muhammadravi251001/translated-indo-nli/raw/main/dev.jsonl",
     "dev.jsonl"),
    ("https://huggingface.co/datasets/muhammadravi251001/translated-indo-nli/resolve/main/train.jsonl",
     "train.jsonl"),
    ("https://huggingface.co/datasets/muhammadravi251001/augmented-indo-nli/raw/main/dev_augmented.jsonl",
     "dev_augmented.jsonl"),
    ("https://huggingface.co/datasets/muhammadravi251001/augmented-indo-nli/resolve/main/train_augmented.jsonl",
     "train_augmented.jsonl"),
]

for dataset_url, dataset_filename in DATASET_FILES:
    download_if_missing(dataset_url, dataset_filename)

## Mendefinisikan hyperparameter

In [2]:
import sys

# Smoke-test switch. True: 1 epoch on the first 25 examples of each split.
# False: the full configuration (16 epochs, no cap on the number of examples).
QUICK_RUN = True

# Checkpoint name used for both the tokenizer and the classification model.
MODEL_NAME = "indolem/indobert-base-uncased"
EPOCH = 1 if QUICK_RUN else 16
# Upper bound on the number of examples kept per split.
SAMPLE = 25 if QUICK_RUN else sys.maxsize

SEED = 42
# Per-device batch size; one optimizer step covers
# BATCH_SIZE * GRADIENT_ACCUMULATION examples per device.
BATCH_SIZE = 16
GRADIENT_ACCUMULATION = 4
LEARNING_RATE = 1e-5
# Passed to the tokenizer as max_length (longer pairs are truncated).
MAX_LENGTH = 400
# NOTE(review): STRIDE is not referenced by any cell visible here - confirm it is still needed.
STRIDE = 100
LOGGING_STEPS = 50
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01

## Instalasi setiap module yang digunakan

In [3]:
!pip install -r requirements.txt

Collecting nusacrowd@ git+https://github.com/IndoNLP/nusa-crowd.git@7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Cloning https://github.com/IndoNLP/nusa-crowd.git (to revision 7748513d20331e72f9969f94f5d43c7f2d4a59a5) to /tmp/pip-install-n744k29s/nusacrowd_932572adbde84642a30acfbc97642977
  Running command git clone --filter=blob:none -q https://github.com/IndoNLP/nusa-crowd.git /tmp/pip-install-n744k29s/nusacrowd_932572adbde84642a30acfbc97642977
  Running command git rev-parse -q --verify 'sha^7748513d20331e72f9969f94f5d43c7f2d4a59a5'
  Running command git fetch -q https://github.com/IndoNLP/nusa-crowd.git 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Running command git checkout -q 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Resolved https://github.com/IndoNLP/nusa-crowd.git to commit 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
# Show the GPUs on this machine and their current memory / utilisation.
!nvidia-smi

Thu Feb 23 14:09:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   37C    P0    57W / 300W |   1372MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   36C    P0    42W / 300W |     11MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [5]:
import os
# Process-wide settings, exported through the environment in a single update:
#   CUDA_VISIBLE_DEVICES              - GPU index exposed to this process
#   TOKENIZERS_PARALLELISM            - "false" turns tokenizer parallelism off
#   TRANSFORMERS_NO_ADVISORY_WARNINGS - "true" silences advisory warnings
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    "TOKENIZERS_PARALLELISM": "false",
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': 'true',
})

## Import setiap library yang digunakan

In [6]:
import transformers
import evaluate
import torch
import operator
import ast
import json
import re
import sys

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from torch.utils.data import DataLoader
from datetime import datetime
from huggingface_hub import notebook_login

from datasets import (
  load_dataset, 
  load_from_disk,
  Dataset,
  DatasetDict
)
from transformers import (
  BigBirdTokenizerFast,
  BigBirdForSequenceClassification,
  DataCollatorWithPadding,
  TrainingArguments,
  Trainer,
  BertForSequenceClassification,
  BertForQuestionAnswering,
  AutoModel, 
  BertTokenizerFast,
  AutoTokenizer, 
  AutoModel, 
  BertTokenizer, 
  BertForPreTraining,
  AutoModelForSequenceClassification,
  AutoModelForQuestionAnswering
)

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Gunakan tokenizer yang sudah pre-trained

In [8]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Import dataset IndoNLI

In [9]:
# Read the training split (one JSON object per line), keep only the sentence
# pair and its gold label, and rename them to premise / hypothesis / label.
data_train = (
    pd.read_json(path_or_buf='train.jsonl', lines=True)
    .loc[:, ['sentence1', 'sentence2', 'gold_label']]
    .rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis', 'gold_label': 'label'})
)

# Encode the string labels as integer class ids; any other value is left untouched.
# NOTE(review): the id2label / label2id dictionaries given to the model must
# use this same encoding.
data_train['label'] = data_train['label'].replace(
    {'entailment': 0, 'contradiction': 1, 'neutral': 2}
)

data_train

Unnamed: 0,premise,hypothesis,label
0,secara konsep krim skimming memiliki dua dimen...,produk dan geografi adalah apa yang membuat cr...,2
1,anda tahu selama musim dan saya kira di tingka...,anda kehilangan hal-hal ke tingkat berikut jik...,0
2,salah satu nomor kami akan menjalankan instruk...,seorang anggota timku akan melaksanakan perint...,0
3,"bagaimana kau tahu, semua ini adalah informasi...",informasi ini milik mereka.,0
4,ya saya katakan bagaimana meskipun jika anda p...,sepatu tenis memiliki kisaran harga.,2
...,...,...,...
392697,"jelas, california bisa - dan harus - melakukan...",california tidak bisa melakukan yang lebih baik.,1
392698,pernah dianggap sebagai jalan terindah di erop...,banyak bangunan asli yang telah digantikan ole...,2
392699,perahu rumah adalah tradisi yang terpelihara d...,tradisi perahu rumah berasal saat raj inggris ...,0
392700,obituaries fondly recall his on-air debates an...,obituari-obituari itu indah dan ditulis dengan...,2


In [10]:
# Read the validation split (one JSON object per line), keep only the sentence
# pair and its gold label, and rename them to premise / hypothesis / label.
data_validation = (
    pd.read_json(path_or_buf='dev.jsonl', lines=True)
    .loc[:, ['sentence1', 'sentence2', 'gold_label']]
    .rename(columns={'sentence1': 'premise', 'sentence2': 'hypothesis', 'gold_label': 'label'})
)

# Encode the string labels as integer class ids; any other value is left untouched.
# NOTE(review): the id2label / label2id dictionaries given to the model must
# use this same encoding.
data_validation['label'] = data_validation['label'].replace(
    {'entailment': 0, 'contradiction': 1, 'neutral': 2}
)

data_validation

Unnamed: 0,premise,hypothesis,label
0,hak-hak baru cukup bagus,semua orang benar-benar menyukai manfaat terbaru,2
1,situs ini mencakup daftar semua pemenang pengh...,artikel eksekutif pemerintah yang disimpan di ...,1
2,eh aku tidak tahu aku punya emosi campur aduk ...,"aku menyukainya untuk sebagian besar, tapi mas...",0
3,ya saya pikir restoran favorit saya selalu men...,restoran favorit saya selalu setidaknya seratu...,1
4,aku tidak tahu um apakah anda melakukan banyak...,aku tahu persis.,1
...,...,...,...
19995,apakah anda menonton itu?,bisa kau lihat?,1
19996,"bagi telinga barat, sifat-sifat bahasa yang pa...","bagi telinga barat, sifat bahasa yang paling t...",1
19997,"pencatat menangkap suara tiup keras, tabrakan,...",pencatat tidak menangkap suara apapun.,1
19998,itu sikap yang baik!,"anda merasa baik tentang hal ini, bukan?",2


In [11]:
# Drop the examples whose gold label is '-' (no usable label), then make the
# label column an explicit integer column: a column that mixes ints with the
# '-' string is held as `object`, and the cast also fails loudly on any
# unexpected label value.
data_train = data_train[data_train.label != '-'].astype({'label': 'int64'})
data_validation = data_validation[data_validation.label != '-'].astype({'label': 'int64'})

# from_pandas is the DataFrame constructor; preserve_index=False keeps the
# (now non-contiguous) pandas index out of the resulting dataset.
train_dataset = Dataset.from_pandas(data_train, preserve_index=False)
validation_dataset = Dataset.from_pandas(data_validation, preserve_index=False)

data_indonli_translated = DatasetDict({"train": train_dataset, "validation": validation_dataset})

In [12]:
# Alias: the following cells work on `data_indonli`; here it is the translated data.
data_indonli = data_indonli_translated

## Fungsi utilitas untuk pre-process data IndoNLI

In [13]:
def preprocess_function_indonli(examples, tokenizer, MAX_LENGTH):
    """Tokenize premise/hypothesis pairs.

    Args:
        examples: Mapping with 'premise' and 'hypothesis' entries.
        tokenizer: Callable invoked as tokenizer(premise, hypothesis, **options).
        MAX_LENGTH: Forwarded to the tokenizer as `max_length`.

    Returns:
        Whatever the tokenizer returns for the pair, called with truncation
        enabled and token type ids requested.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    encoded = tokenizer(
        premises,
        hypotheses,
        max_length=MAX_LENGTH,
        truncation=True,
        return_token_type_ids=True,
    )
    return encoded

## Melakukan tokenisasi data IndoNLI

In [14]:
# Tokenize every split in batches with preprocess_function_indonli.
# The raw text columns are removed from the result.
tokenized_data_indonli = data_indonli.map(
    preprocess_function_indonli,
    batched=True,
    load_from_cache_file=True,
    num_proc=1,
    remove_columns=['premise', 'hypothesis'],
    # Extra keyword arguments handed to preprocess_function_indonli.
    fn_kwargs={'tokenizer': tokenizer, 'MAX_LENGTH': MAX_LENGTH}
)



  0%|          | 0/393 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [15]:
# Serve input_ids / token_type_ids as torch tensors on `device`;
# output_all_columns=True keeps the other columns in the returned rows too.
tokenized_data_indonli.set_format("torch", columns=["input_ids", "token_type_ids"], output_all_columns=True, device=device)

In [16]:
def _take_first(split, limit):
    """Return the first `limit` rows of `split` (all rows if it has fewer).

    `select` picks the rows without materialising them, unlike slicing with
    `[:limit]`, which builds every selected row as tensors on the configured
    device before re-encoding it. `with_format(None)` hands back plain Python
    objects, matching what the slice + `Dataset.from_dict` round trip produced.
    """
    row_count = min(limit, len(split))
    return split.select(range(row_count)).with_format(None)


tokenized_data_indonli_train = _take_first(tokenized_data_indonli["train"], SAMPLE)
tokenized_data_indonli_validation = _take_first(tokenized_data_indonli["validation"], SAMPLE)

# Tahapan fine-tune IndoNLI di atas IndoBERT

## Fungsi utilitas untuk komputasi metrik

In [17]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `accuracy` metric.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels); the
            predictions are reduced to class ids with argmax over axis 1.

    Returns:
        The result of `accuracy.compute` for the predicted ids and the labels.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=gold_labels)

## Dictionary untuk mapping label

In [18]:
# Class ids must match the integer encoding applied to the `label` column when
# the data is loaded: entailment -> 0, contradiction -> 1, neutral -> 2.
# With neutral and contradiction swapped here, the saved model config would
# report the wrong class name for ids 1 and 2.
id2label = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {label_name: class_id for class_id, label_name in id2label.items()}
# Metric object used by compute_metrics and compute_accuracy through
# accuracy.compute(predictions=..., references=...).
accuracy = evaluate.load('accuracy')

## Gunakan model Sequence Classification yang sudah pre-trained

In [19]:
# Load the MODEL_NAME checkpoint as a 3-class sequence classifier; the label
# dictionaries are passed to from_pretrained together with num_labels.
model_sc = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3, 
    id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indober

In [20]:
# Move the model to the selected device.
model_sc = model_sc.to(device)

## Melakukan pengumpulan data dengan padding

In [21]:
# Padding collator built from the tokenizer; passed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Mendefinisikan argumen (dataops) untuk training nanti

In [22]:
# Fixed-width timestamp (YYYY-MM-DD_HH-MM-SS_ffffff) so every run gets its own
# results directory. strftime always emits the microsecond field, whereas
# str(datetime) omits it when the microsecond is exactly 0.
TIME_NOW = datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")
NAME = 'IndoNLI-data_translated-with_IndoLEM'
# Root directory of this run.
SC = f'./results/{NAME}-{TIME_NOW}'

# Sub-directories of the run. The trailing slash is part of each value; code
# that consumes these paths may rely on it.
CHECKPOINT_DIR = f'{SC}/checkpoint/'
MODEL_DIR = f'{SC}/model/'
OUTPUT_DIR = f'{SC}/output/'
ACCURACY_DIR = f'{SC}/accuracy/'

# Repository name handed to the Trainer as hub_model_id.
REPO_NAME = f'fine-tuned-{NAME}'

In [23]:
# Training configuration for the sequence-classification fine-tuning run.
training_args_sc = TrainingArguments(
    
    # Checkpoint: saved under CHECKPOINT_DIR once per epoch; at most EPOCH
    # checkpoints are kept.
    output_dir=CHECKPOINT_DIR,
    overwrite_output_dir=True,
    save_strategy='epoch',
    save_total_limit=EPOCH,
    
    # Log: report to TensorBoard on the first step and then every LOGGING_STEPS steps.
    report_to='tensorboard',
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=LOGGING_STEPS,
    
    # Train: optimisation settings taken from the hyperparameter cell.
    num_train_epochs=EPOCH,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    bf16=False,
    # One data-loading worker per CPU core reported by cpu_count().
    dataloader_num_workers=cpu_count(),
    
    # Miscellaneous: evaluate after every epoch and push to the Hub repository REPO_NAME.
    # NOTE(review): push_to_hub=True needs Hub credentials; no login call is
    # visible in this notebook - confirm how authentication is provided.
    evaluation_strategy='epoch',
    seed=SEED,
    push_to_hub=True,
    hub_model_id=REPO_NAME
)

## Mulai training untuk fine-tune IndoNLI di atas IndoBERT

In [24]:
# Bundle the model, the training arguments, the sub-sampled train/validation
# sets, the tokenizer, the padding collator and the accuracy callback.
trainer_sc = Trainer(
    model=model_sc,
    args=training_args_sc,
    train_dataset=tokenized_data_indonli_train,
    eval_dataset=tokenized_data_indonli_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Called with the evaluation predictions to produce the reported accuracy.
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/muhammadravi251001/fine-tuned-IndoNLI-data_translated-with_IndoLEM into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/422M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.62k/3.62k [00:00<?, ?B/s]

Download file runs/Feb23_09-11-33_muhammad-ravi-tensorrt-pod/1677143504.5574915/events.out.tfevents.1677143504…

Clean file training_args.bin:  28%|##7       | 1.00k/3.62k [00:00<?, ?B/s]

Clean file runs/Feb23_09-11-33_muhammad-ravi-tensorrt-pod/1677143504.5574915/events.out.tfevents.1677143504.mu…

Download file runs/Feb23_09-11-33_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677143504.muhammad-ravi-tens…

Clean file runs/Feb23_09-11-33_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677143504.muhammad-ravi-tensorr…

Clean file pytorch_model.bin:   0%|          | 1.00k/422M [00:00<?, ?B/s]

In [25]:
# Run the fine-tuning loop configured in training_args_sc.
trainer_sc.train()

***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 1
  Number of trainable parameters = 110560515


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5684,1.061041,0.4


***** Running Evaluation *****
  Num examples = 25
  Batch size = 8
Saving model checkpoint to ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/checkpoint-1
Configuration saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/checkpoint-1/config.json
Model weights saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/checkpoint-1/pytorch_model.bin
tokenizer config file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/checkpoint-1/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/checkpoint-1/special_tokens_map.json
tokenizer config file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20

TrainOutput(global_step=1, training_loss=0.568421483039856, metrics={'train_runtime': 52.5918, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.019, 'total_flos': 1196857714626.0, 'train_loss': 0.568421483039856, 'epoch': 1.0})

## Simpan model Sequence Classification

In [26]:
# Write the fine-tuned model to MODEL_DIR.
trainer_sc.save_model(MODEL_DIR)

Saving model checkpoint to ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/model/
Configuration saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/model/config.json
Model weights saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/model/pytorch_model.bin
tokenizer config file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/model/tokenizer_config.json
Special tokens file saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/model/special_tokens_map.json
Saving model checkpoint to ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/
Configuration saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/config.json
Model weights saved in ./results/IndoNLI-data_translated-with_IndoLEM-2023-02-23_14-11-20_189226/checkpoint/pytorch_model.bin
tokenizer config file saved in 

Upload file pytorch_model.bin:   0%|          | 32.0k/422M [00:00<?, ?B/s]

Upload file runs/Feb23_14-11-20_muhammad-ravi-tensorrt-pod/events.out.tfevents.1677161545.muhammad-ravi-tensor…

Upload file runs/Feb23_14-11-20_muhammad-ravi-tensorrt-pod/1677161545.950902/events.out.tfevents.1677161545.mu…

Upload file training_args.bin: 100%|##########| 3.62k/3.62k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/muhammadravi251001/fine-tuned-IndoNLI-data_translated-with_IndoLEM
   fe02a6c..4b68df7  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.4}]}
To https://huggingface.co/muhammadravi251001/fine-tuned-IndoNLI-data_translated-with_IndoLEM
   4b68df7..d5a5895  main -> main



# Melakukan prediksi dari model

In [27]:
# Run inference on the validation subset; compute_accuracy later reads
# .predictions and .label_ids from the returned object.
predict_result = trainer_sc.predict(tokenized_data_indonli_validation)

***** Running Prediction *****
  Num examples = 25
  Batch size = 8


In [28]:
# Persist the raw prediction output of this run.
# OUTPUT_DIR is itself the target directory, so create it directly rather than
# through os.path.dirname (which only works while the value ends with a slash),
# and let os.path.join insert the separator.
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(os.path.join(OUTPUT_DIR, 'output.txt'), "w") as f:
    # The with-block closes the file on exit; no explicit close() is needed.
    f.write(str(predict_result))

# Melakukan evaluasi dari prediksi

In [29]:
def compute_accuracy(eval_pred):
    """Score a prediction result with the module-level `accuracy` metric.

    Args:
        eval_pred: Object exposing `.predictions` (reduced to class ids with
            argmax over axis 1) and `.label_ids` (the reference labels).

    Returns:
        The result of `accuracy.compute` for the predicted ids and the labels.
    """
    scores, gold_labels = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=gold_labels)

In [30]:
# Accuracy of the predictions above, as returned by accuracy.compute.
accuracy_result = compute_accuracy(predict_result)

In [31]:
# Persist the accuracy of this run.
# ACCURACY_DIR is itself the target directory, so create it directly rather
# than through os.path.dirname (which only works while the value ends with a
# slash), and let os.path.join insert the separator.
os.makedirs(ACCURACY_DIR, exist_ok=True)
with open(os.path.join(ACCURACY_DIR, 'accuracy.txt'), "w") as f:
    # The with-block closes the file on exit; no explicit close() is needed.
    f.write(str(accuracy_result))