<a href="https://colab.research.google.com/github/nadiarvi/indo-colloquial/blob/main/finetuning_IndoBart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive. The dataset CSVs and the model
# checkpoint directory used later in this notebook live under this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# 0. INSTALLATIONS


## 0.1 Pip Install

In [None]:
# Install `accelerate`, upgrading it (-U) if an older version is present.
!pip install accelerate -U



In [None]:
# Install Hugging Face `transformers` together with its `torch` extra.
!pip install transformers[torch]



In [None]:
# Install `datasets`, `evaluate`, and `transformers` with the `sentencepiece` extra.
!pip install datasets evaluate transformers[sentencepiece]



In [None]:
# Install the IndoBenchmark toolkit; the notebook later imports
# `IndoNLGTokenizer` from the `indobenchmark` package.
!pip install indobenchmark-toolkit



In [None]:
!pip install pandas
!pip install sklearn
!pip install seaborn
!pip install matplotlib
!pip install nltk
!pip install tqdm
!pip install rouge_score
!pip install scikit-learn
!pip install sacrebleu
!pip install xlsxwriter



## 0.2 Imports

In [None]:
!git clone https://github.com/IndoNLP/indonlg.git

fatal: destination path 'indonlg' already exists and is not an empty directory.


In [None]:
# List the working directory to confirm the `indonlg` checkout is present.
!ls

drive  gdrive  indonlg	sample_data  test.json	trained.json  validation.json


In [None]:
import os, sys

# Put the cloned IndoNLG checkout on the import path so that the
# `utils.*` modules imported further down resolve to ./indonlg/utils.
sys.path.append('./indonlg')

In [None]:
import nltk
# Download NLTK's 'punkt' data package.
# NOTE(review): presumably required by the evaluation metrics — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import torch
import shutil
import random
import numpy as np
import pandas as pd
import indonlg
from torch import optim
from transformers import MBartForConditionalGeneration
from indonlg import utils


from indobenchmark import IndoNLGTokenizer
from utils.train_eval import train, evaluate
from utils.metrics import generation_metrics_fn
from utils.forward_fn import forward_generation
from utils.data_utils import MachineTranslationDataset, GenerationDataLoader

In [None]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is applied to Python's ``random`` module, NumPy's global
    generator, and PyTorch (CPU generator plus all CUDA devices).

    Args:
        seed: Integer seed value.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current device;
    # it is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

def count_param(module, trainable=False):
    """Return the total number of scalar elements in a module's parameters.

    Args:
        module: Object exposing ``parameters()``, whose items provide
            ``numel()`` and ``requires_grad``.
        trainable: When true, only parameters with ``requires_grad`` set
            are counted; otherwise every parameter is counted.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)

# Fix the random seed so that repeated runs start from the same RNG state.
set_seed(26092020)

# 1. LOAD MODEL

In [None]:
# Hub identifier shared by the pretrained weights and the tokenizer.
pretrained_name = 'indobenchmark/indobart-v2'

tokenizer = IndoNLGTokenizer.from_pretrained(pretrained_name)
bart_model = MBartForConditionalGeneration.from_pretrained(pretrained_name)

# The later cells refer to the network as `model`.
model = bart_model

# 2. PREPARE DATASETS

In [None]:
# configs and args

# --- optimisation ---
lr, gamma, step_size = 1e-4, 0.9, 1
max_norm = 10
grad_accumulate = 1
early_stop = 5

# --- model / generation ---
model_type = 'indo-bart'
beam_size = 5
max_seq_len = 512
valid_criterion = 'SacreBLEU'

# --- dataset preprocessing flags ---
lower = True
no_special_token = False
swap_source_target = False

# special-token ids handed to MachineTranslationDataset
separator_id, speaker_1_id, speaker_2_id = 4, 5, 6

# every split is batched with the same size
train_batch_size = valid_batch_size = test_batch_size = 8

# source and target use the same language tag
source_lang = target_lang = "[indonesian]"

# Look up the language-id tokens for the source and target languages, and
# make the decoder start generation from the target language token.
src_lid = tokenizer.special_tokens_to_ids[source_lang]
tgt_lid = tokenizer.special_tokens_to_ids[target_lang]

model.config.decoder_start_token_id = tgt_lid

# Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

# Directory that receives checkpoints and result files.
# exist_ok=True already tolerates an existing directory, so no separate
# existence check is needed.
model_dir = '/content/gdrive/MyDrive/KAIST (School)/4.5 Individual Study/save2'
os.makedirs(model_dir, exist_ok=True)

device = 'cuda0'
# set a specific cuda device: the digits after "cuda" select the GPU index
if "cuda" in device:
    torch.cuda.set_device(int(device[4:]))
    device = "cuda"
    model = model.cuda()

# Build the optimizer only after the model has been moved to its final
# device, so it is guaranteed to hold the parameters that are trained.
optimizer = optim.Adam(model.parameters(), lr=lr)

### prepare dataset

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Folder on the mounted Drive that holds the three CSV splits.
root = '/content/gdrive/MyDrive/Colab Notebooks/Indonesian Colloquial Language/'

# Read the train / dev / test splits in one pass.
train_df, validation_df, test_df = (
    pd.read_csv(root + csv_name)
    for csv_name in ('train.csv', 'dev.csv', 'test.csv')
)

In [None]:
# Keep only the colloquial form ("transformed") and its formal
# counterpart ("original-for") in every split.
kept_columns = ["transformed", "original-for"]
train_df, validation_df, test_df = (
    train_df[kept_columns],
    validation_df[kept_columns],
    test_df[kept_columns],
)

In [None]:
# Turn each frame's index into a column and rename everything so that all
# splits end up with the columns ['id', 'text', 'label'].
column_names = {'index': 'id', 'transformed': 'text', 'original-for': 'label'}
dataframes = [train_df, validation_df, test_df]

for i, df in enumerate(dataframes):
  df = df.reset_index().rename(columns=column_names)
  print(df.head())
  dataframes[i] = df

   id   text   label
0   0  sampe  sampai
1   1    dgn  dengan
2   2    org   orang
3   3  karna  karena
4   4     tu     itu
   id    text              label
0   0      op           operator
1   1     egp  emang gue pikirin
2   2    ojol        ojek online
3   3     kel          kelurahan
4   4  satker       satuan kerja
   id    text                 label
0   0  goblog                goblok
1   1  diklat      pendidikan kilat
2   2    50rb                 50000
3   3   busui          ibu menyusui
4   4   2 kmh  2 kilometer per hour


In [None]:
# Bind the renamed frames back to their split names (train, validation, test).
train_df, validation_df, test_df = dataframes[0], dataframes[1], dataframes[2]

In [None]:
# Display the renamed training frame as a sanity check.
train_df

Unnamed: 0,id,text,label
0,0,sampe,sampai
1,1,dgn,dengan
2,2,org,orang
3,3,karna,karena
4,4,tu,itu
...,...,...,...
1631,1631,kyak,kayak
1632,1632,jadul,jaman dulu
1633,1633,slamet,selamat
1634,1634,bgni,begini


In [None]:
# Serialise every split as a JSON array of row objects (orient='records').
train_json, validation_json, test_json = (
    frame.to_json(orient='records')
    for frame in (train_df, validation_df, test_df)
)

In [None]:
# Write the serialised training split to the working directory.
with open('trained.json', mode='w') as out_file:
  out_file.write(train_json)
print("saved")

saved


In [None]:
# Write the serialised validation split to the working directory.
with open('validation.json', mode='w') as out_file:
  out_file.write(validation_json)
print("saved")

saved


In [None]:
# Write the serialised test split to the working directory.
with open('test.json', mode='w') as out_file:
  out_file.write(test_json)
print("saved")

saved


### access dataset

In [None]:
# Absolute locations of the JSON splits.
# NOTE(review): the files were written with relative names, so this assumes
# the notebook's working directory is /content — confirm.
train_dataset_path, valid_dataset_path, test_dataset_path = (
    '/content/trained.json',
    '/content/validation.json',
    '/content/test.json',
)

In [None]:
# Options that are identical for the train, validation and test datasets.
dataset_kwargs = dict(
    lowercase=lower,
    no_special_token=no_special_token,
    speaker_1_id=speaker_1_id,
    speaker_2_id=speaker_2_id,
    separator_id=separator_id,
    max_token_length=max_seq_len,
    swap_source_target=swap_source_target,
)

train_dataset = MachineTranslationDataset(train_dataset_path, tokenizer, **dataset_kwargs)
valid_dataset = MachineTranslationDataset(valid_dataset_path, tokenizer, **dataset_kwargs)
test_dataset = MachineTranslationDataset(test_dataset_path, tokenizer, **dataset_kwargs)

In [None]:
# Options shared by all three loaders; only the dataset, the batch size and
# the shuffle flag differ between splits.
loader_kwargs = dict(
    model_type=model_type,
    tokenizer=tokenizer,
    max_seq_len=max_seq_len,
    src_lid_token_id=src_lid,
    tgt_lid_token_id=tgt_lid,
    num_workers=8,
)

# Only the training split is shuffled.
train_loader = GenerationDataLoader(dataset=train_dataset, batch_size=train_batch_size,
                                    shuffle=True, **loader_kwargs)
valid_loader = GenerationDataLoader(dataset=valid_dataset, batch_size=valid_batch_size,
                                    shuffle=False, **loader_kwargs)
test_loader = GenerationDataLoader(dataset=test_dataset, batch_size=test_batch_size,
                                   shuffle=False, **loader_kwargs)

In [None]:
# Peek at the first training example.
# NOTE(review): the printed tuple looks like (id, input token ids, label
# token ids) — confirm against MachineTranslationDataset.
train_dataset[0]

(0, [810], [3369])

# 3. TRAIN MODEL

In [None]:
# Train

n_epochs = 10

# Everything except the model itself is collected here and forwarded to
# train() as keyword arguments.
train_options = dict(
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    valid_criterion=valid_criterion,
    tokenizer=tokenizer,
    n_epochs=n_epochs,
    evaluate_every=1,
    early_stop=early_stop,
    grad_accum=grad_accumulate,
    step_size=step_size,
    gamma=gamma,
    max_norm=max_norm,
    model_type=model_type,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    model_dir=model_dir,
    exp_id=0,
    fp16="",
    device=device,
)

train(model, **train_options)

(Epoch 1) TRAIN LOSS:2.0132 LR:0.00010000: 100%|██████████| 205/205 [00:23<00:00,  8.84it/s]


(Epoch 1) TRAIN LOSS:2.0132 BLEU:74.49 SacreBLEU:76.08 ROUGE1:69.43 ROUGE2:53.73 ROUGEL:69.40 ROUGELsum:69.42 LR:0.00010000


VALID LOSS:2.9507: 100%|██████████| 23/23 [00:00<00:00, 25.30it/s]


(Epoch 1) VALID LOSS:2.9507 BLEU:0.00 SacreBLEU:0.00 ROUGE1:12.25 ROUGE2:2.21 ROUGEL:12.43 ROUGELsum:12.43


(Epoch 2) TRAIN LOSS:0.7623 LR:0.00009000: 100%|██████████| 205/205 [00:24<00:00,  8.32it/s]


(Epoch 2) TRAIN LOSS:0.7623 BLEU:84.84 SacreBLEU:85.99 ROUGE1:82.51 ROUGE2:71.77 ROUGEL:82.55 ROUGELsum:82.53 LR:0.00009000


VALID LOSS:3.4602: 100%|██████████| 23/23 [00:00<00:00, 26.20it/s]


(Epoch 2) VALID LOSS:3.4602 BLEU:0.00 SacreBLEU:0.00 ROUGE1:7.73 ROUGE2:1.38 ROUGEL:7.73 ROUGELsum:7.83
count stop: 1


(Epoch 3) TRAIN LOSS:0.2952 LR:0.00008100: 100%|██████████| 205/205 [00:22<00:00,  9.06it/s]


(Epoch 3) TRAIN LOSS:0.2952 BLEU:92.70 SacreBLEU:93.31 ROUGE1:92.48 ROUGE2:86.92 ROUGEL:92.48 ROUGELsum:92.45 LR:0.00008100


VALID LOSS:3.6156: 100%|██████████| 23/23 [00:01<00:00, 14.92it/s]


(Epoch 3) VALID LOSS:3.6156 BLEU:0.00 SacreBLEU:0.00 ROUGE1:8.84 ROUGE2:1.66 ROUGEL:9.12 ROUGELsum:9.12
count stop: 2


(Epoch 4) TRAIN LOSS:0.1881 LR:0.00007290: 100%|██████████| 205/205 [00:22<00:00,  8.99it/s]


(Epoch 4) TRAIN LOSS:0.1881 BLEU:95.30 SacreBLEU:95.73 ROUGE1:95.30 ROUGE2:91.87 ROUGEL:95.27 ROUGELsum:95.28 LR:0.00007290


VALID LOSS:3.5105: 100%|██████████| 23/23 [00:00<00:00, 25.74it/s]


(Epoch 4) VALID LOSS:3.5105 BLEU:0.00 SacreBLEU:0.00 ROUGE1:10.13 ROUGE2:0.83 ROUGEL:10.04 ROUGELsum:10.13
count stop: 3


(Epoch 5) TRAIN LOSS:0.1214 LR:0.00006561: 100%|██████████| 205/205 [00:23<00:00,  8.77it/s]


(Epoch 5) TRAIN LOSS:0.1214 BLEU:96.98 SacreBLEU:97.29 ROUGE1:96.88 ROUGE2:94.53 ROUGEL:96.88 ROUGELsum:96.88 LR:0.00006561


VALID LOSS:3.8195: 100%|██████████| 23/23 [00:00<00:00, 25.46it/s]


(Epoch 5) VALID LOSS:3.8195 BLEU:0.00 SacreBLEU:0.00 ROUGE1:10.22 ROUGE2:2.21 ROUGEL:10.41 ROUGELsum:10.22
count stop: 4


(Epoch 6) TRAIN LOSS:0.0866 LR:0.00005905: 100%|██████████| 205/205 [00:22<00:00,  9.05it/s]


(Epoch 6) TRAIN LOSS:0.0866 BLEU:98.03 SacreBLEU:98.27 ROUGE1:97.86 ROUGE2:96.42 ROUGEL:97.86 ROUGELsum:97.84 LR:0.00005905


VALID LOSS:3.6240: 100%|██████████| 23/23 [00:01<00:00, 16.50it/s]


(Epoch 6) VALID LOSS:3.6240 BLEU:0.00 SacreBLEU:0.00 ROUGE1:11.23 ROUGE2:2.49 ROUGEL:11.33 ROUGELsum:11.14
count stop: 5


# 4. TEST MODEL

In [None]:
# Restore the weights stored in the checkpoint directory.
# NOTE(review): "best_model_0.th" is presumably the file train() wrote for
# exp_id=0 — confirm.
best_checkpoint = model_dir + "/best_model_0.th"
best_state = torch.load(best_checkpoint)
model.load_state_dict(best_state)

<All keys matched successfully>

In [None]:
# Run the model over the test loader. The result is unpacked into the loss,
# the metric values, the generated hypotheses and the reference labels.
eval_outputs = evaluate(
    model,
    data_loader=test_loader,
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    model_type=model_type,
    tokenizer=tokenizer,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    is_test=True,
    device='cuda',
)
test_loss, test_metrics, test_hyp, test_label = eval_outputs

TESTING... : 100%|██████████| 24/24 [00:05<00:00,  4.29it/s]


In [None]:
# One evaluation run: a single metrics record and a single frame pairing
# each generated hypothesis with its reference label.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)

# describe() is computed once and reused for both printing and saving.
# With a single record its std row is NaN.
metric_summary = metric_df.describe()

print('== Prediction Result ==')
print(result_df.head())
print()

print('== Model Performance ==')
print(metric_summary)

# Persist predictions and the metric summary next to the checkpoints.
result_df.to_csv(model_dir + "/prediction_result.csv")
metric_summary.to_csv(model_dir + "/evaluation_result.csv")

== Prediction Result ==
                               hyp                  label
0                     mengumpulkan                 goblok
1                      tugas tugas       pendidikan kilat
2                          50 ribu                  50000
3                       dikirimkan           ibu menyusui
4   2 kilometer per hour hour hour   2 kilometer per hour

== Model Performance ==
            BLEU  SacreBLEU     ROUGE1    ROUGE2     ROUGEL  ROUGELsum
count   1.000000   1.000000   1.000000  1.000000   1.000000      1.000
mean   13.385789  13.385789  12.065972  2.473958  11.918403     11.875
std          NaN        NaN        NaN       NaN        NaN        NaN
min    13.385789  13.385789  12.065972  2.473958  11.918403     11.875
25%    13.385789  13.385789  12.065972  2.473958  11.918403     11.875
50%    13.385789  13.385789  12.065972  2.473958  11.918403     11.875
75%    13.385789  13.385789  12.065972  2.473958  11.918403     11.875
max    13.385789  13.385789  12.065

In [None]:
# Display the hypothesis / label pairs produced on the test split.
result_df

Unnamed: 0,hyp,label
0,mengumpulkan,goblok
1,tugas tugas,pendidikan kilat
2,50 ribu,50000
3,dikirimkan,ibu menyusui
4,2 kilometer per hour hour hour,2 kilometer per hour
...,...,...
187,edap,sepeda
188,mengumpulkan,motor
189,asa,biasa
190,mengumpulkan,tidur
