# Menjalankan QA Tanpa Intermediate Task - Transfer Learning

# Import semua module

In [1]:
#!pip install datasets
#!pip install transformers
#!pip install tensorboard
#!pip install evaluate
#!pip install git+https://github.com/IndoNLP/nusa-crowd.git@release_exp

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [3]:
!pip install -r requirements.txt

Collecting nusacrowd@ git+https://github.com/IndoNLP/nusa-crowd.git@7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Cloning https://github.com/IndoNLP/nusa-crowd.git (to revision 7748513d20331e72f9969f94f5d43c7f2d4a59a5) to /tmp/pip-install-2ik76knw/nusacrowd_643ccb72285a4fbc95582ad319afaa6c
  Running command git clone --filter=blob:none -q https://github.com/IndoNLP/nusa-crowd.git /tmp/pip-install-2ik76knw/nusacrowd_643ccb72285a4fbc95582ad319afaa6c
  Running command git rev-parse -q --verify 'sha^7748513d20331e72f9969f94f5d43c7f2d4a59a5'
  Running command git fetch -q https://github.com/IndoNLP/nusa-crowd.git 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Running command git checkout -q 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Resolved https://github.com/IndoNLP/nusa-crowd.git to commit 7748513d20331e72f9969f94f5d43c7f2d4a59a5
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
# Melihat GPU yang tersedia dan penggunaannya.
!nvidia-smi

Sun Mar 12 12:46:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   39C    P0    58W / 300W |   7556MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   45C    P0    74W / 300W |  31238MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [5]:
# Memilih GPU yang akan digunakan (contohnya: GPU #7)
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [6]:
import transformers
import evaluate
import torch
import operator
import ast
import json
import re
import sys

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from torch.utils.data import DataLoader
from datetime import datetime
from huggingface_hub import notebook_login
from tqdm import tqdm

from datasets import (
    load_dataset, 
    load_from_disk,
    Dataset,
    DatasetDict
)
from transformers import (
    BigBirdTokenizerFast,
    BigBirdForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BertForSequenceClassification,
    BertForQuestionAnswering,
    AutoModel, 
    BertTokenizerFast,
    AutoTokenizer, 
    AutoModel, 
    BertTokenizer, 
    BertForPreTraining,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    EvalPrediction,
)

# Definisikan hyperparameter

In [7]:
MODEL_NAME = "indolem/indobert-base-uncased"
#MODEL_NAME = "afaji/fine-tuned-IndoNLI-Translated-with-indobert-base-uncased"
SEED = 42
EPOCH = 4
BATCH_SIZE = 32
GRADIENT_ACCUMULATION = 4
LEARNING_RATE = 1e-5
MAX_LENGTH = 400
STRIDE = 100
LOGGING_STEPS = 50
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
# Untuk mempercepat training, saya ubah SAMPLE menjadi 100.
# Bila mau menggunakan keseluruhan data, gunakan: 
# SAMPLE = sys.maxsize
SAMPLE = 10

# Import dataset QAS

In [8]:
!pip install git+https://github.com/IndoNLP/nusa-crowd.git

Collecting git+https://github.com/IndoNLP/nusa-crowd.git
  Cloning https://github.com/IndoNLP/nusa-crowd.git to /tmp/pip-req-build-h0jravka
  Running command git clone --filter=blob:none -q https://github.com/IndoNLP/nusa-crowd.git /tmp/pip-req-build-h0jravka
  Resolved https://github.com/IndoNLP/nusa-crowd.git to commit bea0aedb653c65d2dbc65a5f7b6950bd2cad274d
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Definisikan tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Definisikan fungsi pre-processnya

In [11]:
conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'tydiqa_id' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
df_validation = pd.DataFrame(data_qas_id['validation'])

cols = ['context', 'question', 'answer']
new_df_train = pd.DataFrame(columns=cols)

for i in range(len(df_train['context'])):
    answer_start = df_train['context'][i].index(df_train['label'][i])
    answer_end = answer_start + len(df_train['label'][i])
    new_df_train = new_df_train.append({'context': df_train["context"][i], 
                                        'question': df_train["question"][i], 
                                        'answer': {"text": df_train["label"][i], 
                                                   "answer_start": answer_start, 
                                                   "answer_end": answer_end}}, 
                                                   ignore_index=True)

cols = ['context', 'question', 'answer']
new_df_val = pd.DataFrame(columns=cols)    

for i in range(len(df_validation['context'])):
    answer_start = df_validation['context'][i].index(df_validation['label'][i])
    answer_end = answer_start + len(df_validation['label'][i])
    new_df_val = new_df_val.append({'context': df_validation["context"][i], 
                                    'question': df_validation["question"][i], 
                                    'answer': {"text": df_validation["label"][i], 
                                               "answer_start": answer_start, 
                                               "answer_end": answer_end}}, 
                                               ignore_index=True)    

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)

data_qas_id = DatasetDict({"train": train_dataset, "validation": validation_dataset})



  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
def rindex(lst, value, operator=operator):
      return len(lst) - operator.indexOf(reversed(lst), value) - 1

def preprocess_function_qa(examples, tokenizer, MAX_LENGTH=MAX_LENGTH, STRIDE=STRIDE, rindex=rindex, operator=operator):
    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = [c.lstrip() for c in examples["context"]]

    tokenized_examples = tokenizer(
      examples['question'],
      examples['context'],
      truncation=True,
      max_length = MAX_LENGTH,
      stride=STRIDE,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length",
      return_tensors='np'
    )

    tokenized_examples['start_positions'] = []
    tokenized_examples['end_positions'] = []

    for seq_idx in range(len(tokenized_examples['input_ids'])):
        seq_ids = tokenized_examples.sequence_ids(seq_idx)
        offset_mappings = tokenized_examples['offset_mapping'][seq_idx]

        cur_example_idx = tokenized_examples['overflow_to_sample_mapping'][seq_idx]

        #answer = examples['answer'][seq_idx][0]
        answer = examples['answer'][cur_example_idx]
        answer = eval(str(answer))
        #answer_text = answer['text'][0]
        answer_start = answer['answer_start']
        #answer_end = answer_start + len(answer_text)
        answer_end = answer['answer_end']

        context_pos_start = seq_ids.index(1)
        context_pos_end = rindex(seq_ids, 1, operator)

        s = e = 0
        if (offset_mappings[context_pos_start][0] <= answer_start and
            offset_mappings[context_pos_end][1] >= answer_end):
          i = context_pos_start
          while offset_mappings[i][0] < answer_start:
            i += 1
          if offset_mappings[i][0] == answer_start:
            s = i
          else:
            s = i - 1

          j = context_pos_end
          while offset_mappings[j][1] > answer_end:
            j -= 1      
          if offset_mappings[j][1] == answer_end:
            e = j
          else:
            e = j + 1

        tokenized_examples['start_positions'].append(s)
        tokenized_examples['end_positions'].append(e)
    return tokenized_examples

# Mulai tokenisasi dan pre-process

In [13]:
tokenized_data_qas_id = data_qas_id.map(
    preprocess_function_qa,
    batched=True,
    remove_columns=data_qas_id['train'].column_names,
    num_proc=1,
    fn_kwargs={'tokenizer': tokenizer, 'MAX_LENGTH': MAX_LENGTH, 'STRIDE': STRIDE, 'rindex': rindex, 'operator': operator}
)



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
tokenized_data_qas_id = tokenized_data_qas_id.remove_columns(["offset_mapping", 
                                            "overflow_to_sample_mapping"])

In [15]:
tokenized_data_qas_id.set_format("torch", columns=["input_ids", "token_type_ids"], output_all_columns=True, device=device)

In [16]:
tokenized_data_qas_id_train = Dataset.from_dict(tokenized_data_qas_id["train"][:SAMPLE])
tokenized_data_qas_id_validation = Dataset.from_dict(tokenized_data_qas_id["validation"][:SAMPLE])

# Mendefinisikan argumen (dataops) untuk training nanti

In [17]:
TIME_NOW = str(datetime.now()).replace(":", "-").replace(" ", "_").replace(".", "_")
QA = './results/alur2-idk-mrc'
CHECKPOINT_DIR = f'{QA}-{TIME_NOW}/checkpoint/'
MODEL_DIR = f'{QA}-{TIME_NOW}/model/'
OUTPUT_DIR = f'{QA}-{TIME_NOW}/output/'
ACCURACY_DIR = f'{QA}-{TIME_NOW}/accuracy/'

# Mendefinisikan Training Arguments untuk train

In [18]:
training_args_qa = TrainingArguments(
    
    # Checkpoint
    output_dir=CHECKPOINT_DIR,
    save_strategy='epoch',
    save_total_limit=EPOCH,
    
    # Log
    report_to='tensorboard',
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=LOGGING_STEPS,
    
    # Train
    num_train_epochs=EPOCH,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    bf16=False,
    dataloader_num_workers=cpu_count(),
    
    # Miscellaneous
    evaluation_strategy='epoch',
    seed=SEED,
)

# Pendefinisian model Question Answering

In [19]:
model_qa = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at indolem/indobert-base-uncased and a

In [20]:
model_qa = model_qa.to(device)

# Melakukan pengumpulan data dengan padding

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Mulai training untuk fine-tune SQUAD diatas IndoBERT

In [22]:
import string
import collections

# # Melakukan evaluasi dari prediksi
def normalize_text(s):
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)
    def white_space_fix(text):
        return " ".join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_f1_prec_rec(pred, gold):
    pred_tokens = normalize_text(pred).split() # True positive + False positive = Untuk precision
    gold_tokens = normalize_text(gold).split() # True positive + False negatives = Untuk recall
    common = collections.Counter(pred_tokens) & collections.Counter(gold_tokens)
    num_same = sum(common.values()) # True positive

    if len(gold_tokens) == 0 or len(pred_tokens) == 0: 
        var = int(gold_tokens == pred_tokens)
        return var, var, var

    if num_same == 0:
        return 0, 0, 0

    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(gold_tokens)
    f1 = (2.0 * precision * recall) / (precision + recall)

    return f1, precision, recall

def compute_metrics(predict_result):
    predictions_idx = np.argmax(predict_result.predictions, axis=2)
    denominator = len(predictions_idx[0])
    label_array = np.asarray(predict_result.label_ids)
    total_correct = 0
    f1_array = []
    precision_array = []
    recall_array = []

    for i in range(len(predict_result.predictions[0])):
        start_pred_idx = predictions_idx[0][i]
        end_pred_idx = predictions_idx[1][i] + 1
        start_gold_idx = label_array[0][i]
        end_gold_idx = label_array[1][i] + 1

        pred_text = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                    [start_pred_idx: end_pred_idx])
        gold_text = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                    [start_gold_idx: end_gold_idx])

        if pred_text == gold_text:
            total_correct += 1

        f1, precision, recall = compute_f1_prec_rec(pred=pred_text, gold=gold_text)

        f1_array.append(f1)
        precision_array.append(precision)
        recall_array.append(recall)

    exact_match = ((total_correct / denominator) * 100.0)
    final_f1 = np.mean(f1_array) * 100.0
    final_precision = np.mean(precision_array) * 100.0
    final_recall = np.mean(recall_array) * 100.0

    return {'exact_match': exact_match, 'f1': final_f1, 'precision': final_precision, 'recall': final_recall}

In [23]:
trainer_qa = Trainer(
    model=model_qa.to(device),
    args=training_args_qa,
    train_dataset=tokenized_data_qas_id_train,
    eval_dataset=tokenized_data_qas_id_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [24]:
trainer_qa.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 4
  Number of trainable parameters = 109969154


Epoch,Training Loss,Validation Loss,Exact Match,F1,Precision,Recall
1,1.4979,5.906729,0.0,1.374825,0.706083,37.0
2,1.4979,5.827014,0.0,2.702447,1.449239,28.666667
3,1.4979,5.772928,0.0,2.661411,1.429674,26.666667
4,1.4979,5.745949,0.0,2.661411,1.429674,26.666667


***** Running Evaluation *****
  Num examples = 10
  Batch size = 8
Saving model checkpoint to ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-1
Configuration saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-1/config.json
Model weights saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-1/pytorch_model.bin
tokenizer config file saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-1/tokenizer_config.json
Special tokens file saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-1/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8
Saving model checkpoint to ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-2
Configuration saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/checkpoint/checkpoint-2/config.json
Model weights saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_3

TrainOutput(global_step=4, training_loss=1.4759136140346527, metrics={'train_runtime': 80.1311, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.05, 'total_flos': 8165523648000.0, 'train_loss': 1.4759136140346527, 'epoch': 4.0})

# Menyimpan model Question Answering

In [25]:
trainer_qa.save_model(MODEL_DIR)

Saving model checkpoint to ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/model/
Configuration saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/model/config.json
Model weights saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/model/pytorch_model.bin
tokenizer config file saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/model/tokenizer_config.json
Special tokens file saved in ./results/alur2-idk-mrc-2023-03-12_12-47-19_390553/model/special_tokens_map.json


# Melakukan prediksi dari model

In [26]:
predict_result = trainer_qa.predict(tokenized_data_qas_id_validation)
predict_result

***** Running Prediction *****
  Num examples = 10
  Batch size = 8


PredictionOutput(predictions=(array([[-0.33184677,  0.07781378,  0.2636203 , ...,  0.10889205,
         0.42579746, -0.06319544],
       [ 0.4163932 ,  0.2082016 ,  0.25819936, ...,  0.01471319,
        -0.37595367, -0.5726742 ],
       [-0.04473067,  0.26464334,  0.73239183, ...,  0.26935154,
        -0.09526553, -0.04435409],
       ...,
       [ 0.29223436,  0.06003963,  0.3735975 , ..., -0.65672666,
        -0.55797344, -0.58590394],
       [-0.37611818,  0.39521438,  0.47991115, ..., -0.528477  ,
        -0.2068925 , -1.396406  ],
       [ 0.06061195,  0.48033774,  0.6855728 , ..., -0.42426658,
        -0.21605156, -0.7319156 ]], dtype=float32), array([[ 0.23784003,  0.06880783,  0.09853338, ...,  0.11003547,
         0.10829221,  0.431145  ],
       [-0.4383464 , -0.20277022, -0.08410891, ..., -0.40091342,
        -0.39779922, -0.18528306],
       [-0.39130634, -0.46281058,  0.05561218, ..., -0.61256135,
        -0.898312  , -0.82619417],
       ...,
       [-0.01081342,  0.40049

In [27]:
os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True)
with open(f'{OUTPUT_DIR}/output.txt', "w") as f:
  f.write(str(predict_result))
  f.close()

# Melakukan evaluasi dari prediksi

In [28]:
metric_result_before_filtering = compute_metrics(predict_result)
metric_result_before_filtering

{'exact_match': 0.0,
 'f1': 2.6614113018379757,
 'precision': 1.4296741925116505,
 'recall': 26.666666666666668}

In [29]:
# TODO: ubah direktori

os.makedirs(os.path.dirname(ACCURACY_DIR), exist_ok=True)
with open(f'{ACCURACY_DIR}/accuracy.txt', "w") as f:
  f.write(str(metric_result_before_filtering))
  f.close()

## Coba Alur 2

In [30]:
from tqdm import tqdm

def create_qas_dataframe(predict_result=predict_result, index_largest=1):
    predictions_idx = np.argsort(predict_result.predictions, axis=2)[:, :, index_largest * -1]
    #predictions_idx = np.argmax(predict_result.predictions, axis=2)
    label_array = np.asarray(predict_result.label_ids)
    question_decoded = []
    context_decoded = []
    pred_answer_decoded = []
    gold_answer_decoded = []
    
    for i in tqdm(range(len(predict_result.predictions[0]))):
        start_pred_idx = predictions_idx[0][i]
        end_pred_idx = predictions_idx[1][i] + 1
        pred_answer = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                       [start_pred_idx: end_pred_idx], skip_special_tokens=True)
        pred_answer_decoded.append(pred_answer)
        
        start_gold_idx = label_array[0][i]
        end_gold_idx = label_array[1][i] + 1
        gold_answer = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                       [start_gold_idx: end_gold_idx], skip_special_tokens=True)
        gold_answer_decoded.append(gold_answer)
        
        question = []
        context = []
         
        for j in range(len(tokenized_data_qas_id_validation[i]['token_type_ids'])):
            if tokenized_data_qas_id_validation[i]['token_type_ids'][j] == 0:
                question.append(tokenized_data_qas_id_validation[i]['input_ids'][j])
            else:
                context.append(tokenized_data_qas_id_validation[i]['input_ids'][j])

        question_decoded.append(tokenizer.decode(question, skip_special_tokens=True))
        context_decoded.append(tokenizer.decode(context, skip_special_tokens=True))
    
    qas_df = pd.DataFrame({'Context': context_decoded, 
                           'Question': question_decoded, 
                           'Prediction Answer': pred_answer_decoded,
                          'Gold Answer': gold_answer_decoded})
                      
    return qas_df

In [31]:
qas_df = create_qas_dataframe(predict_result)
qas_df

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.67it/s]


Unnamed: 0,Context,Question,Prediction Answer,Gold Answer
0,kolumbus bukanlah orang pertama yang tiba di a...,siapakah yang menemuka benua amerika?,amerika oleh berbagai orang sepanjang masa itu.,orang - orang viking dari eropa utara
1,kabupaten donggala ( english : donggala regenc...,dimanakah letak donggala?,administrasi terletak di kota donggala. kabupa...,"provinsi sulawesi tengah, indonesia"
2,awal mula teknik industri dapat ditelusuri dar...,siapa bapak teknik industri?,,frederick winslow taylor
3,penghulu rasyid ( lahir di desa telaga itar ta...,kapan penghulu rasyid meninggal?,,15 desember 1861
4,samudra pasifik atau lautan teduh ( dari bahas...,seberapa luas kah samudera pasifik?,,"179, 7 juta km²"
5,"geisha ( bahasa jepang : 者 "" seniman "" ) adala...",apakah yang dimaksud denga geisha?,"19, dan masih ada sampai sekarang ini, walaupu...",seniman - penghibur ( entertainer ) tradisiona...
6,pada periode 2000 - an bca memperkuat dan meng...,kapan bank bca mengeluarkan kartu debit?,,2000 - an
7,"general motors company ( nyse : ), juga dikena...",dimana kantor pusat general motors?,,"renaissance center di detroit, michigan, ameri..."
8,"selanjutnya, berdasarkan ketentuan dalam undan...",berapa luas kota blitar?,berdasarkan undang - undang no. 18 / 1965. ber...,"32, 58 km²"
9,"harumichi bouya ( 春 道, harumichi boya ), atau ...",siapa yang menciptakan serial manga crows?,"crows? harumichi bouya ( 春 道, harumichi boya )...",hiroshi takahashi


In [32]:
question_mark = ['siapa', 'siapakah',
                    'apa', 'apakah', 'adakah',
                    'dimana', 'dimanakah', 'darimanakah',
                    'kapan', 'kapankah',
                    'bagaimana', 'bagaimanakah',
                    'kenapa', 'mengapa',
                    'berapa', 'berapakah', 'seberapa']

In [33]:
hipo = "kapankah musik hip hop kah pertama kali muncul?"

for i in hipo.split():
    if (i == "kah") and (i not in question_mark):
        print(i)

kah


In [34]:
def create_nli_dataframe(df, type='replace first'):
    
    nli_df = pd.DataFrame()
    
    question_mark = ['siapa', 'siapakah',
                    'apa', 'apakah', 'adakah',
                    'dimana', 'dimanakah', 'darimanakah',
                    'kapan', 'kapankah',
                    'bagaimana', 'bagaimanakah',
                    'kenapa', 'mengapa',
                    'berapa', 'berapakah', 'seberapa']
    
    if type == 'rule based':
        for i in tqdm(range(df.shape[0])):
            premise = df['Context'][i]
            hypothesis = df['Question'][i]
            
            hypothesis = hypothesis.replace('kah', '')
            
            for j in hypothesis.split():
                if j in question_mark:
                    if j == 'siapa' or j == 'siapakah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} adalah {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} adalah {df['Gold Answer'][i]}"
                    
                    elif j == 'apa' or j == 'apakah' or j == 'adakah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} adalah {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} adalah {df['Gold Answer'][i]}"
                        
                    elif j == 'dimana' or j == 'dimanakah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} di {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} di {df['Gold Answer'][i]}"
                    
                    elif j == 'darimanakah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} dari {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} dari {df['Gold Answer'][i]}"
                    
                    elif j == 'kapan' or j == 'kapankah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} pada {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} pada {df['Gold Answer'][i]}"

                    elif j == 'bagaimana' or j == 'bagaimanakah':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} adalah {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} adalah {df['Gold Answer'][i]}"
                    
                    elif j == 'kenapa' or j == 'mengapa':
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, 'alasan')
                        pred_hypothesis = f"{pred_hypothesis} adalah karena {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, 'alasan')
                        gold_hypothesis = f"{gold_hypothesis} adalah karena {df['Gold Answer'][i]}"
                    
                    elif j == 'berapa' or j == 'berapakah' or j == 'seberapa': 
                        pred_hypothesis = hypothesis.replace('?', '')
                        pred_hypothesis = pred_hypothesis.replace(j, '')
                        pred_hypothesis = f"{pred_hypothesis} adalah {df['Prediction Answer'][i]}"

                        gold_hypothesis = hypothesis.replace('?', '')
                        gold_hypothesis = gold_hypothesis.replace(j, '')
                        gold_hypothesis = f"{gold_hypothesis} adalah {df['Gold Answer'][i]}"
                    
            nli_df = nli_df.append({'Premise': premise,
                           'Prediction Hypothesis': pred_hypothesis,
                            'Gold Hypothesis': gold_hypothesis
                           }, ignore_index=True)

    elif type == 'replace first':
        for i in tqdm(range(df.shape[0])):
            premise = df['Context'][i]
            hypothesis = df['Question'][i]
            pred_hypothesis = hypothesis.replace('?', '')
            pred_hypothesis = pred_hypothesis.replace(hypothesis.split()[0], 
                                            df['Prediction Answer'][i])

            gold_hypothesis = hypothesis.replace('?', '')
            gold_hypothesis = gold_hypothesis.replace(hypothesis.split()[0], 
                                            df['Gold Answer'][i])
            
            nli_df = nli_df.append({'Premise': premise,
                               'Prediction Hypothesis': pred_hypothesis,
                                'Gold Hypothesis': gold_hypothesis
                               }, ignore_index=True)
    
    elif type == 'replace question mark':
        for i in tqdm(range(df.shape[0])):
            premise = df['Context'][i]
            hypothesis = df['Question'][i]
            for j in hypothesis.split():
                if j in question_mark:
                    pred_hypothesis = hypothesis.replace('?', '')
                    pred_hypothesis = pred_hypothesis.replace(j, df['Prediction Answer'][i])
                    
                    gold_hypothesis = hypothesis.replace('?', '')
                    gold_hypothesis = gold_hypothesis.replace(j, df['Gold Answer'][i])
                
            nli_df = nli_df.append({'Premise': premise,
                                   'Prediction Hypothesis': pred_hypothesis,
                                    'Gold Hypothesis': gold_hypothesis
                                   }, ignore_index=True)
    
    elif type == 'machine generation': pass # TODO
    
    elif type == 'add adalah': 
        for i in tqdm(range(df.shape[0])):
            premise = df['Context'][i]
            hypothesis = df['Question'][i]
            pred_hypothesis = hypothesis.replace('?', '')
            pred_hypothesis = pred_hypothesis.replace(hypothesis.split()[0], '')
            pred_hypothesis = f"{pred_hypothesis} adalah {df['Prediction Answer'][i]}"

            gold_hypothesis = hypothesis.replace('?', '')
            gold_hypothesis = gold_hypothesis.replace(hypothesis.split()[0], '')
            gold_hypothesis = f"{gold_hypothesis} adalah {df['Gold Answer'][i]}"
            
            nli_df = nli_df.append({'Premise': premise,
                               'Prediction Hypothesis': pred_hypothesis,
                                'Gold Hypothesis': gold_hypothesis
                               }, ignore_index=True)
    
    elif type == 'just concat answer and question':
        for i in tqdm(range(df.shape[0])):
            premise = df['Context'][i]
            hypothesis = df['Question'][i]

            pred_hypothesis = f"{hypothesis} {df['Prediction Answer'][i]}"         
            gold_hypothesis = f"{hypothesis} {df['Gold Answer'][i]}"
                
            nli_df = nli_df.append({'Premise': premise,
                                   'Prediction Hypothesis': pred_hypothesis,
                                    'Gold Hypothesis': gold_hypothesis
                                   }, ignore_index=True)
    '''
    kata_tanya = []
    for i in tqdm(range(df.shape[0])): 
        hypothesis = df['Question'][i]
        kata_tanya.append(hypothesis.split()[0])
    unik = set(np.unique(kata_tanya))
    temp3 = [x for x in unik if x not in question_mark]
    print(temp3)
    '''
        
    return nli_df

In [35]:
nli_df = create_nli_dataframe(qas_df, type='rule based')
nli_df

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 411.88it/s]


Unnamed: 0,Premise,Prediction Hypothesis,Gold Hypothesis
0,kolumbus bukanlah orang pertama yang tiba di a...,yang menemuka benua amerika adalah amerika ol...,yang menemuka benua amerika adalah orang - or...
1,kabupaten donggala ( english : donggala regenc...,letak donggala di administrasi terletak di ko...,"letak donggala di provinsi sulawesi tengah, i..."
2,awal mula teknik industri dapat ditelusuri dar...,bapak teknik industri adalah,bapak teknik industri adalah frederick winslo...
3,penghulu rasyid ( lahir di desa telaga itar ta...,penghulu rasyid meninggal pada,penghulu rasyid meninggal pada 15 desember 1861
4,samudra pasifik atau lautan teduh ( dari bahas...,luas samudera pasifik adalah,"luas samudera pasifik adalah 179, 7 juta km²"
5,"geisha ( bahasa jepang : 者 "" seniman "" ) adala...","yang dimaksud denga geisha adalah 19, dan mas...",yang dimaksud denga geisha adalah seniman - p...
6,pada periode 2000 - an bca memperkuat dan meng...,bank bca mengeluarkan kartu debit pada,bank bca mengeluarkan kartu debit pada 2000 - an
7,"general motors company ( nyse : ), juga dikena...",kantor pusat general motors di,kantor pusat general motors di renaissance ce...
8,"selanjutnya, berdasarkan ketentuan dalam undan...",luas kota blitar adalah berdasarkan undang - ...,"luas kota blitar adalah 32, 58 km²"
9,"harumichi bouya ( 春 道, harumichi boya ), atau ...",yang menciptakan serial manga crows adalah cr...,yang menciptakan serial manga crows adalah hi...


## Mencoba cara retrieve model dari HF

In [36]:
from transformers import pipeline

pretrained_name_sc = "afaji/fine-tuned-IndoNLI-Augmented-with-indobert-base-uncased"
pretrained_name_qa = "afaji/fine-tuned-DatasetQAS-TYDI-QA-ID-with-indobert-base-uncased-with-ITTL-without-freeze-LR-1e-05"
tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': MAX_LENGTH}

nlp_sc = pipeline(task="text-classification", model=pretrained_name_sc, tokenizer=pretrained_name_sc, 
                  device=torch.cuda.current_device(), **tokenizer_kwargs)
nlp_qa = pipeline(task="question-answering", model=pretrained_name_qa, tokenizer=pretrained_name_qa, 
                  device=torch.cuda.current_device(), **tokenizer_kwargs)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--afaji--fine-tuned-IndoNLI-Augmented-with-indobert-base-uncased/snapshots/baf8065c541ffd323cf43d1e93868ccbb20febbd/config.json
Model config BertConfig {
  "_name_or_path": "afaji/fine-tuned-IndoNLI-Augmented-with-indobert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "positi

In [37]:
x = nlp_sc({'text': nli_df['Premise'][0], 'text_pair': nli_df['Prediction Hypothesis'][0]}, top_k=3)
x

[{'label': 'entailment', 'score': 0.9969819188117981},
 {'label': 'contradiction', 'score': 0.0017156351823359728},
 {'label': 'neutral', 'score': 0.0013024398358538747}]

In [38]:
nlp_qa(question=qas_df['Question'][1], context=qas_df['Context'][1], device=device)

{'score': 0.3111673891544342,
 'start': 78,
 'end': 113,
 'answer': 'provinsi sulawesi tengah, indonesia'}

## Coba buat method evaluasi

In [None]:
def smoothing(question, context, type):
    for j in hypothesis.split():
        if j in question_mark:
            pred_hypothesis = hypothesis.replace('?', '')
            pred_hypothesis = pred_hypothesis.replace(j, df['Prediction Answer'][i])

            gold_hypothesis = hypothesis.replace('?', '')
            gold_hypothesis = gold_hypothesis.replace(j, df['Gold Answer'][i])

In [39]:
MAXIMUM_SEARCH_ITER =  2

def evaluation(predict_result, type_smoothing="replace first", type_qas="TODO", MAXIMUM_SEARCH_ITER=MAXIMUM_SEARCH_ITER):
    
    # Ekstrak dari PredictionOutput QAS
    predictions_idx = np.argsort(predict_result.predictions, axis=2)[:, :, 1 * -1]
    label_array = np.asarray(predict_result.label_ids)
    
    question_decoded = []
    context_decoded = []
    pred_answer_decoded = []
    gold_answer_decoded = []
    
    question_d = None
    context_d = None
    
    # Iterasi ini ditujukan untuk retrieve answer
    for i in tqdm(range(len(predict_result.predictions[0]))):
        
        isFoundBiggest = False
        
        start_pred_idx = predictions_idx[0][i]
        end_pred_idx = predictions_idx[1][i] + 1
        
        start_gold_idx = label_array[0][i]
        end_gold_idx = label_array[1][i] + 1
        
        # Retrieve answer prediksi
        pred_answer = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                       [start_pred_idx: end_pred_idx], skip_special_tokens=True)
        
        # Retrieve answer gold
        gold_answer = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                       [start_gold_idx: end_gold_idx], skip_special_tokens=True)
        
        question = []
        context = []
        
        # Iterasi ini untuk retrieve question dan context index yang bersangkutan
        for j in range(len(tokenized_data_qas_id_validation[i]['token_type_ids'])):
            
            # Bila token_type_ids-nya 0, maka itu question (sesuai dengan urutan tokenisasi)
            if tokenized_data_qas_id_validation[i]['token_type_ids'][j] == 0:
                question.append(tokenized_data_qas_id_validation[i]['input_ids'][j])
            
            # Bila token_type_ids-nya 1, maka itu context (sesuai dengan urutan tokenisasi)
            else:
                context.append(tokenized_data_qas_id_validation[i]['input_ids'][j])
            
        # Bagian untuk Smoothing (On-Progress, sudah dikerjakan diatas, belum "ditempel" kesini)
        
        question_outside_loop = tokenizer.decode(question, skip_special_tokens=True)
        context_outside_loop = tokenizer.decode(context, skip_special_tokens=True)
        smoothing_hypothesis = 

        # Cek label dari answer prediksi dan context
        predicted_label = nlp_sc({'text': pred_answer, 'text_pair': context_outside_loop}, 
                                 **tokenizer_kwargs)['label']

        # Cek label dari answer prediksi dan context, bila labelnya entailment, maka answernya jadi hasil akhir
        if predicted_label == 'entailment':
            question_decoded.append(tokenizer.decode(question, skip_special_tokens=True))
            context_decoded.append(tokenizer.decode(context, skip_special_tokens=True))
            pred_answer_decoded.append(pred_answer)
            gold_answer_decoded.append(gold_answer)

        # Cek label dari answer prediksi dan context, bila labelnya bukan entailment, maka masuk ke for-loop untuk
        # -- iterasi ke argmax selanjutnya, dengan menggunakan argsort
        else:

            # Bila MAXIMUM_SEARCH_ITER dibawah 2, maka continue langsung
            if MAXIMUM_SEARCH_ITER < 2: continue

            # Bila MAXIMUM_SEARCH_ITER diatas 2, maka continue langsung
            else:

                # Bila bukan entailment, loop sebanyak MAXIMUM_SEARCH_ITER kali.
                for index_largest in range(MAXIMUM_SEARCH_ITER - 1):

                    # Cari di index kedua, ketiga, keempat, dan seterusnya
                    predictions_idx_inside_loop = np.argsort(predict_result.predictions, 
                                                             axis=2)[:, :, (index_largest + 2) * -1]

                    start_pred_idx = predictions_idx_inside_loop[0][i]
                    end_pred_idx = predictions_idx_inside_loop[1][i] + 1

                    # Retrieve answer prediksi
                    pred_answer_inside_loop = tokenizer.decode(tokenized_data_qas_id_validation[i]['input_ids']
                                                   [start_pred_idx: end_pred_idx], skip_special_tokens=True)
                    
                    question_inside_loop = tokenizer.decode(question, skip_special_tokens=True)
                    context_inside_loop = tokenizer.decode(context, skip_special_tokens=True)

                    # Cek label dari answer prediksi dan context
                    predicted_label_inside_loop = nlp_sc({'text': pred_answer_inside_loop, 'text_pair': context_inside_loop}
                                       , **tokenizer_kwargs)['label']

                    # Bila label-nya sudah entailment, maka answernya jadi hasil akhir, dan break
                    if predicted_label_inside_loop == 'entailment':
                        isFoundBiggest = True
                        question_decoded.append(tokenizer.decode(question, skip_special_tokens=True))
                        context_decoded.append(tokenizer.decode(context, skip_special_tokens=True))
                        pred_answer_decoded.append(pred_answer_inside_loop)
                        gold_answer_decoded.append(gold_answer)   
                        break

                if isFoundBiggest == False:
                    # Bila sampai iterasi terakhir, belum entailment juga, maka append saja jawaban kosong
                    question_decoded.append(tokenizer.decode(question, skip_special_tokens=True))
                    context_decoded.append(tokenizer.decode(context, skip_special_tokens=True))
                    pred_answer_decoded.append("NULL ANSWER") # Disini, jawaban kosong
                    gold_answer_decoded.append(gold_answer)
    
    # Buat DataFrame QAS
    qas_df = pd.DataFrame({'Context': context_decoded, 
                           'Question': question_decoded, 
                           'Prediction Answer': pred_answer_decoded,
                          'Gold Answer': gold_answer_decoded})
    
    assert len(predict_result.predictions[0]) == len(qas_df), "Jumlah prediksi berbeda dengan jumlah evaluasi"
    
    # Return DataFrame QAS
    return qas_df

In [40]:
eval_result = evaluation(predict_result)

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.46it/s]


In [41]:
eval_result

Unnamed: 0,Context,Question,Prediction Answer,Gold Answer
0,kolumbus bukanlah orang pertama yang tiba di a...,siapakah yang menemuka benua amerika?,NULL ANSWER,orang - orang viking dari eropa utara
1,kabupaten donggala ( english : donggala regenc...,dimanakah letak donggala?,administrasi terletak di kota donggala. kabupa...,"provinsi sulawesi tengah, indonesia"
2,awal mula teknik industri dapat ditelusuri dar...,siapa bapak teknik industri?,NULL ANSWER,frederick winslow taylor
3,penghulu rasyid ( lahir di desa telaga itar ta...,kapan penghulu rasyid meninggal?,NULL ANSWER,15 desember 1861
4,samudra pasifik atau lautan teduh ( dari bahas...,seberapa luas kah samudera pasifik?,NULL ANSWER,"179, 7 juta km²"
5,"geisha ( bahasa jepang : 者 "" seniman "" ) adala...",apakah yang dimaksud denga geisha?,"ke - 18 dan abad ke - 19, dan masih ada sampai...",seniman - penghibur ( entertainer ) tradisiona...
6,pada periode 2000 - an bca memperkuat dan meng...,kapan bank bca mengeluarkan kartu debit?,NULL ANSWER,2000 - an
7,"general motors company ( nyse : ), juga dikena...",dimana kantor pusat general motors?,NULL ANSWER,"renaissance center di detroit, michigan, ameri..."
8,"selanjutnya, berdasarkan ketentuan dalam undan...",berapa luas kota blitar?,NULL ANSWER,"32, 58 km²"
9,"harumichi bouya ( 春 道, harumichi boya ), atau ...",siapa yang menciptakan serial manga crows?,"crows? harumichi bouya ( 春 道, harumichi boya )...",hiroshi takahashi


In [42]:
def compute_metrics_from_df(df):
    
    denominator = len(df)
    total_correct = 0
    f1_array = []
    precision_array = []
    recall_array = []

    for i in range(len(df)):
        
        pred_text = df["Prediction Answer"][i]
        gold_text = df["Gold Answer"][i]

        if pred_text == gold_text:
            total_correct += 1

        f1, precision, recall = compute_f1_prec_rec(pred=pred_text, gold=gold_text)

        f1_array.append(f1)
        precision_array.append(precision)
        recall_array.append(recall)

    exact_match = ((total_correct / denominator) * 100.0)
    final_f1 = np.mean(f1_array) * 100.0
    final_precision = np.mean(precision_array) * 100.0
    final_recall = np.mean(recall_array) * 100.0

    return {'exact_match': exact_match, 'f1': final_f1, 'precision': final_precision, 'recall': final_recall}

In [43]:
metric_result_after_filtering = compute_metrics_from_df(eval_result)
metric_result_after_filtering

{'exact_match': 0.0,
 'f1': 1.2389298465247829,
 'precision': 0.6461036311709506,
 'recall': 17.0}

In [95]:
def convert_to_non_zero(number):
    if number == 0:
        number += sys.float_info.min
    return number

def diff_metric(metric_result_before, metric_result_after, metric):
    
    percentage = round(((metric_result_after - metric_result_before) / metric_result_before) * 100, 2)
    
    if metric_result_before ==  metric_result_after:
        print(f"Hasil metrik {metric} sebelum filtering NLI SAMA DENGAN metrik setelah filtering NLI")
    elif metric_result_before <  metric_result_after:
        print(f"Hasil metrik {metric} setelah filtering NLI mengalami KENAIKAN sebesar: {percentage} %")
    elif metric_result_before >  metric_result_after:
        print(f"Hasil metrik {metric} setelah filtering NLI mengalami PENURUNAN sebesar: {-1 * percentage} %")

In [96]:
def compare_metrics(metrics_before, metrics_after):
    
    em_before = metrics_before['exact_match']
    f1_before = metrics_before['f1']
    prec_before = metrics_before['precision']
    rec_before = metrics_before['recall']
    
    print(f"Skor Exact Match sebelum filtering NLI: {em_before}")
    print(f"Skor F1 sebelum filtering NLI: {f1_before}")
    print(f"Skor Precision sebelum filtering NLI: {prec_before}")
    print(f"Skor Recall sebelum filtering NLI: {rec_before}")
    print()
    
    em_after = metrics_after['exact_match']
    f1_after = metrics_after['f1']
    prec_after = metrics_after['precision']
    rec_after = metrics_after['recall']
    
    print(f"Skor Exact Match setelah filtering NLI: {em_after}")
    print(f"Skor F1 setelah filtering NLI: {f1_after}")
    print(f"Skor Precision setelah filtering NLI: {prec_after}")
    print(f"Skor Recall setelah filtering NLI: {rec_after}")
    print()
    
    em_before = convert_to_non_zero(em_before)
    f1_before = convert_to_non_zero(f1_before)
    prec_before = convert_to_non_zero(prec_before)
    rec_before = convert_to_non_zero(rec_before)
    
    em_after = convert_to_non_zero(em_after)
    f1_after = convert_to_non_zero(f1_after)
    prec_after = convert_to_non_zero(prec_after)
    rec_after = convert_to_non_zero(rec_after)
  
    print("Persentase perubahan hasil metrik:")
    diff_metric(em_before, em_after, "Exact Match")
    diff_metric(f1_before, f1_after, "F1")
    diff_metric(prec_before, prec_after, "Precision")
    diff_metric(rec_before, rec_after, "Recall")

In [97]:
compare_metrics(metric_result_before_filtering, metric_result_after_filtering)

Skor Exact Match sebelum filtering NLI: 0.0
Skor F1 sebelum filtering NLI: 2.6614113018379757
Skor Precision sebelum filtering NLI: 1.4296741925116505
Skor Recall sebelum filtering NLI: 26.666666666666668

Skor Exact Match setelah filtering NLI: 0.0
Skor F1 setelah filtering NLI: 1.2389298465247829
Skor Precision setelah filtering NLI: 0.6461036311709506
Skor Recall setelah filtering NLI: 17.0

Persentase perubahan hasil metrik:
Hasil metrik Exact Match sebelum filtering NLI SAMA DENGAN metrik setelah filtering NLI
Hasil metrik F1 setelah filtering NLI mengalami PENURUNAN sebesar: 53.45 %
Hasil metrik Precision setelah filtering NLI mengalami PENURUNAN sebesar: 54.81 %
Hasil metrik Recall setelah filtering NLI mengalami PENURUNAN sebesar: 36.25 %


In [98]:
# TODO: tempel smoothing di evaluation()