# Menjalankan QA Tanpa Intermediate Task - Transfer Learning

# Import semua module

In [1]:
#!pip install datasets
#!pip install transformers
#!pip install tensorboard
#!pip install evaluate
#!pip install git+https://github.com/IndoNLP/nusa-crowd.git@release_exp

In [2]:
import os

# Configure the Hugging Face libraries before they are imported:
# turn off tokenizer-level parallelism and silence advisory warnings.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
})

In [3]:
#!pip install --upgrade pip
!pip install -r requirements.txt --use-deprecated=legacy-resolver

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
# Melihat GPU yang tersedia dan penggunaannya.
!nvidia-smi

Fri Apr 14 10:05:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   59C    P0   257W / 300W |  26259MiB / 32510MiB |     40%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   43C    P0    44W / 300W |      0MiB / 32510MiB |      0%      Default |
|       

In [5]:
# Select which physical GPU this process may see (here: the GPU with index 3).
# Must run before the first CUDA initialisation for the setting to take effect.
# NOTE(review): the original comment mentioned "GPU #7" as an example while the
# code selects index '3' — confirm the intended device.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [6]:
import transformers
import evaluate
import torch
import operator
import ast
import json
import re
import sys
import contextlib

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from torch.utils.data import DataLoader
from datetime import datetime
from huggingface_hub import notebook_login
from tqdm import tqdm

from datasets import (
    load_dataset, 
    load_from_disk,
    Dataset,
    DatasetDict
)
from transformers import (
    BigBirdTokenizerFast,
    BigBirdForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BertForSequenceClassification,
    BertForQuestionAnswering,
    AutoModel, 
    BertTokenizerFast,
    AutoTokenizer, 
    AutoModel, 
    BertTokenizer, 
    BertForPreTraining,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    EvalPrediction,
    AutoModel,
    BertModel
)

# Definisikan hyperparameter

In [7]:
# Checkpoint to fine-tune. Alternatives that were tried are kept commented out.
#MODEL_NAME = "indolem/indobert-base-uncased"
#MODEL_NAME = "afaji/fine-tuned-IndoNLI-Translated-with-indobert-base-uncased"
MODEL_NAME = "afaji/fine-tuned-DatasetQAS-TYDI-QA-ID-with-indobert-base-uncased-with-ITTL-without-freeze-LR-1e-05"
#MODEL_NAME = "indobenchmark/indobert-large-p2"
SEED = 42
EPOCH = 1
BATCH_SIZE = 16  # per-device batch size for both training and evaluation
GRADIENT_ACCUMULATION = 4
LEARNING_RATE = 1e-5
MAX_LENGTH = 400  # maximum tokens per (question, context) window
STRIDE = 100  # token overlap between consecutive windows of a long context
LOGGING_STEPS = 50
WARMUP_RATIO = 0.06
WEIGHT_DECAY = 0.01
MAXIMUM_SEARCH_ITER =  2  # not used in the visible part of this notebook
# SAMPLE caps how many tokenized rows of each split are kept.
# sys.maxsize keeps the whole dataset; for a quick experiment, switch to the
# small value in the commented-out line below.
SAMPLE = sys.maxsize
# SAMPLE = 50

# Import dataset QAS

In [8]:
#!pip install git+https://github.com/IndoNLP/nusa-crowd.git

In [9]:
#!pip install torch==1.7.0 torchvision==0.8.1 -f https://download.pytorch.org/whl/cu101/torch_stable.html

In [10]:
#!pip uninstall 

In [11]:
import torch

print(torch.version.cuda)

11.7


In [12]:
#!pip install tensorflow

In [13]:
#import tensorflow as tf
#from tensorflow.python.platform import build_info as build
#print(f"tensorflow version: {tf.__version__}")
#print(f"Cuda Version: {build.build_info['cuda_version']}")
#print(f"Cudnn version: {build.build_info['cudnn_version']}")

In [14]:
#!pip list

In [15]:
#!nvcc --version

In [16]:
#!pip uninstall torch
#!pip install torch

In [17]:
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2

In [18]:
#!nvidia-smi

In [19]:
#!pip uninstall -y torch

In [20]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

In [21]:
#!pip install cudatoolkit==11.5

In [22]:
#!pip install conda

In [23]:
#!conda install cudatoolkit=11.5

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
torch._C._cuda_getDeviceCount()

1

In [26]:
torch.cuda.is_available()

True

In [27]:
device

device(type='cuda')

In [28]:
print(torch.__version__)

2.0.0+cu117


# Definisikan tokenizer

In [29]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Definisikan fungsi pre-processnya

In [30]:
def flatten_qas_split(split_df):
    """Flatten one IDK-MRC split into one row per question.

    Args:
        split_df: DataFrame with a ``context`` column and a ``qas`` column,
            where each ``qas`` entry is a list of dicts holding ``question``
            and ``answers`` (each answer having ``text`` and ``answer_start``).

    Returns:
        DataFrame with columns ``context``, ``question`` and ``answer``.
        ``answer`` is a dict with ``text``, ``answer_start`` and ``answer_end``
        (character offsets). Only the first listed answer is used; questions
        without any answer get an empty text and offsets ``0``/``0``.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    records = []
    for context, qas in tqdm(zip(split_df["context"], split_df["qas"]), total=len(split_df)):
        for qa in qas:
            answers = qa['answers']
            if len(answers) != 0:
                first = answers[0]
                answer = {"text": first['text'],
                          "answer_start": first['answer_start'],
                          "answer_end": first['answer_start'] + len(first['text'])}
            else:
                answer = {"text": str(),
                          "answer_start": 0,
                          "answer_end": 0}
            records.append({'context': context,
                            'question': qa['question'],
                            'answer': answer})
    return pd.DataFrame(records, columns=['context', 'question', 'answer'])


# Load the raw IDK-MRC dataset through the NusaCrowd config helper.
conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
df_validation = pd.DataFrame(data_qas_id['validation'])

cols = ['context', 'question', 'answer']
new_df_val = flatten_qas_split(df_validation)
new_df_train = flatten_qas_split(df_train)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)

# Replace the raw dataset with the flattened train/validation splits.
data_qas_id = DatasetDict({"train": train_dataset, "validation": validation_dataset})



  0%|          | 0/3 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████| 358/358 [00:01<00:00, 200.27it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3659/3659 [00:23<00:00, 154.30it/s]


In [31]:
def rindex(lst, value, operator=operator):
    """Return the index of the last occurrence of ``value`` in ``lst``.

    The sequence is scanned from the back with ``operator.indexOf``; a missing
    value therefore raises ``ValueError``.
    """
    distance_from_end = operator.indexOf(reversed(lst), value)
    last_position = len(lst) - 1
    return last_position - distance_from_end

def preprocess_function_qa(examples, tokenizer, MAX_LENGTH=MAX_LENGTH, STRIDE=STRIDE, rindex=rindex, operator=operator):
    """Tokenize a batch of QA examples and attach token-level answer labels.

    Each (question, context) pair is tokenized with truncation; long contexts
    overflow into several windows that overlap by ``STRIDE`` tokens, so one
    example can yield several output rows.

    Args:
        examples: batch with ``question``, ``context`` and ``answer`` columns.
            Each answer is a dict (or the string repr of one) with
            ``answer_start`` and ``answer_end`` character offsets.
        tokenizer: fast tokenizer (offset mappings are required).
        MAX_LENGTH: maximum number of tokens per window.
        STRIDE: token overlap between consecutive windows.
        rindex: helper returning the last index of a value in a sequence.
        operator: the ``operator`` module, forwarded to ``rindex``.

    Returns:
        The tokenizer output extended with ``start_positions`` and
        ``end_positions`` (one entry per window). Both are ``0`` when the
        window does not fully contain the answer span.
    """
    # Work on local copies instead of overwriting the caller's batch.
    # NOTE(review): lstrip() on the context shifts character offsets if a
    # context starts with whitespace, while answer_start/answer_end were
    # computed on the unstripped text — confirm no context has leading spaces.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = [c.lstrip() for c in examples["context"]]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation=True,
        max_length=MAX_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors='np'
    )

    tokenized_examples['start_positions'] = []
    tokenized_examples['end_positions'] = []

    for seq_idx in range(len(tokenized_examples['input_ids'])):
        seq_ids = tokenized_examples.sequence_ids(seq_idx)
        offset_mappings = tokenized_examples['offset_mapping'][seq_idx]

        # Map this window back to the example it was cut from.
        cur_example_idx = tokenized_examples['overflow_to_sample_mapping'][seq_idx]

        answer = examples['answer'][cur_example_idx]
        if isinstance(answer, str):
            # Parse a stringified dict safely; never eval() dataset content.
            answer = ast.literal_eval(answer)
        answer_start = answer['answer_start']
        answer_end = answer['answer_end']

        # First and last token positions that belong to the context (sequence id 1).
        context_pos_start = seq_ids.index(1)
        context_pos_end = rindex(seq_ids, 1, operator)

        # Default label (0, 0) is used when the answer is not inside this window.
        # NOTE(review): for unanswerable questions (offsets 0/0) in a window
        # starting at character 0, the scan below yields start = first context
        # token and end = the token just before it, not (0, 0) — confirm intended.
        s = e = 0
        if (offset_mappings[context_pos_start][0] <= answer_start and
                offset_mappings[context_pos_end][1] >= answer_end):
            # Walk forward to the first token starting at or after answer_start.
            i = context_pos_start
            while offset_mappings[i][0] < answer_start:
                i += 1
            if offset_mappings[i][0] == answer_start:
                s = i
            else:
                # Answer starts inside the previous token.
                s = i - 1

            # Walk backward to the last token ending at or before answer_end.
            j = context_pos_end
            while offset_mappings[j][1] > answer_end:
                j -= 1
            if offset_mappings[j][1] == answer_end:
                e = j
            else:
                # Answer ends inside the next token.
                e = j + 1

        tokenized_examples['start_positions'].append(s)
        tokenized_examples['end_positions'].append(e)
    return tokenized_examples

# Mulai tokenisasi dan pre-process

In [32]:
# Tokenize every split in batches with preprocess_function_qa.
# The original columns are removed because one example can produce several
# tokenized windows, so the row counts no longer match.
tokenized_data_qas_id = data_qas_id.map(
    preprocess_function_qa,
    batched=True,
    remove_columns=data_qas_id['train'].column_names,
    num_proc=1,
    fn_kwargs={'tokenizer': tokenizer, 'MAX_LENGTH': MAX_LENGTH, 'STRIDE': STRIDE, 'rindex': rindex, 'operator': operator}
)



  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [33]:
tokenized_data_qas_id = tokenized_data_qas_id.remove_columns(["offset_mapping", 
                                            "overflow_to_sample_mapping"])

In [34]:
tokenized_data_qas_id.set_format("torch", columns=["input_ids", "token_type_ids"], output_all_columns=True)

In [35]:
# Keep at most SAMPLE rows of each split: slicing returns a dict of columns,
# which is wrapped back into a Dataset.
tokenized_data_qas_id_train = Dataset.from_dict(tokenized_data_qas_id["train"][:SAMPLE])
tokenized_data_qas_id_validation = Dataset.from_dict(tokenized_data_qas_id["validation"][:SAMPLE])

# Mendefinisikan argumen (dataops) untuk training nanti

In [36]:
# Timestamp made filesystem-friendly (':' -> '-', ' ' and '.' -> '_') so each
# run writes into its own directory tree under ./results/.
TIME_NOW = str(datetime.now()).replace(":", "-").replace(" ", "_").replace(".", "_")
QA = './results/alur2-idk-mrc'
CHECKPOINT_DIR = f'{QA}-{TIME_NOW}/checkpoint/'  # Trainer checkpoints
MODEL_DIR = f'{QA}-{TIME_NOW}/model/'  # final saved model
OUTPUT_DIR = f'{QA}-{TIME_NOW}/output/'  # raw prediction dump
ACCURACY_DIR = f'{QA}-{TIME_NOW}/accuracy/'  # metric results

# Mendefinisikan Training Arguments untuk train

In [37]:
# Evaluate and checkpoint roughly twice per epoch. The interval is computed
# once and clamped to at least 1: with a small SAMPLE the original expression
# truncates to 0, which is not a usable step interval.
HALF_EPOCH_STEPS = max(
    1,
    int((tokenized_data_qas_id_train.num_rows / (BATCH_SIZE * GRADIENT_ACCUMULATION)) * 0.5),
)

training_args_qa = TrainingArguments(

    # Checkpoint
    output_dir=CHECKPOINT_DIR,
    overwrite_output_dir=True,
    save_strategy='steps',
    # NOTE(review): the checkpoint limit is tied to the epoch count — confirm intended.
    save_total_limit=EPOCH,

    # Log
    report_to='tensorboard',
    logging_strategy='steps',
    logging_first_step=True,
    logging_steps=LOGGING_STEPS,

    # Train
    num_train_epochs=EPOCH,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    bf16=False,
    dataloader_num_workers=cpu_count(),

    # Miscellaneous
    evaluation_strategy='steps',
    save_steps=HALF_EPOCH_STEPS,
    eval_steps=HALF_EPOCH_STEPS,
    seed=SEED,
    #hub_token=HUB_TOKEN,
    #push_to_hub=True,
    #hub_model_id=REPO_NAME,
    #load_best_model_at_end=True,
    #metric_for_best_model='f1',
)

# Pendefinisian model Question Answering

In [38]:
model_qa = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

In [39]:
model_qa = model_qa.to(device)

# Melakukan pengumpulan data dengan padding

In [40]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Mulai training untuk fine-tune IDK-MRC di atas IndoBERT

In [41]:
import string
import collections

# # Melakukan evaluasi dari prediksi
def normalize_text(s):
    """Normalize an answer string before token-level comparison.

    Steps, in order: lowercase, drop ASCII punctuation, replace the English
    articles "a", "an" and "the" with a space, collapse all whitespace runs
    into single spaces (which also trims both ends).
    """
    lowered = s.lower()
    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())

def compute_f1_prec_rec(pred, gold):
    """Token-overlap F1 between a predicted and a gold answer string.

    Both strings are normalized with ``normalize_text`` and split on
    whitespace. If either side has no tokens the result is ``1`` when both
    are empty and ``0`` otherwise; with no shared tokens the result is ``0``.
    Only the F1 value is returned (precision and recall are intermediates).
    """
    predicted = normalize_text(pred).split()   # true positives + false positives
    reference = normalize_text(gold).split()   # true positives + false negatives

    if not reference or not predicted:
        return int(reference == predicted)

    # Multiset intersection counts every shared token occurrence once.
    overlap = collections.Counter(predicted) & collections.Counter(reference)
    shared = sum(overlap.values())
    if shared == 0:
        return 0

    precision = 1.0 * shared / len(predicted)
    recall = 1.0 * shared / len(reference)
    return (2.0 * precision * recall) / (precision + recall)

def compute_metrics(predict_result):
    """Compute exact-match and token-level F1 (both in percent).

    Args:
        predict_result: object with ``predictions`` (start logits, end logits)
            and ``label_ids`` (gold start positions, gold end positions).
            Row ``i`` is decoded against row ``i`` of the module-level
            ``tokenized_data_qas_id_validation``, so the predictions must come
            from that dataset, in the same order.

    Returns:
        dict with ``exact_match`` and ``f1``. Both are ``0.0`` when there are
        no predictions.
    """
    # argmax over the token axis gives the best start/end index per row.
    predictions_idx = np.argmax(predict_result.predictions, axis=2)
    label_array = np.asarray(predict_result.label_ids)

    num_examples = len(predictions_idx[0])
    if num_examples == 0:
        # Avoid dividing by zero / averaging an empty list.
        return {'exact_match': 0.0, 'f1': 0.0}

    total_correct = 0
    f1_array = []

    for i in range(num_examples):
        # Fetch the row once; each dataset lookup materializes the full row.
        input_ids = tokenized_data_qas_id_validation[i]['input_ids']

        # End indices are inclusive, hence the +1 for slicing.
        start_pred_idx = predictions_idx[0][i]
        end_pred_idx = predictions_idx[1][i] + 1
        start_gold_idx = label_array[0][i]
        end_gold_idx = label_array[1][i] + 1

        # NOTE(review): special tokens are kept here, whereas
        # create_qas_dataframe decodes with skip_special_tokens=True — confirm
        # which behavior is wanted for "no answer" spans.
        pred_text = tokenizer.decode(input_ids[start_pred_idx: end_pred_idx])
        gold_text = tokenizer.decode(input_ids[start_gold_idx: end_gold_idx])

        if pred_text == gold_text:
            total_correct += 1

        f1_array.append(compute_f1_prec_rec(pred=pred_text, gold=gold_text))

    exact_match = ((total_correct / num_examples) * 100.0)
    final_f1 = np.mean(f1_array) * 100.0

    return {'exact_match': exact_match, 'f1': final_f1}

In [42]:
# Wire model, training arguments, tokenized splits, collator and the custom
# metric function into a single Trainer instance.
trainer_qa = Trainer(
    model=model_qa,
    args=training_args_qa,
    train_dataset=tokenized_data_qas_id_train,
    eval_dataset=tokenized_data_qas_id_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [43]:
def flatten_qas_split(split_df):
    """Flatten one IDK-MRC split into one row per question.

    Args:
        split_df: DataFrame with a ``context`` column and a ``qas`` column,
            where each ``qas`` entry is a list of dicts holding ``question``
            and ``answers`` (each answer having ``text`` and ``answer_start``).

    Returns:
        DataFrame with columns ``context``, ``question`` and ``answer``.
        ``answer`` is a dict with ``text``, ``answer_start`` and ``answer_end``
        (character offsets). Only the first listed answer is used; questions
        without any answer get an empty text and offsets ``0``/``0``.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    records = []
    for context, qas in tqdm(zip(split_df["context"], split_df["qas"]), total=len(split_df)):
        for qa in qas:
            answers = qa['answers']
            if len(answers) != 0:
                first = answers[0]
                answer = {"text": first['text'],
                          "answer_start": first['answer_start'],
                          "answer_end": first['answer_start'] + len(first['text'])}
            else:
                answer = {"text": str(),
                          "answer_start": 0,
                          "answer_end": 0}
            records.append({'context': context,
                            'question': qa['question'],
                            'answer': answer})
    return pd.DataFrame(records, columns=['context', 'question', 'answer'])


# Reload the raw IDK-MRC dataset, this time including the test split.
# This rebinds data_qas_id to the flattened (untokenized) text splits.
conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
df_validation = pd.DataFrame(data_qas_id['validation'])
df_test = pd.DataFrame(data_qas_id['test'])

cols = ['context', 'question', 'answer']
new_df_train = flatten_qas_split(df_train)
new_df_val = flatten_qas_split(df_validation)
new_df_test = flatten_qas_split(df_test)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas_id = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})



  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 3659/3659 [00:22<00:00, 164.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 358/358 [00:01<00:00, 201.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 378/378 [00:01<00:00, 234.52it/s]


In [44]:
data_qas_id

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9332
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 764
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 844
    })
})

In [45]:
trainer_qa.train()



Step,Training Loss,Validation Loss,Exact Match,F1
73,2.1855,1.190539,46.465969,50.663675
146,1.3058,1.115683,49.60733,53.467761


TrainOutput(global_step=147, training_loss=1.584446754585318, metrics={'train_runtime': 365.0544, 'train_samples_per_second': 25.84, 'train_steps_per_second': 0.403, 'total_flos': 1920531162009600.0, 'train_loss': 1.584446754585318, 'epoch': 1.0})

# Menyimpan model Question Answering

In [46]:
trainer_qa.save_model(MODEL_DIR)

# Melakukan prediksi dari model

In [47]:
predict_result = trainer_qa.predict(tokenized_data_qas_id_validation)
predict_result

PredictionOutput(predictions=(array([[-6.435914  , -6.9685993 , -6.298404  , ..., -9.324049  ,
        -9.257196  , -9.116114  ],
       [-4.8937144 , -4.2243876 , -1.7159504 , ..., -8.833822  ,
        -8.811077  , -8.838762  ],
       [-4.718739  , -4.789043  , -0.86703014, ..., -8.805774  ,
        -8.817287  , -8.819923  ],
       ...,
       [-3.012759  , -4.7193727 , -7.117803  , ..., -7.823757  ,
        -7.709506  , -7.9639964 ],
       [-3.542631  , -4.5105743 , -7.0690026 , ..., -6.9549575 ,
        -6.9480996 , -7.521445  ],
       [-3.6498957 , -4.6104627 , -7.0684705 , ..., -6.96615   ,
        -6.959043  , -7.6362724 ]], dtype=float32), array([[-4.8501034 , -6.4872174 , -7.496827  , ..., -7.904192  ,
        -7.8175154 , -7.897848  ],
       [-2.5316944 , -3.6562064 , -3.9014554 , ..., -7.812762  ,
        -7.800026  , -7.813285  ],
       [-2.9194224 , -3.3534932 , -3.4802475 , ..., -7.994805  ,
        -7.988219  , -8.050567  ],
       ...,
       [-0.39873138, -4.56907

In [48]:
# Persist the raw prediction output as text.
# The directory is created directly (no reliance on OUTPUT_DIR ending in '/'),
# and the `with` block closes the file, so no explicit close() is needed.
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(os.path.join(OUTPUT_DIR, 'output.txt'), "w") as f:
    f.write(str(predict_result))

# Melakukan evaluasi dari prediksi

In [49]:
metric_result_before_filtering = compute_metrics(predict_result)
metric_result_before_filtering

{'exact_match': 49.60732984293193, 'f1': 53.46776111438457}

In [50]:
# Persist the metric dictionary as text.
# The directory is created directly (no reliance on ACCURACY_DIR ending in '/'),
# and the `with` block closes the file, so no explicit close() is needed.
os.makedirs(ACCURACY_DIR, exist_ok=True)
with open(os.path.join(ACCURACY_DIR, 'accuracy.txt'), "w") as f:
    f.write(str(metric_result_before_filtering))

## Coba Alur 2

In [51]:
from tqdm import tqdm

def create_qas_dataframe(predict_result=predict_result, index_largest=1):
    """Decode predictions and gold labels back to text, one row per example.

    Args:
        predict_result: object with ``predictions`` (start logits, end logits)
            and ``label_ids`` (gold start, gold end). Row ``i`` is decoded
            against row ``i`` of the module-level
            ``tokenized_data_qas_id_validation``. The default is the
            ``predict_result`` that exists when this function is defined.
        index_largest: rank of the logit to use; ``1`` picks the highest
            scoring start/end index, ``2`` the second highest, and so on.

    Returns:
        DataFrame with columns ``Context``, ``Question``,
        ``Prediction Answer`` and ``Gold Answer``.
    """
    # argsort is ascending, so the k-th largest logit sits at position -k.
    predictions_idx = np.argsort(predict_result.predictions, axis=2)[:, :, index_largest * -1]
    label_array = np.asarray(predict_result.label_ids)
    question_decoded = []
    context_decoded = []
    pred_answer_decoded = []
    gold_answer_decoded = []

    for i in tqdm(range(len(predict_result.predictions[0]))):
        # Fetch the row once per example. Indexing the dataset for every
        # token re-materializes the whole row each time and dominated runtime.
        row = tokenized_data_qas_id_validation[i]
        input_ids = row['input_ids']
        token_type_ids = row['token_type_ids']

        # End indices are inclusive, hence the +1 for slicing.
        start_pred_idx = predictions_idx[0][i]
        end_pred_idx = predictions_idx[1][i] + 1
        pred_answer = tokenizer.decode(input_ids[start_pred_idx: end_pred_idx],
                                       skip_special_tokens=True)
        pred_answer_decoded.append(pred_answer)

        start_gold_idx = label_array[0][i]
        end_gold_idx = label_array[1][i] + 1
        gold_answer = tokenizer.decode(input_ids[start_gold_idx: end_gold_idx],
                                       skip_special_tokens=True)
        gold_answer_decoded.append(gold_answer)

        # Tokens with segment id 0 go to the question, all others to the context.
        question = []
        context = []
        for token_id, segment_id in zip(input_ids, token_type_ids):
            if segment_id == 0:
                question.append(token_id)
            else:
                context.append(token_id)

        question_decoded.append(tokenizer.decode(question, skip_special_tokens=True))
        context_decoded.append(tokenizer.decode(context, skip_special_tokens=True))

    qas_df = pd.DataFrame({'Context': context_decoded,
                           'Question': question_decoded,
                           'Prediction Answer': pred_answer_decoded,
                           'Gold Answer': gold_answer_decoded})

    return qas_df

In [52]:
qas_df = create_qas_dataframe(predict_result)
qas_df

100%|█████████████████████████████████████████████████████████████████████████████████| 764/764 [08:43<00:00,  1.46it/s]


Unnamed: 0,Context,Question,Prediction Answer,Gold Answer
0,sistem pemosisi global [ 1 ] ( bahasa inggris ...,apa kepanjangan dari gps?,,global positioning system
1,"ukuran reptil bervariasi, dari yang berukuran ...",apakah cabang ilmu pengetahuan alam yang mempe...,herpetologi,herpetologi
2,"ukuran reptil bervariasi, dari yang berukuran ...",apa cabang ilmu pengetahuan alam yang tidak me...,herpetologi,
3,"reptil ( binatang melata, atau dalam bahasa la...",apakah maksud reptil dalam bahasa latin?,kelompok hewan vertebrata berdarah dingin dan ...,' melata'atau'merayap '
4,"reptil ( binatang melata, atau dalam bahasa la...",apakah maksud reptil ganas dalam bahasa latin?,"reptil ( binatang melata, atau dalam bahasa la...",
...,...,...,...,...
759,"realme meluncurkan smartphone pertamanya "" rea...",apakah smartphone pertama yang diproduksi realme?,,realme 1
760,thariq bin ziyad lebih banyak dikenal sebagai ...,siapa yang dikenal sebagai penakluk spanyol?,thariq bin ziyad,thariq bin ziyad
761,thariq bin ziyad lebih banyak dikenal sebagai ...,siapa yang dikenal sebagai penakluk spanyol pa...,thariq bin ziyad,
762,adalah angkatan laut kekaisaran jepang ( kaigu...,siapa yang menggunakan kapal induk secara efek...,,angkatan laut kekaisaran jepang ( kaigun )


In [53]:
# Lower-case Indonesian question words, grouped one family per line
# (who / what / where / when / how / why / how much). smoothing() uses this
# list as its default to find the interrogative token inside a question.
question_mark = ['siapa', 'siapakah',
                    'apa', 'apakah', 'adakah',
                    'dimana', 'dimanakah', 'darimanakah',
                    'kapan', 'kapankah',
                    'bagaimana', 'bagaimanakah',
                    'kenapa', 'mengapa',
                    'berapa', 'berapakah', 'seberapa']

## Mencoba cara retrieve model dari HF

In [54]:
#!pip install torch torchvision torchaudio

In [55]:
torch.cuda.is_available()

True

In [56]:
#!nvidia-smi

In [57]:
from transformers import pipeline

# Hub checkpoints for the sequence-classification and question-answering pipelines.
pretrained_name_sc = "afaji/fine-tuned-IndoNLI-Augmented-with-indobert-base-uncased"
pretrained_name_qa = "afaji/fine-tuned-DatasetQAS-TYDI-QA-ID-with-indobert-base-uncased-with-ITTL-without-freeze-LR-1e-05"
# Forwarded to pipeline() as extra keyword arguments.
tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': MAX_LENGTH}

nlp_sc = pipeline(task="text-classification", model=pretrained_name_sc, tokenizer=pretrained_name_sc, 
                   **tokenizer_kwargs)
nlp_qa = pipeline(task="question-answering", model=pretrained_name_qa, tokenizer=pretrained_name_qa, 
                   **tokenizer_kwargs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/737k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [99]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoints for the two text2text-generation (paraphrase) pipelines.
pretrained_name_ind_tg = "Wikidepia/IndoT5-base-paraphrase"
pretrained_name_eng_tg = "humarin/chatgpt_paraphraser_on_T5_base"

# Rebinds tokenizer_kwargs without the 'padding' entry; forwarded to pipeline()
# as extra keyword arguments.
tokenizer_kwargs = {'truncation': True, 'max_length': MAX_LENGTH}

nlp_tg = pipeline(task="text2text-generation", model=pretrained_name_ind_tg, tokenizer=pretrained_name_ind_tg, **tokenizer_kwargs)
nlp_tg_eng = pipeline(task="text2text-generation", model=pretrained_name_eng_tg, tokenizer=pretrained_name_eng_tg, **tokenizer_kwargs)

## Coba buat method evaluasi

In [105]:
def question_type(data):
    """Count questions per interrogative category.

    Each question is split on whitespace and assigned to the first category
    (in the order listed below) that has one of its keywords among the tokens.
    Matching is exact and case-sensitive, so a keyword with punctuation
    attached (e.g. ``"apa?"``) does not match.

    Args:
        data: mapping (dict, DataFrame, ...) whose ``'question'`` entry is an
            iterable of question strings.

    Returns:
        dict mapping ``'apa'``, ``'dimana'``, ``'kapan'``, ``'siapa'``,
        ``'bagaimana'``, ``'kenapa'``, ``'berapa'`` and ``'others'`` to the
        number of questions in that category.
    """
    # Ordered (category, keywords) pairs; order defines matching priority.
    categories = (
        ('apa', ('Apa', 'Apakah', 'apa', 'apakah')),
        ('dimana', ('Dimana', 'dimana', 'mana')),
        ('kapan', ('Kapan', 'kapan')),
        ('siapa', ('Siapa', 'siapa')),
        ('bagaimana', ('Bagaimana', 'bagaimana')),
        ('kenapa', ('Mengapa', 'Kenapa', 'mengapa', 'kenapa')),
        ('berapa', ('Berapa', 'Berapakah', 'berapa', 'berapakah')),
    )

    # Plain integer counters (the previous pd.DataFrame() "counters" stayed
    # empty under `+= 1` and nothing was returned).
    counts = {name: 0 for name, _ in categories}
    counts['others'] = 0

    for question in data['question']:
        tokens = set(question.split())
        for name, keywords in categories:
            if any(keyword in tokens for keyword in keywords):
                counts[name] += 1
                break
        else:
            counts['others'] += 1

    return counts

Context
Question
Prediction Answer
Gold Answer


In [132]:
# Inspect the rule-based hypothesis for every "bagaimana" (how) question.
# smoothing() is defined in another cell of this notebook and must already
# have been run.

def three_print(data=qas_df, index=None):
    """Print context, question, gold answer and rule-based gold hypothesis.

    Args:
        data: DataFrame with ``Context``, ``Question``, ``Prediction Answer``
            and ``Gold Answer`` columns.
        index: row label to print. When omitted, the module-level loop
            variable ``i`` is used, which keeps old ``three_print()`` calls
            working.
    """
    if index is None:
        index = i  # legacy behavior: read the caller's loop variable
    print(f"Context: {data['Context'][index]}")
    print(f"Question: {data['Question'][index]}")
    print(f"Answer: {data['Gold Answer'][index]}")
    pred_hypothesis, gold_hypothesis = smoothing(data['Question'][index],
                                                 data['Prediction Answer'][index],
                                                 data['Gold Answer'][index],
                                                 type="rule based")
    print(f"Hypothesis: {gold_hypothesis}")
    print()

for i in range(len(qas_df['Question'])):
    current_question = qas_df['Question'][i].split()

    # Pass the row explicitly instead of relying on the global loop variable.
    if 'Bagaimana' in current_question or 'bagaimana' in current_question:
        three_print(index=i)

Context: hari bastille adalah nama lain untuk hari nasional perancis yang dirayakan tanggal 14 juli setiap tahunnya. di perancis, nama resminya adalah la fete nationale ( perayaan nasional ) dan umumnya le quatorze juillet ( empat belas juli ). hari ini merayakan fete de la federation 1790 yang diadakan pada ulang tahun pertama penyerbuan bastille tanggal 14 juli 1789 ; ulang tahun penyerbuan benteng penjara bastille dipandang sebagai simbol pemberontakan bangsa yang modern ini, dan rekonsiliasi seluruh rakyat perancis di dalam kekuasaan monarki konstitusional sebelum republik pertama. pesta dan upacara resmi diselenggarakan di seluruh perancis. parade militer rutin tertua dan terbesar di eropa diadakan pada pagi 14 juli, di jalan champs - elysees, paris, di hadapan presiden republik, pejabat pemerintahan perancis, dan perwakilan asing. [ 1 ] [ 2 ]
Question: bagaimana hari kemerdekaan prancis dirayakan?
Answer: 
Hypothesis: hari kemerdekaan prancis dirayakan adalah 

Context: musik gos

In [129]:
def smoothing(question, pred_answer, gold_answer, type, question_mark=question_mark):
    """Turn a question and its answers into hypothesis sentences.

    The same rewriting strategy is applied to the predicted answer and to the
    gold answer, so both hypotheses are always built the same way.

    Args:
        question: Question text.
        pred_answer: Answer predicted by the QA model.
        gold_answer: Reference answer.
        type: Name of the strategy:
            'replace first'  - swap the first word of the question for the answer;
            'replace question mark' - swap a recognised question word for the answer;
            'add adalah' - drop the first word and append "adalah <answer>";
            'just concat answer and question' - "<question> <answer>";
            'rule based' - per-question-word templates;
            'machine generation' - rule-based output passed through `nlp_tg`;
            'machine generation with translation' - rule-based output translated
            to English with `GoogleTranslator`, passed through `nlp_tg_eng`,
            then translated back to Indonesian.
        question_mark: Collection of question words recognised by the
            'replace question mark' and 'rule based' strategies.

    Returns:
        Tuple `(pred_hypothesis, gold_hypothesis)`.

    Raises:
        ValueError: If `type` is not one of the strategies above.
    """
    if type == 'machine generation':
        # Forward question_mark so a caller-supplied word list is not lost.
        pred_hypothesis, gold_hypothesis = smoothing(
            question, pred_answer, gold_answer, type="rule based", question_mark=question_mark)
        pred_hypothesis = nlp_tg(pred_hypothesis)[0]['generated_text']
        gold_hypothesis = nlp_tg(gold_hypothesis)[0]['generated_text']
        return pred_hypothesis, gold_hypothesis

    if type == 'machine generation with translation':
        pred_hypothesis, gold_hypothesis = smoothing(
            question, pred_answer, gold_answer, type="rule based", question_mark=question_mark)

        pred_hypothesis = GoogleTranslator(source='id', target='en').translate(pred_hypothesis)
        gold_hypothesis = GoogleTranslator(source='id', target='en').translate(gold_hypothesis)

        pred_hypothesis = nlp_tg_eng(pred_hypothesis)[0]['generated_text']
        gold_hypothesis = nlp_tg_eng(gold_hypothesis)[0]['generated_text']

        pred_hypothesis = GoogleTranslator(source='en', target='id').translate(pred_hypothesis)
        gold_hypothesis = GoogleTranslator(source='en', target='id').translate(gold_hypothesis)
        return pred_hypothesis, gold_hypothesis

    builders = {
        'replace first': _smoothing_replace_first,
        'replace question mark': _smoothing_replace_question_word,
        'add adalah': _smoothing_add_adalah,
        'just concat answer and question': _smoothing_concat,
        'rule based': _smoothing_rule_based,
    }
    if type not in builders:
        raise ValueError(f"Unknown smoothing type: {type!r}")

    build = builders[type]
    return (build(question, pred_answer, question_mark),
            build(question, gold_answer, question_mark))


def _smoothing_replace_token(text, token, replacement):
    """Replace whole whitespace-delimited occurrences of `token` in `text`."""
    # Whole-token matching keeps e.g. "apa" from being cut out of "kapasitas".
    pattern = r'(?<!\S)' + re.escape(token) + r'(?!\S)'
    # A callable replacement inserts the text literally (no backslash escapes).
    return re.sub(pattern, lambda _match: replacement, text)


def _smoothing_concat(question, answer, question_mark):
    """Hypothesis made of the untouched question followed by the answer."""
    return f"{question} {answer}"


def _smoothing_replace_first(question, answer, question_mark):
    """Hypothesis where the first word of the question becomes the answer."""
    words = question.split()
    if not words:
        return _smoothing_concat(question, answer, question_mark)
    # count=1: only the leading word is replaced, not later look-alikes.
    return question.replace('?', '').replace(words[0], answer, 1)


def _smoothing_add_adalah(question, answer, question_mark):
    """Hypothesis "<question without its first word> adalah <answer>"."""
    words = question.split()
    text = question.replace('?', '')
    if words:
        text = text.replace(words[0], '', 1)
    return f"{text} adalah {answer}"


def _smoothing_replace_question_word(question, answer, question_mark):
    """Hypothesis where a recognised question word becomes the answer."""
    text = question.replace('?', '')
    hypothesis = None
    for word in question.split():
        if word in question_mark:
            # As before, the last recognised question word decides the result.
            hypothesis = _smoothing_replace_token(text, word, answer)
    if hypothesis is None:
        # No known question word: fall back instead of failing at return.
        return _smoothing_concat(question, answer, question_mark)
    return hypothesis


def _smoothing_rule_based(question, answer, question_mark):
    """Hypothesis built from a template chosen by the question word."""

    def strip_kah(match):
        # Drop the "-kah" suffix only when the stem is a known question word,
        # so ordinary words that end in "kah" are left intact.
        stem = match.group(1)
        return stem if stem in question_mark else match.group(0)

    normalized = re.sub(r'\b(\w+)kah\b', strip_kah, question)
    text = normalized.replace('?', '')
    hypothesis = None

    for word in normalized.split():
        if word not in question_mark:
            continue

        # Question with the question word removed, shared by most templates.
        rest = _smoothing_replace_token(text, word, '').lstrip()

        if word in ('siapa', 'siapakah'):
            hypothesis = f"{answer} merupakan {rest}"

        elif word in ('apa', 'apakah', 'adakah'):
            hypothesis = f"{rest} adalah {answer}"

        elif word in ('dimana', 'dimanakah'):
            hypothesis = f"{rest} di {answer}"

        elif word in ('darimana', 'darimanakah'):
            hypothesis = f"{rest} dari {answer}"

        elif word in ('kapan', 'kapankah'):
            hypothesis = f"{rest} pada {answer}"

        elif word in ('bagaimana', 'bagaimanakah'):
            hypothesis = f"{rest} adalah {answer}"

        elif word in ('kenapa', 'mengapa'):
            reason = _smoothing_replace_token(text, word, 'alasan').lstrip()
            hypothesis = f"{reason} adalah karena {answer}"

        elif word in ('berapa', 'berapakah', 'seberapa'):
            rest_tokens = rest.split()
            if 'luas' in rest_tokens:
                subject = _smoothing_replace_token(rest, 'luas', '')
                hypothesis = f"{subject} memiliki luas {answer}"
            elif 'jumlah' in rest_tokens:
                subject = _smoothing_replace_token(rest, 'jumlah', '')
                hypothesis = f"{subject} berjumlah {answer}"
            else:
                # Generic quantity question: still attach the answer.
                hypothesis = f"{rest} adalah {answer}"

    if hypothesis is None:
        # No template applied: fall back instead of failing at return.
        return _smoothing_concat(question, answer, question_mark)
    return hypothesis

In [103]:
# Try the translation-based strategy on a toy question/answer pair.
pred_hypothesis, gold_hypothesis = smoothing(
    "kenapa messi menang?",
    "mungkin karena jago",
    "tim yang unggul",
    type="machine generation with translation",
)
print(pred_hypothesis)
print(gold_hypothesis)

Kemenangan Messi mungkin karena kemahirannya di area tersebut, yang mungkin menjelaskan mengapa dia menang.
Keunggulan tim messi dalam tim menjadi alasan di balik kemenangan tersebut, demikian disampaikan sang pelatih.


In [61]:
def create_df_for_evaluation(predict_result, type_smoothing, type_qas, MAXIMUM_SEARCH_ITER=MAXIMUM_SEARCH_ITER):
    """Build the evaluation table, filtering predicted answers with the NLI classifier.

    For every example the top-ranked start/end tokens give the first predicted
    answer. Its hypothesis (built by `smoothing`) is classified against the
    context with `nlp_sc`. If the label is not accepted, the next-ranked
    start/end tokens are tried, up to rank `MAXIMUM_SEARCH_ITER`. When no
    candidate is accepted an empty answer is recorded.

    Relies on the notebook globals `tokenizer`,
    `tokenized_data_qas_id_validation`, `nlp_sc`, `tokenizer_kwargs` and `tqdm`.

    Args:
        predict_result: Object with `.predictions` (start and end logits,
            indexed as [start|end][example][token]) and `.label_ids`
            (gold start and end positions).
        type_smoothing: Strategy name forwarded to `smoothing`.
        type_qas: 'entailment only' or 'entailment or neutral'; selects the
            NLI labels that accept an answer.
        MAXIMUM_SEARCH_ITER: Highest rank that is tried. Values below 2 mean
            only the top-ranked candidate is considered.

    Returns:
        pandas.DataFrame with one row per example. The "... Before Filtering"
        columns hold single-item lists for the first candidate. The
        "... After Filtering" columns hold either that same single item or
        the list of alternatives examined; the last item is the final result.

    Raises:
        ValueError: If `type_qas` is not one of the two supported modes.
    """
    if type_qas == 'entailment only':
        accepted_labels = ('entailment',)
    elif type_qas == 'entailment or neutral':
        accepted_labels = ('entailment', 'neutral')
    else:
        raise ValueError(f"Unknown type_qas: {type_qas!r}")

    # Sort the logits once; position -k along the last axis is the token
    # index of the k-th largest logit.
    ranked_idx = np.argsort(predict_result.predictions, axis=2)
    label_array = np.asarray(predict_result.label_ids)

    question_array = []
    context_array = []

    pred_answer_before_filtering_array = []
    pred_answer_after_filtering_array = []

    label_before_filtering_array = []
    label_after_filtering_array = []

    pred_hypothesis_before_filtering_array = []
    pred_hypothesis_after_filtering_array = []

    gold_answer_array = []
    gold_hypothesis_array = []

    def decode_ranked_answer(input_ids, row, rank):
        """Decode the span between the rank-th best start and end tokens."""
        start_idx = ranked_idx[0][row][-rank]
        end_idx = ranked_idx[1][row][-rank] + 1
        return tokenizer.decode(input_ids[start_idx: end_idx], skip_special_tokens=True)

    def classify(context_text, hypothesis):
        """Run the NLI classifier on a (context, hypothesis) pair."""
        return nlp_sc({'text': context_text, 'text_pair': hypothesis}, **tokenizer_kwargs)

    # One iteration per example: retrieve the answers and filter them.
    for i in tqdm(range(len(predict_result.predictions[0]))):

        example = tokenized_data_qas_id_validation[i]
        input_ids = example['input_ids']
        token_type_ids = example['token_type_ids']

        # Top-ranked predicted answer and the gold answer.
        pred_answer = decode_ranked_answer(input_ids, i, 1)
        gold_answer = tokenizer.decode(
            input_ids[label_array[0][i]: label_array[1][i] + 1], skip_special_tokens=True)

        # token_type_ids == 0 marks question tokens, anything else context
        # tokens (following the tokenisation order).
        question_ids = [tok for tok, seg in zip(input_ids, token_type_ids) if seg == 0]
        context_ids = [tok for tok, seg in zip(input_ids, token_type_ids) if seg != 0]

        question_decoded = tokenizer.decode(question_ids, skip_special_tokens=True)
        context_decoded = tokenizer.decode(context_ids, skip_special_tokens=True)
        pred_hypothesis, gold_hypothesis = smoothing(question_decoded, pred_answer, gold_answer, type_smoothing)

        # NLI label of the first candidate against the context.
        predicted_label = classify(context_decoded, pred_hypothesis)

        pred_answer_before_filtering_array.append([pred_answer])
        pred_hypothesis_before_filtering_array.append([pred_hypothesis])
        label_before_filtering_array.append([predicted_label])

        if predicted_label['label'] in accepted_labels:
            # The first candidate is accepted: record it exactly once.
            recorded_answers = [pred_answer]
            recorded_hypotheses = [pred_hypothesis]
            recorded_labels = [predicted_label]
        else:
            # Walk down the ranking until a candidate is accepted.
            recorded_answers = []
            recorded_hypotheses = []
            recorded_labels = []
            last_label = predicted_label
            found = False

            for rank in range(2, MAXIMUM_SEARCH_ITER + 1):
                candidate_answer = decode_ranked_answer(input_ids, i, rank)
                candidate_hypothesis, _ = smoothing(
                    question_decoded, candidate_answer, gold_answer, type_smoothing)
                last_label = classify(context_decoded, candidate_hypothesis)

                recorded_answers.append(candidate_answer)
                recorded_hypotheses.append(candidate_hypothesis)
                recorded_labels.append(last_label)

                if last_label['label'] in accepted_labels:
                    found = True
                    break

            if not found:
                # Nothing was accepted: the final answer is the empty string,
                # paired with the label of the last candidate examined.
                empty_answer = ""
                empty_hypothesis, _ = smoothing(
                    question_decoded, empty_answer, gold_answer, type_smoothing)
                recorded_answers.append(empty_answer)
                recorded_hypotheses.append(empty_hypothesis)
                recorded_labels.append(last_label)

        # Exactly one "after filtering" row per example keeps all columns aligned.
        question_array.append(question_decoded)
        context_array.append(context_decoded)
        gold_answer_array.append(gold_answer)
        gold_hypothesis_array.append(gold_hypothesis)
        pred_answer_after_filtering_array.append(recorded_answers)
        pred_hypothesis_after_filtering_array.append(recorded_hypotheses)
        label_after_filtering_array.append(recorded_labels)

    # Build the QAS DataFrame
    qas_df = pd.DataFrame({'Context': context_array,
                           'Question': question_array,

                           'Prediction Answer Before Filtering': pred_answer_before_filtering_array,
                           'Prediction Hypothesis Before Filtering': pred_hypothesis_before_filtering_array,
                           'Label Before Filtering': label_before_filtering_array,

                           'Prediction Answer After Filtering': pred_answer_after_filtering_array,
                           'Prediction Hypothesis After Filtering': pred_hypothesis_after_filtering_array,
                           'Label After Filtering': label_after_filtering_array,

                           'Gold Answer': gold_answer_array,
                           'Gold Hypothesis': gold_hypothesis_array})

    assert len(predict_result.predictions[0]) == len(qas_df), "Jumlah prediksi berbeda dengan jumlah evaluasi"

    # Return the QAS DataFrame
    return qas_df

In [62]:
# Build the evaluation table using only the top-2 candidates per example.
eval_df = create_df_for_evaluation(
    predict_result,
    type_smoothing='replace first',
    type_qas='entailment only',
    MAXIMUM_SEARCH_ITER=2,
)
eval_df

  2%|█▍                                                                                | 13/764 [00:32<31:05,  2.48s/it]


KeyboardInterrupt: 

In [None]:
# Spot-check the question of the row labelled 2.
eval_df['Question'][2]

In [None]:
# Spot-check the context of the row labelled 2.
eval_df['Context'][2]

In [None]:
# Spot-check the recorded post-filtering answers of the row labelled 2.
eval_df['Prediction Answer After Filtering'][2]

In [None]:
def compute_metrics_from_df(df, type_qas):
    """Compute exact match, F1 and answer/NLI-label agreement counts.

    For each row the last item of the recorded lists is taken as the final
    answer and label. Exact match and F1 (via `compute_f1_prec_rec`) are
    computed on the answer after filtering. The agreement counts combine
    "answer equals gold" with "NLI label is accepted":

        accepted label and correct answer   -> true positive
        accepted label and wrong answer     -> false positive
        rejected label and correct answer   -> false negative
        rejected label and wrong answer     -> true negative

    Args:
        df: Table with the columns "Prediction Answer Before Filtering",
            "Prediction Answer After Filtering", "Label Before Filtering",
            "Label After Filtering" and "Gold Answer", indexed 0..len(df)-1.
        type_qas: 'entailment only' or 'entailment or neutral'. Any other
            value leaves all agreement counts at zero.

    Returns:
        Tuple of
        `{'exact_match': float, 'f1': float}` (both in percent),
        `[tp, fp, fn, tn]` after filtering and
        `[tp, fp, fn, tn]` before filtering.
    """
    accepted_labels = {
        'entailment only': ('entailment',),
        'entailment or neutral': ('entailment', 'neutral'),
    }.get(type_qas)

    def tally(counts, is_correct, is_accepted):
        """Increment the [tp, fp, fn, tn] slot matching the two flags."""
        if is_accepted:
            counts[0 if is_correct else 1] += 1
        else:
            counts[2 if is_correct else 3] += 1

    denominator = len(df)
    total_correct = 0
    f1_array = []

    before_filtering_metric_array = [0, 0, 0, 0]
    after_filtering_metric_array = [0, 0, 0, 0]

    for i in range(len(df)):

        pred_answer_before_filtering = df["Prediction Answer Before Filtering"][i][-1]
        pred_answer_after_filtering = df["Prediction Answer After Filtering"][i][-1]

        pred_label_before_filtering = df["Label Before Filtering"][i][-1]['label']
        pred_label_after_filtering = df["Label After Filtering"][i][-1]['label']

        gold_text = df["Gold Answer"][i]

        if pred_answer_after_filtering == gold_text:
            total_correct += 1

        f1_array.append(compute_f1_prec_rec(pred=pred_answer_after_filtering, gold=gold_text))

        if accepted_labels is None:
            continue

        # Each side is judged with its own answer AND its own label.
        tally(after_filtering_metric_array,
              pred_answer_after_filtering == gold_text,
              pred_label_after_filtering in accepted_labels)
        tally(before_filtering_metric_array,
              pred_answer_before_filtering == gold_text,
              pred_label_before_filtering in accepted_labels)

    # An empty table yields 0.0 scores instead of a division error / NaN.
    exact_match = ((total_correct / denominator) * 100.0) if denominator else 0.0
    final_f1 = (np.mean(f1_array) * 100.0) if f1_array else 0.0

    return {'exact_match': exact_match, 'f1': final_f1}, after_filtering_metric_array, before_filtering_metric_array

In [None]:
# Score the filtered predictions and keep both agreement-count arrays.
(metric_result_after_filtering,
 after_filtering_metric_array,
 before_filtering_metric_array) = compute_metrics_from_df(eval_df, "entailment only")
metric_result_after_filtering

In [None]:
def convert_to_non_zero(number):
    """Return `number`, nudged to `sys.float_info.min` when it equals zero.

    Lets the result be used as a divisor without a ZeroDivisionError.
    """
    return number + sys.float_info.min if number == 0 else number

def compute_f1_prec_rec_whole(metric_array):
    """Derive accuracy, precision, recall and F1 from confusion counts.

    Args:
        metric_array: Sequence `[tp, fp, fn, tn]`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. A ratio whose denominator
        is zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    true_pos = metric_array[0]
    false_pos = metric_array[1]
    false_neg = metric_array[2]
    true_neg = metric_array[3]

    def safe_ratio(numerator, denominator):
        # Undefined ratios (nothing to divide by) collapse to 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = safe_ratio(true_pos + true_neg,
                          true_pos + false_pos + false_neg + true_neg)

    precision = safe_ratio(true_pos, true_pos + false_pos)

    recall = safe_ratio(true_pos, true_pos + false_neg)

    f1 = safe_ratio(2 * precision * recall, precision + recall)

    return accuracy, precision, recall, f1

def diff_verbose_metric(metric_result_before, metric_result_after, metric):
    """Print and return the relative change of a metric in percent.

    Args:
        metric_result_before: Value before NLI filtering (the baseline).
        metric_result_after: Value after NLI filtering.
        metric: Display name; names containing '&' are announced as "nilai",
            all others as "metrik".

    Returns:
        Percentage change rounded to two decimals. With a zero baseline the
        change is 0.0 when both values are zero and +/-inf otherwise, instead
        of raising ZeroDivisionError.
    """
    if metric_result_before == 0:
        # A relative change is undefined against a zero baseline.
        if metric_result_after == 0:
            percentage = 0.0
        elif metric_result_after > 0:
            percentage = float('inf')
        else:
            percentage = float('-inf')
    else:
        percentage = round(((metric_result_after - metric_result_before) / metric_result_before) * 100, 2)

    if '&' in metric: vocab = "nilai"
    else: vocab = "metrik"

    if metric_result_before == metric_result_after:
        print(f"Hasil {vocab} {metric} sebelum filtering NLI SAMA DENGAN metrik setelah filtering NLI")
    elif metric_result_before < metric_result_after:
        print(f"Hasil {vocab} {metric} setelah filtering NLI mengalami KENAIKAN sebesar: {percentage} %")
    elif metric_result_before > metric_result_after:
        print(f"Hasil {vocab} {metric} setelah filtering NLI mengalami PENURUNAN sebesar: {-1 * percentage} %")

    return percentage

In [None]:
def compare_metrics(metrics_before, metrics_after,
                    after_filtering_metric_array=None,
                    before_filtering_metric_array=None):
    """Print a before/after NLI-filtering comparison of all metrics.

    Args:
        metrics_before: Dict with 'exact_match' and 'f1' before filtering.
        metrics_after: Dict with 'exact_match' and 'f1' after filtering.
        after_filtering_metric_array: `[tp, fp, fn, tn]` after filtering.
            When omitted, the notebook-global variable of the same name is
            read at call time.
        before_filtering_metric_array: `[tp, fp, fn, tn]` before filtering,
            with the same fallback.
    """
    # Resolve the defaults at call time: binding them in the signature would
    # freeze whatever the globals held when this cell was last executed.
    if after_filtering_metric_array is None:
        after_filtering_metric_array = globals()['after_filtering_metric_array']
    if before_filtering_metric_array is None:
        before_filtering_metric_array = globals()['before_filtering_metric_array']

    def report_change(before, after, name):
        # Zero values are nudged away from 0 so the relative change is defined.
        diff_verbose_metric(convert_to_non_zero(before), convert_to_non_zero(after), name)

    count_names = ["Jawaban benar & label NLI yang sesuai",
                   "Jawaban TIDAK benar & label NLI yang sesuai",
                   "Jawaban benar & label NLI yang TIDAK sesuai",
                   "Jawaban TIDAK benar & label NLI yang TIDAK sesuai"]

    em_before = metrics_before['exact_match']
    f1_before = metrics_before['f1']

    print("~ METRIK PER TOKEN ~")
    print(f"Skor Exact Match sebelum filtering NLI: {em_before}")
    print(f"Skor F1 sebelum filtering NLI: {f1_before}")
    print()

    em_after = metrics_after['exact_match']
    f1_after = metrics_after['f1']

    print(f"Skor Exact Match setelah filtering NLI: {em_after}")
    print(f"Skor F1 setelah filtering NLI: {f1_after}")
    print()

    print("~ METRIK DENGAN PARAMETER NLI ~")
    for position, name in enumerate(count_names):
        print(f"[BEFORE FILTERING] {name}: {before_filtering_metric_array[position]}")
    print()

    for position, name in enumerate(count_names):
        print(f"[AFTER FILTERING] {name}: {after_filtering_metric_array[position]}")
    print()

    print("Metrik di atas, bisa direpresentasikan menjadi:")

    acc_before_whole, prec_before_whole, rec_before_whole, f1_before_whole = compute_f1_prec_rec_whole(
        before_filtering_metric_array)
    acc_after_whole, prec_after_whole, rec_after_whole, f1_after_whole = compute_f1_prec_rec_whole(
        after_filtering_metric_array)

    print(f"[BEFORE FILTERING] Akurasi: {acc_before_whole}")
    print(f"[BEFORE FILTERING] Precision: {prec_before_whole}")
    print(f"[BEFORE FILTERING] Recall: {rec_before_whole}")
    print(f"[BEFORE FILTERING] F1: {f1_before_whole}")
    print()

    print(f"[AFTER FILTERING] Akurasi: {acc_after_whole}")
    print(f"[AFTER FILTERING] Precision: {prec_after_whole}")
    print(f"[AFTER FILTERING] Recall: {rec_after_whole}")
    print(f"[AFTER FILTERING] F1: {f1_after_whole}")
    print()

    print("--- Persentase perubahan hasil metrik ---")
    print("~ METRIK PER TOKEN ~")
    report_change(em_before, em_after, "Exact Match")
    report_change(f1_before, f1_after, "F1")
    print()

    print("~ METRIK DENGAN PARAMETER NLI ~")
    for position, name in enumerate(count_names):
        report_change(before_filtering_metric_array[position],
                      after_filtering_metric_array[position], name)
    print()

    print("Metrik di atas, bisa direpresentasikan menjadi:")
    report_change(acc_before_whole, acc_after_whole, "Akurasi")
    report_change(prec_before_whole, prec_after_whole, "Precision")
    report_change(rec_before_whole, rec_after_whole, "Recall")
    report_change(f1_before_whole, f1_after_whole, "F1")
    print()

In [None]:
# Print the before/after comparison; the agreement-count arrays are left to compare_metrics' defaults.
compare_metrics(metric_result_before_filtering, metric_result_after_filtering)

In [None]:
# Create the directory that is actually written to (not just its parent).
os.makedirs(ACCURACY_DIR, exist_ok=True)
# Redirect everything compare_metrics prints into the report file; the
# `with` block closes the file on exit.
with open(f'{ACCURACY_DIR}/metric_comparison_results.txt', "w") as f, contextlib.redirect_stdout(f):
    compare_metrics(metric_result_before_filtering, metric_result_after_filtering)

In [None]:
# Spot-check the context of the row labelled 2.
eval_df["Context"][2]

In [None]:
# Display the evaluation table.
eval_df

In [None]:
from IPython.display import display

In [None]:
def _percentage(part, total):
    """Return ``part`` as a percentage of ``total``, rounded to two decimals.

    Returns 0 when ``total`` is 0 so that an empty category (or an empty
    DataFrame) does not raise ``ZeroDivisionError``.
    """
    return total and round(part / total * 100, 2)


def _bucket_label(value, buckets):
    """Return the label of the first ``(label, upper_bound)`` bucket holding ``value``.

    ``buckets`` must be ordered by ascending ``upper_bound``; an ``upper_bound``
    of ``None`` marks the open-ended last bucket.

    Raises:
        ValueError: if no bucket matches (i.e. the open-ended bucket is missing).
    """
    for label, upper_bound in buckets:
        if upper_bound is None or value <= upper_bound:
            return label
    raise ValueError(f"no bucket found for value {value!r}")


def _print_breakdown(section, item_prefix, accuracy_phrase, counts, total):
    """Print one report section: right counts, wrong counts and accuracy.

    Args:
        section: name interpolated into the section headers.
        item_prefix: text printed before every category label.
        accuracy_phrase: text printed between the label and the accuracy value.
        counts: mapping ``label -> [right, wrong]``; printed in insertion order.
        total: number of evaluated rows, used as denominator for the shares.
    """
    print(f"--- Bagian tentang {section} ---")
    for column, verdict in enumerate(("BENAR", "SALAH")):
        print(f"-- Bagian tentang {section} yang terprediksi {verdict} --")
        for label, pair in counts.items():
            print(f"{item_prefix} {label}: {pair[column]}, sebesar: {_percentage(pair[column], total)} %")
        print()
    print("-- Presentase kebenaran --")
    for label, (right, wrong) in counts.items():
        print(f"{item_prefix} {label} {accuracy_phrase}: {_percentage(right, right + wrong)} %")
    print()


def general_evaluation(df):
    """Print an error-analysis report of the filtered predictions and sample rows.

    Every row is marked right when the last element of its
    "Prediction Answer After Filtering" value equals its "Gold Answer", and
    wrong otherwise. Rows are then tallied along four dimensions: question
    type (first matching question word among the whitespace-separated tokens
    of "Question"), and the whitespace-token length of "Context", "Question"
    and "Gold Answer".

    Args:
        df: DataFrame with the columns "Prediction Answer After Filtering"
            (indexable, last element is used), "Gold Answer", "Question" and
            "Context" (the last three must support ``str.split``).

    Returns:
        A reproducible random sample (``random_state=42``) of at most 15 rows
        of ``df``; it is also rendered with ``display`` for manual
        reasoning-type analysis.
    """
    # Question-word groups in priority order: the first group having any of its
    # tokens in the question decides the type (exact, case-sensitive match).
    question_types = (
        ("APA", ("Apa", "Apakah", "apa", "apakah")),
        ("DIMANA", ("Dimana", "dimana", "mana")),
        ("KAPAN", ("Kapan", "kapan")),
        ("SIAPA", ("Siapa", "siapa")),
        ("BAGAIMANA", ("Bagaimana", "bagaimana")),
        ("KENAPA", ("Mengapa", "Kenapa", "mengapa", "kenapa")),
        ("BERAPA", ("Berapa", "Berapakah", "berapa", "berapakah")),
    )
    other_type = "LAINNYA"

    # (label, inclusive upper bound) pairs; None marks the open-ended bucket.
    context_buckets = (
        ("<= 100", 100),
        ("101 <= x <= 150", 150),
        ("151 <= x <= 200", 200),
        ("201 <= x <= 250", 250),
        ("251 <= x <= 300", 300),
        ("> 300", None),
    )
    question_buckets = (
        ("1 <= x <= 5", 5),
        ("6 <= x <= 10", 10),
        ("11 <= x <= 15", 15),
        ("16 <= x <= 20", 20),
        ("> 20", None),
    )
    # Empty gold answers get their own bucket, which must be tested first;
    # otherwise the "<= 5" bucket would swallow them.
    answer_buckets = (("x = 0", 0),) + question_buckets

    # Every counter is a [right, wrong] pair.
    type_counts = {label: [0, 0] for label, _ in question_types}
    type_counts[other_type] = [0, 0]
    context_counts = {label: [0, 0] for label, _ in context_buckets}
    question_length_counts = {label: [0, 0] for label, _ in question_buckets}
    answer_length_counts = {label: [0, 0] for label, _ in answer_buckets}

    # Tally every row by outcome. Iterating over the column values (instead of
    # looking rows up by label 0..len-1) also works for a non-default index.
    rows = zip(df["Prediction Answer After Filtering"], df["Gold Answer"],
               df["Question"], df["Context"])
    for predictions, gold_text, question, context in rows:
        outcome = 0 if predictions[-1] == gold_text else 1  # 0 = right, 1 = wrong

        question_tokens = question.split()
        question_type = next(
            (label for label, keywords in question_types
             if any(keyword in question_tokens for keyword in keywords)),
            other_type,
        )

        type_counts[question_type][outcome] += 1
        context_counts[_bucket_label(len(context.split()), context_buckets)][outcome] += 1
        question_length_counts[_bucket_label(len(question_tokens), question_buckets)][outcome] += 1
        answer_length_counts[_bucket_label(len(gold_text.split()), answer_buckets)][outcome] += 1

    # Sanity check: every row must be counted exactly once per dimension.
    total = len(df)
    for counts in (type_counts, context_counts, question_length_counts, answer_length_counts):
        assert total == sum(right + wrong for right, wrong in counts.values())

    # Draw a few examples (from all rows, right and wrong) so their reasoning
    # type can be studied by hand; cap the size so small frames do not raise.
    new_df = df.sample(n=min(15, total), random_state=42)

    _print_breakdown("question type", "Banyak pertanyaan",
                     "yang terpediksi benar sebesar", type_counts, total)
    _print_breakdown("panjang context", "Panjang konteks",
                     "yang terprediksi benar sebesar", context_counts, total)
    _print_breakdown("panjang question", "Panjang question",
                     "yang terprediksi benar sebesar", question_length_counts, total)
    _print_breakdown("panjang gold answer", "Panjang gold answer",
                     "yang terprediksi benar sebesar", answer_length_counts, total)

    print("--- Bagian untuk analisis REASONING TYPE ---")
    display(new_df)
    return new_df

In [None]:
# Run the general error analysis on eval_df; the returned sample is shown as the cell output.
general_evaluation(eval_df)

In [None]:
def breakdown_evaluation(df, TYPE_QAS):
    
    if TYPE_QAS == 'entailment only': compatible_label = ['entailment']
    elif TYPE_QAS == 'entailment or neutral': compatible_label = ['entailment', 'neutral']

    exist_true_answer_label_entailment = 0
    exist_true_answer_label_neutral = 0
    exist_true_answer_label_contradiction = 0

    exist_false_answer_label_entailment = 0
    exist_false_answer_label_neutral = 0
    exist_false_answer_label_contradiction = 0

    no_exist_true_answer_label_entailment = 0
    no_exist_true_answer_label_neutral = 0
    no_exist_true_answer_label_contradiction = 0

    no_exist_false_answer_label_entailment = 0
    no_exist_false_answer_label_neutral = 0
    no_exist_false_answer_label_contradiction = 0

    filtered_in_right_answer_to_filtered_in_right_answer = 0
    filtered_in_right_answer_to_filtered_in_wrong_answer = 0
    filtered_in_right_answer_to_filtered_out_right_answer = 0
    filtered_in_right_answer_to_filtered_out_wrong_answer = 0

    filtered_in_wrong_answer_to_filtered_in_right_answer = 0
    filtered_in_wrong_answer_to_filtered_in_wrong_answer = 0
    filtered_in_wrong_answer_to_filtered_out_right_answer = 0
    filtered_in_wrong_answer_to_filtered_out_wrong_answer = 0

    filtered_out_right_answer_to_filtered_in_right_answer = 0
    filtered_out_right_answer_to_filtered_in_wrong_answer = 0
    filtered_out_right_answer_to_filtered_out_right_answer = 0
    filtered_out_right_answer_to_filtered_out_wrong_answer = 0

    filtered_out_wrong_answer_to_filtered_in_right_answer = 0
    filtered_out_wrong_answer_to_filtered_in_wrong_answer = 0
    filtered_out_wrong_answer_to_filtered_out_right_answer = 0
    filtered_out_wrong_answer_to_filtered_out_wrong_answer = 0

    filtered_in_right_answer_to_filtered_in_right_answer_unanswered = 0
    filtered_in_right_answer_to_filtered_in_wrong_answer_unanswered = 0
    filtered_in_right_answer_to_filtered_out_right_answer_unanswered = 0
    filtered_in_right_answer_to_filtered_out_wrong_answer_unanswered = 0

    filtered_in_wrong_answer_to_filtered_in_right_answer_unanswered = 0
    filtered_in_wrong_answer_to_filtered_in_wrong_answer_unanswered = 0
    filtered_in_wrong_answer_to_filtered_out_right_answer_unanswered = 0
    filtered_in_wrong_answer_to_filtered_out_wrong_answer_unanswered = 0

    filtered_out_right_answer_to_filtered_in_right_answer_unanswered = 0
    filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered = 0
    filtered_out_right_answer_to_filtered_out_right_answer_unanswered = 0
    filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered = 0

    filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered = 0
    filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered = 0
    filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered = 0
    filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered = 0

    filtered_score_labels_before_filtering = []
    filtered_score_labels_after_filtering = []

    for i in range(len(df)):

        pred_answer_before_filtering = df["Prediction Answer Before Filtering"][i][-1]
        pred_answer_after_filtering = df["Prediction Answer After Filtering"][i][-1]

        pred_label_before_filtering = df["Label Before Filtering"][i][-1]['label']
        pred_label_after_filtering = df["Label After Filtering"][i][-1]['label']

        pred_prob_dist_before_filtering = df["Label Before Filtering"][i][-1]['score']
        pred_prob_dist_after_filtering = df["Label After Filtering"][i][-1]['score']

        gold_text = df["Gold Answer"][i]

        # Bagian untuk jawaban sebelum filtering SAMA DENGAN ground truth

        if (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'entailment') \
                and (pred_answer_before_filtering != ""): 
            exist_true_answer_label_entailment += 1

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_right_answer_to_filtered_in_right_answer += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_right_answer_to_filtered_in_wrong_answer += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_right_answer_to_filtered_out_right_answer += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_right_answer_to_filtered_out_wrong_answer += 1

        elif (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'neutral') \
                and (pred_answer_before_filtering != ""): 
            exist_true_answer_label_neutral += 1

            if (TYPE_QAS == 'entailment only'):

                filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_right_answer_to_filtered_in_right_answer += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_right_answer_to_filtered_in_wrong_answer += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_right_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_right_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

            elif (TYPE_QAS == 'entailment or neutral'):
                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_right_answer_to_filtered_in_right_answer += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_right_answer_to_filtered_in_wrong_answer += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_right_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_right_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'contradiction') \
                and (pred_answer_before_filtering != ""): 
            exist_true_answer_label_contradiction += 1

            filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_right_answer_to_filtered_in_right_answer += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_right_answer_to_filtered_in_wrong_answer += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_right_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_right_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        # Bagian untuk jawaban sebelum filtering BERBEDA DENGAN ground truth

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'entailment') \
                and (pred_answer_before_filtering != ""):
            exist_false_answer_label_entailment += 1

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_in_right_answer += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_in_wrong_answer += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'neutral') \
                and (pred_answer_before_filtering != ""):
            exist_false_answer_label_neutral += 1

            if (TYPE_QAS == 'entailment only'):

                filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_in_right_answer += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_in_wrong_answer += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

            elif (TYPE_QAS == 'entailment or neutral'):
                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_in_right_answer += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_in_wrong_answer += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'contradiction') \
                and (pred_answer_before_filtering != ""):
            exist_false_answer_label_contradiction += 1

            filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_in_right_answer += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_in_wrong_answer += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_out_right_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_out_wrong_answer += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        # Bagian untuk jawaban sebelum filtering SAMA DENGAN ground truth (unanswered)

        elif (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'entailment') \
                and (pred_answer_before_filtering == ""): 
            no_exist_true_answer_label_entailment += 1

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_right_answer_to_filtered_in_right_answer_unanswered += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_right_answer_to_filtered_in_wrong_answer_unanswered += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_right_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_right_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'neutral') \
                and (pred_answer_before_filtering == ""): 
            no_exist_true_answer_label_neutral += 1

            if (TYPE_QAS == 'entailment only'):

                filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_right_answer_to_filtered_in_right_answer_unanswered += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_right_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

            elif (TYPE_QAS == 'entailment or neutral'):
                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_right_answer_to_filtered_in_right_answer_unanswered += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_right_answer_to_filtered_in_wrong_answer_unanswered += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_right_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_right_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering == gold_text) and (pred_label_before_filtering == 'contradiction') \
                and (pred_answer_before_filtering == ""): 
            no_exist_true_answer_label_contradiction += 1

            filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_right_answer_to_filtered_in_right_answer_unanswered += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_right_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        # Bagian untuk jawaban sebelum filtering BERBEDA DENGAN ground truth (unanswered)

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'entailment') \
                and (pred_answer_before_filtering == ""):
            no_exist_false_answer_label_entailment += 1

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_in_right_answer_unanswered += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_in_wrong_answer_unanswered += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_in_wrong_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'neutral') \
                and (pred_answer_before_filtering == ""):
            no_exist_false_answer_label_neutral += 1

            if (TYPE_QAS == 'entailment only'):

                filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

            elif (TYPE_QAS == 'entailment or neutral'):
                if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_in_right_answer_unanswered += 1
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_in_wrong_answer_unanswered += 1
                elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
                elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                        : filtered_in_wrong_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        elif (pred_answer_before_filtering != gold_text) and (pred_label_before_filtering == 'contradiction') \
                and (pred_answer_before_filtering == ""):
            no_exist_false_answer_label_contradiction += 1

            filtered_score_labels_before_filtering.append(pred_prob_dist_before_filtering)

            if (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered += 1
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered += 1
            elif (pred_answer_after_filtering == gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)
            elif (pred_answer_after_filtering != gold_text) and (pred_label_after_filtering not in compatible_label) \
                    : filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered += 1; filtered_score_labels_after_filtering.append(pred_prob_dist_after_filtering)

        #print(f"Pred answer before filtering: {pred_answer_before_filtering}")
        #print(f"Pred answer after filtering: {pred_answer_after_filtering}")
        #print(f"Gold answer: {gold_text}")
        #print()

    print(f"--- Bagian ini hanya memperhatikan sebelum filtering ---")
    print(f"Jawaban benar (answer exist) entailment: {exist_true_answer_label_entailment}, sebesar: {round(exist_true_answer_label_entailment/len(df) * 100, 2)} %")
    print(f"Jawaban benar (answer exist) neutral: {exist_true_answer_label_neutral}, sebesar: {round(exist_true_answer_label_neutral/len(df) * 100, 2)} %")
    print(f"Jawaban benar (answer exist) contradiction: {exist_true_answer_label_contradiction}, sebesar: {round(exist_true_answer_label_contradiction/len(df) * 100, 2)} %")
    print()
    print(f"Jawaban salah (answer exist) entailment: {exist_false_answer_label_entailment}, sebesar: {round(exist_false_answer_label_entailment/len(df) * 100, 2)} %")
    print(f"Jawaban salah (answer exist) neutral: {exist_false_answer_label_neutral}, sebesar: {round(exist_false_answer_label_neutral/len(df) * 100, 2)} %")
    print(f"Jawaban salah (answer exist) contradiction: {exist_false_answer_label_contradiction}, sebesar: {round(exist_false_answer_label_contradiction/len(df) * 100, 2)} %")
    print()
    print(f"Jawaban benar (answer DO NOT exist) entailment: {no_exist_true_answer_label_entailment}, sebesar: {round(no_exist_true_answer_label_entailment/len(df) * 100, 2)} %")
    print(f"Jawaban benar (answer DO NOT exist) neutral: {no_exist_true_answer_label_neutral}, sebesar: {round(no_exist_true_answer_label_neutral/len(df) * 100, 2)} %")
    print(f"Jawaban benar (answer DO NOT exist) contradiction: {no_exist_true_answer_label_contradiction}, sebesar: {round(no_exist_true_answer_label_contradiction/len(df) * 100, 2)} %")
    print()
    print(f"Jawaban salah (answer DO NOT exist) entailment: {no_exist_false_answer_label_entailment}, sebesar: {round(no_exist_false_answer_label_entailment/len(df) * 100, 2)} %")
    print(f"Jawaban salah (answer DO NOT exist) neutral: {no_exist_false_answer_label_neutral}, sebesar: {round(no_exist_false_answer_label_neutral/len(df) * 100, 2)} %")
    print(f"Jawaban salah (answer DO NOT exist) contradiction: {no_exist_false_answer_label_contradiction}, sebesar: {round(no_exist_false_answer_label_contradiction/len(df) * 100, 2)} %")
    print()

    print(f"--- Bagian ini memperhatikan sebelum filtering dan setelah filtering ---")

    """
    print(f"Banyaknya data sebelum filtering: BENAR & LOLOS, setelah filtering: BENAR & LOLOS: {filtered_in_right_answer_to_filtered_in_right_answer}, sebesar: {round(filtered_in_right_answer_to_filtered_in_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & LOLOS, setelah filtering: SALAH & LOLOS: {filtered_in_right_answer_to_filtered_in_wrong_answer}, sebesar: {round(filtered_in_right_answer_to_filtered_in_wrong_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & LOLOS, setelah filtering: BENAR & TERFILTER: {filtered_in_right_answer_to_filtered_out_right_answer}, sebesar: {round(filtered_in_right_answer_to_filtered_out_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & LOLOS, setelah filtering: SALAH & TERFILTER: {filtered_in_right_answer_to_filtered_out_wrong_answer}, sebesar: {round(filtered_in_right_answer_to_filtered_out_wrong_answer/len(df) * 100, 2)} %")
    print()

    print(f"Banyaknya data sebelum filtering: SALAH & LOLOS, setelah filtering: BENAR & LOLOS: {filtered_in_wrong_answer_to_filtered_in_right_answer}, sebesar: {round(filtered_in_wrong_answer_to_filtered_in_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & LOLOS, setelah filtering: SALAH & LOLOS: {filtered_in_wrong_answer_to_filtered_in_wrong_answer}, sebesar: {round(filtered_in_wrong_answer_to_filtered_in_wrong_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & LOLOS, setelah filtering: BENAR & TERFILTER: {filtered_in_wrong_answer_to_filtered_out_right_answer}, sebesar: {round(filtered_in_wrong_answer_to_filtered_out_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & LOLOS, setelah filtering: SALAH & TERFILTER: {filtered_in_wrong_answer_to_filtered_out_wrong_answer}, sebesar: {round(filtered_in_wrong_answer_to_filtered_out_wrong_answer/len(df) * 100, 2)} %")
    print()
    """
    print(f"Banyaknya data sebelum filtering: BENAR & TERFILTER, setelah filtering: BENAR & LOLOS: {filtered_out_right_answer_to_filtered_in_right_answer}, sebesar: {round(filtered_out_right_answer_to_filtered_in_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & TERFILTER, setelah filtering: SALAH & LOLOS: {filtered_out_right_answer_to_filtered_in_wrong_answer}, sebesar: {round(filtered_out_right_answer_to_filtered_in_wrong_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & TERFILTER, setelah filtering: BENAR & TERFILTER: {filtered_out_right_answer_to_filtered_out_right_answer}, sebesar: {round(filtered_out_right_answer_to_filtered_out_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: BENAR & TERFILTER, setelah filtering: SALAH & TERFILTER: {filtered_out_right_answer_to_filtered_out_wrong_answer}, sebesar: {round(filtered_out_right_answer_to_filtered_out_wrong_answer/len(df) * 100, 2)} %")
    print()

    print(f"Banyaknya data sebelum filtering: SALAH & TERFILTER, setelah filtering: BENAR & LOLOS: {filtered_out_wrong_answer_to_filtered_in_right_answer}, sebesar: {round(filtered_out_wrong_answer_to_filtered_in_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & TERFILTER, setelah filtering: SALAH & LOLOS: {filtered_out_wrong_answer_to_filtered_in_wrong_answer}, sebesar: {round(filtered_out_wrong_answer_to_filtered_in_wrong_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & TERFILTER, setelah filtering: BENAR & TERFILTER: {filtered_out_wrong_answer_to_filtered_out_right_answer}, sebesar: {round(filtered_out_wrong_answer_to_filtered_out_right_answer/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering: SALAH & TERFILTER, setelah filtering: SALAH & TERFILTER: {filtered_out_wrong_answer_to_filtered_out_wrong_answer}, sebesar: {round(filtered_out_wrong_answer_to_filtered_out_wrong_answer/len(df) * 100, 2)} %")
    print()
    """
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & LOLOS, setelah filtering: BENAR & LOLOS: {filtered_in_right_answer_to_filtered_in_right_answer_unanswered}, sebesar: {round(filtered_in_right_answer_to_filtered_in_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & LOLOS, setelah filtering: SALAH & LOLOS: {filtered_in_right_answer_to_filtered_in_wrong_answer_unanswered}, sebesar: {round(filtered_in_right_answer_to_filtered_in_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & LOLOS, setelah filtering: BENAR & TERFILTER: {filtered_in_right_answer_to_filtered_out_right_answer_unanswered}, sebesar: {round(filtered_in_right_answer_to_filtered_out_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & LOLOS, setelah filtering: SALAH & TERFILTER: {filtered_in_right_answer_to_filtered_out_wrong_answer_unanswered}, sebesar: {round(filtered_in_right_answer_to_filtered_out_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print()

    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & LOLOS, setelah filtering: BENAR & LOLOS: {filtered_in_wrong_answer_to_filtered_in_right_answer_unanswered}, sebesar: {round(filtered_in_wrong_answer_to_filtered_in_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & LOLOS, setelah filtering: SALAH & LOLOS: {filtered_in_wrong_answer_to_filtered_in_wrong_answer_unanswered}, sebesar: {round(filtered_in_wrong_answer_to_filtered_in_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & LOLOS, setelah filtering: BENAR & TERFILTER: {filtered_in_wrong_answer_to_filtered_out_right_answer_unanswered}, sebesar: {round(filtered_in_wrong_answer_to_filtered_out_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & LOLOS, setelah filtering: SALAH & TERFILTER: {filtered_in_wrong_answer_to_filtered_out_wrong_answer_unanswered}, sebesar: {round(filtered_in_wrong_answer_to_filtered_out_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print()
    """
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & TERFILTER, setelah filtering: BENAR & LOLOS: {filtered_out_right_answer_to_filtered_in_right_answer_unanswered}, sebesar: {round(filtered_out_right_answer_to_filtered_in_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & TERFILTER, setelah filtering: SALAH & LOLOS: {filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered}, sebesar: {round(filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & TERFILTER, setelah filtering: BENAR & TERFILTER: {filtered_out_right_answer_to_filtered_out_right_answer_unanswered}, sebesar: {round(filtered_out_right_answer_to_filtered_out_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): BENAR & TERFILTER, setelah filtering: SALAH & TERFILTER: {filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered}, sebesar: {round(filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print()

    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & TERFILTER, setelah filtering: BENAR & LOLOS: {filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered}, sebesar: {round(filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & TERFILTER, setelah filtering: SALAH & LOLOS: {filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered}, sebesar: {round(filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & TERFILTER, setelah filtering: BENAR & TERFILTER: {filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered}, sebesar: {round(filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered/len(df) * 100, 2)} %")
    print(f"Banyaknya data sebelum filtering (unanswered): SALAH & TERFILTER, setelah filtering: SALAH & TERFILTER: {filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered}, sebesar: {round(filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered/len(df) * 100, 2)} %")
    print()

    print("-- Pada pengecekan filtering awal: --")
    if TYPE_QAS == 'entailment only':
        accept_right = (exist_true_answer_label_entailment) \
            / (exist_true_answer_label_entailment + exist_true_answer_label_neutral + exist_true_answer_label_contradiction)
        reject_wrong = (exist_false_answer_label_neutral + exist_false_answer_label_contradiction) \
            / (exist_false_answer_label_entailment + exist_false_answer_label_neutral + exist_false_answer_label_contradiction)
        print(f"Berhasil menerima {round(accept_right * 100, 2)} % jawaban yang benar (answer exist)")
        print(f"Berhasil menolak {round(reject_wrong * 100, 2)} % jawaban yang salah (answer exist)") 
        print()

        no_exist_accept_right = (no_exist_true_answer_label_entailment) \
            / (no_exist_true_answer_label_entailment + no_exist_true_answer_label_neutral + no_exist_true_answer_label_contradiction)
        no_exist_reject_wrong = (no_exist_false_answer_label_neutral + no_exist_false_answer_label_contradiction) \
            / (no_exist_false_answer_label_entailment + no_exist_false_answer_label_neutral + no_exist_false_answer_label_contradiction)
        print(f"Berhasil menerima {round(no_exist_accept_right * 100, 2)} % jawaban yang benar (answer DO NOT exist)")
        print(f"Berhasil menolak {round(no_exist_reject_wrong * 100, 2)} % jawaban yang salah (answer DO NOT exist)") 
        print()

    elif TYPE_QAS == 'entailment or neutral':
        accept_right = (exist_true_answer_label_entailment + exist_true_answer_label_neutral) \
            / (exist_true_answer_label_entailment + exist_true_answer_label_neutral + exist_true_answer_label_contradiction)
        reject_wrong = (exist_false_answer_label_contradiction) \
            / (exist_false_answer_label_entailment + exist_false_answer_label_neutral + exist_false_answer_label_contradiction)
        print(f"Berhasil menerima {round(accept_right * 100, 2)} % jawaban yang benar (answer exist)")
        print(f"Berhasil menolak {round(reject_wrong * 100, 2)} % jawaban yang salah (answer exist)") 
        print()

        no_exist_accept_right = (no_exist_true_answer_label_entailment + no_exist_true_answer_label_neutral) \
            / (no_exist_true_answer_label_entailment + no_exist_true_answer_label_neutral + no_exist_true_answer_label_contradiction)
        no_exist_reject_wrong = (no_exist_false_answer_label_contradiction) \
            / (no_exist_false_answer_label_entailment + no_exist_false_answer_label_neutral + no_exist_false_answer_label_contradiction)
        print(f"Berhasil menerima {round(no_exist_accept_right * 100, 2)} % jawaban yang benar (answer DO NOT exist)")
        print(f"Berhasil menolak {round(no_exist_reject_wrong * 100, 2)} % jawaban yang salah (answer DO NOT exist)") 
        print()

    print("-- Setelah pengecekan filtering berdasarkan hasil akhir MSI: --")
    accept_right_after_filtering = (filtered_out_right_answer_to_filtered_in_right_answer + filtered_out_wrong_answer_to_filtered_in_right_answer) \
        / (filtered_out_right_answer_to_filtered_in_right_answer + filtered_out_wrong_answer_to_filtered_in_right_answer + filtered_out_right_answer_to_filtered_out_right_answer + filtered_out_wrong_answer_to_filtered_out_right_answer)

    reject_wrong_after_filtering = (filtered_out_right_answer_to_filtered_out_wrong_answer + filtered_out_wrong_answer_to_filtered_out_wrong_answer) \
    / (filtered_out_right_answer_to_filtered_out_wrong_answer + filtered_out_wrong_answer_to_filtered_out_wrong_answer + filtered_out_right_answer_to_filtered_in_wrong_answer + filtered_out_wrong_answer_to_filtered_in_wrong_answer)

    print(f"Berhasil menerima {round(accept_right_after_filtering * 100, 2)} % jawaban yang benar (answer exist)")
    print(f"Berhasil menolak {round(reject_wrong_after_filtering * 100, 2)} % jawaban yang salah (answer exist)") 
    print()

    no_exist_accept_right_after_filtering = (filtered_out_right_answer_to_filtered_in_right_answer_unanswered + filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered) \
        / (filtered_out_right_answer_to_filtered_in_right_answer_unanswered + filtered_out_wrong_answer_to_filtered_in_right_answer_unanswered + filtered_out_right_answer_to_filtered_out_right_answer_unanswered + filtered_out_wrong_answer_to_filtered_out_right_answer_unanswered)

    no_exist_reject_wrong_after_filtering = (filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered + filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered) \
    / (filtered_out_right_answer_to_filtered_out_wrong_answer_unanswered + filtered_out_wrong_answer_to_filtered_out_wrong_answer_unanswered + filtered_out_right_answer_to_filtered_in_wrong_answer_unanswered + filtered_out_wrong_answer_to_filtered_in_wrong_answer_unanswered)

    print(f"Berhasil menerima {round(no_exist_accept_right_after_filtering * 100, 2)} % jawaban yang benar (answer DO NOT exist)")
    print(f"Berhasil menolak {round(no_exist_reject_wrong_after_filtering * 100, 2)} % jawaban yang salah (answer DO NOT exist)") 
    print()

    print(f"Rerata skor yang membuat data menjadi TERFILTER sebelum iterasi MSI: {np.mean(filtered_score_labels_before_filtering)}")
    print(f"Rerata skor yang membuat data menjadi TERFILTER setelah iterasi MSI: {np.mean(filtered_score_labels_after_filtering)}")
    print(f"Total prediksi jawaban: {len(df)}")
    print()

    assert len(df) == exist_true_answer_label_entailment+exist_true_answer_label_neutral+exist_true_answer_label_contradiction+\
            exist_false_answer_label_entailment+exist_false_answer_label_neutral+exist_false_answer_label_contradiction+\
            no_exist_true_answer_label_entailment+no_exist_true_answer_label_neutral+no_exist_true_answer_label_contradiction+\
            no_exist_false_answer_label_entailment+no_exist_false_answer_label_neutral+no_exist_false_answer_label_contradiction

In [None]:
# Print the before/after-filtering breakdown for the evaluation frame.
# With TYPE_QAS='entailment only', predictions whose pre-filtering label is
# 'neutral' or 'contradiction' are counted as filtered out; only 'entailment'
# predictions are counted as having passed the filter.
breakdown_evaluation(eval_df, TYPE_QAS='entailment only')

In [None]:
#eval_df.to_csv(f'{OUTPUT_DIR}/eval_df.csv', index=False)

In [None]:
# Render the evaluation frame as this cell's output for manual inspection.
eval_df

In [None]:
# Keep only the rows whose 'Question' text contains "apa " or "apakah ".
# The pattern is interpreted as a regular expression, so "|" is an alternation.
# na=False makes rows with a missing question evaluate to False instead of
# NaN; a mask containing NaN cannot be used for boolean indexing.
eval_df[eval_df['Question'].str.contains("apa |apakah ", na=False)]