In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from collections import OrderedDict
import pandas as pd
import re
from tqdm import tqdm
import string
import collections
import numpy as np

from io import StringIO
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
# Common start of every model-repository URL; the run name (model, dataset and
# TQ/TS/MS/VA/TH fields) is appended directly after it.
prefix = "https://huggingface.co/muhammadravi251001/fine-tuned-FilteringNLI-"
# Path of the evaluation metric file inside a repository. In the cells shown
# here, URLs built with it are only parsed for the run name, never downloaded.
suffix = "/blob/main/results/evaluation/metric_result.txt"

# Path of the per-example output CSV, used for the "idkmrc" and "tydiqaid" datasets.
suffix_df = "/raw/main/results/output/output_df.csv"
# Same CSV for "squadid", reached through /resolve/ instead of /raw/
# (NOTE(review): presumably because that file is stored differently — confirm).
suffix_squadid_df = "/resolve/main/results/output/output_df.csv"

In [3]:
def take_all_url_nli_validator(data, df=False):
    """Build the de-duplicated list of run URLs for one dataset.

    The first entry is always the baseline run
    (``indonli_mnli-{data}-TQ2-TS4-MS3-VA0-TH0.0``). It is followed by one URL
    per combination of model name, TQ, MS, VA and TH, subject to these rules:

    * the plain ``indonli`` and ``indonli_mnli`` models only exist as VA1/TH0.0;
    * VA1 is always paired with TH0.0;
    * the combination VA3 with MS1 is skipped.

    Args:
        data: Dataset name inserted into every run name (e.g. ``"idkmrc"``).
        df: When False, URLs point at the metric file (``suffix``). When True,
            they point at the per-example output CSV, whose path depends on
            the dataset (``suffix_df`` or ``suffix_squadid_df``).

    Returns:
        List of URL strings in generation order, without duplicates.

    Raises:
        ValueError: If ``df`` is True and ``data`` is not one of ``"idkmrc"``,
            ``"tydiqaid"`` or ``"squadid"`` (no CSV path is known for it).
    """
    # Resolve the file path once instead of re-deciding it inside the loops.
    if not df:
        file_suffix = suffix
    elif data == "idkmrc" or data == "tydiqaid":
        file_suffix = suffix_df
    elif data == "squadid":
        file_suffix = suffix_squadid_df
    else:
        raise ValueError(
            f"No output CSV location is known for dataset {data!r}; "
            "expected 'idkmrc', 'tydiqaid' or 'squadid'"
        )

    plain_models = ("indonli", "indonli_mnli")
    msc_arr = ["indonli", "indonli_mnli", f"indonli_mnli_{data}-nli"]
    tq_arr = [1, 2]
    msi_arr = [1, 2, 3]
    var_arr = [1, 2, 3]
    th_arr = [0.25, 0.5, 0.75]

    # The baseline always comes first so callers can treat row 0 specially.
    url_arr = [f"{prefix}indonli_mnli-{data}-TQ2-TS4-MS3-VA0-TH0.0{file_suffix}"]

    for msc in msc_arr:
        for tq in tq_arr:
            for msi in msi_arr:
                for var in var_arr:
                    for th in th_arr:
                        # Effective values are kept in separate names so the
                        # loop variables themselves are never overwritten.
                        run_var = 1 if msc in plain_models else var
                        run_th = 0.0 if run_var == 1 else th

                        if run_var == 3 and msi == 1:
                            continue

                        url_arr.append(
                            f"{prefix}{msc}-{data}-TQ{tq}-TS4-MS{msi}-VA{run_var}-TH{run_th}{file_suffix}"
                        )

    # Collapsing VA/TH above yields repeated URLs; keep the first occurrence of each.
    url_arr = list(OrderedDict.fromkeys(url_arr))
    print("Amount of url:", len(url_arr))
    return url_arr

In [4]:
def take_dataframe(url):
    """Download the CSV at ``url`` and return its answer columns.

    Args:
        url: Address of a CSV file whose first column is the row index and
            which contains the columns 'Prediction Answer', 'Rec. Pred Answer',
            'Gold Answer' and 'Properties'.

    Returns:
        DataFrame restricted to those four columns, with missing values
        replaced by empty strings.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200. Previously
            this case printed a message and then failed with an unrelated
            ``UnboundLocalError`` on the return statement.
    """
    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url, timeout=60)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download CSV (HTTP {response.status_code}): {url}"
        )

    csv_data = StringIO(response.text)
    df = pd.read_csv(csv_data, index_col=0)
    df = df[['Prediction Answer', 'Rec. Pred Answer', 'Gold Answer', 'Properties']]

    # Non-mutating fillna: avoids in-place modification of a column subset.
    return df.fillna('')

In [5]:
def normalize_text(s):
    """Canonicalise an answer string before token comparison.

    Steps, in order: lowercase, drop every character listed in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces.

    Args:
        s: Text to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = s.lower()

    # Deleting via a translation table removes exactly the punctuation characters.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))

    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    without_articles = article_pattern.sub(" ", without_punctuation)

    # split() with no argument also trims leading/trailing whitespace.
    return " ".join(without_articles.split())

def compute_f1(pred, gold):
    """Token-overlap F1 between a predicted answer and a gold answer.

    Both strings are passed through ``normalize_text`` and split on
    whitespace before comparison.

    Args:
        pred: Predicted answer text.
        gold: Reference answer text.

    Returns:
        ``1`` or ``0`` (int) when either side has no tokens, depending on
        whether both token lists are equal; ``0`` when no token is shared;
        otherwise the F1 score as a float in (0, 1].
    """
    predicted_tokens = normalize_text(pred).split()  # true positives + false positives (precision side)
    reference_tokens = normalize_text(gold).split()  # true positives + false negatives (recall side)

    # With an empty side the score is 1 only if both sides are empty.
    if not reference_tokens or not predicted_tokens:
        return int(reference_tokens == predicted_tokens)

    # Multiset intersection counts each shared token as often as it occurs in both.
    shared = collections.Counter(predicted_tokens) & collections.Counter(reference_tokens)
    overlap = sum(shared.values())  # true positives

    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted_tokens)
    recall = 1.0 * overlap / len(reference_tokens)
    return (2.0 * precision * recall) / (precision + recall)

def compute_metrics_from_df(df):
    """Compute exact-match and F1 percentages over a frame of answers.

    Exact match compares the raw 'Prediction Answer' and 'Gold Answer'
    strings; F1 is the mean of ``compute_f1`` over all rows.

    Args:
        df: DataFrame with 'Prediction Answer' and 'Gold Answer' columns.
            Rows are paired by position, so any index is accepted.

    Returns:
        Dict with keys 'exact_match' and 'f1', both on a 0-100 scale. For an
        empty frame both values are NaN, because the scores are undefined
        (previously this raised ``ZeroDivisionError``).
    """
    denominator = len(df)
    if denominator == 0:
        return {'exact_match': float('nan'), 'f1': float('nan')}

    total_correct = 0
    f1_array = []

    # zip over the columns pairs values positionally; label lookups such as
    # df[col][i] fail whenever the index is not exactly 0..n-1.
    for pred_answer, gold_answer in zip(df["Prediction Answer"], df["Gold Answer"]):
        if pred_answer == gold_answer:
            total_correct += 1

        f1_array.append(compute_f1(pred=pred_answer, gold=gold_answer))

    exact_match = (total_correct / denominator) * 100.0
    final_f1 = np.mean(f1_array) * 100.0

    return {'exact_match': exact_match, 'f1': final_f1}

In [6]:
def take_accepted_and_rejected_qa_em_f1_with_nli(url, variation):
    """Score one run and count how the NLI step treated the QA model's answers.

    For every row, the first element of 'Rec. Pred Answer' is taken as the QA
    model's answer and 'Prediction Answer' as the answer after NLI
    validation. The QA answer is "right" when it equals 'Gold Answer' and
    "accepted" when it equals the validated answer.

    Args:
        url: Address of the run's output CSV (downloaded with ``take_dataframe``).
        variation: ``"all"`` for every row, ``"answerable"`` for rows with a
            non-empty gold answer, ``"unanswerable"`` for rows with an empty
            gold answer.

    Returns:
        Tuple ``(EM, F1, right_accepted, right_rejected, wrong_accepted,
        wrong_rejected)``; EM and F1 come from ``compute_metrics_from_df`` on
        the selected rows, the rest are row counts.

    Raises:
        ValueError: If ``variation`` is not one of the three supported values
            (previously any other value was silently treated as ``"all"``).
    """
    # Local import so this function does not depend on the notebook's import cell.
    import ast

    df = take_dataframe(url)

    # The index is reset in every branch so downstream positional/label access
    # sees 0..n-1 regardless of the index stored in the CSV.
    if variation == "all":
        df = df.reset_index(drop=True)
    elif variation == "answerable":
        df = df[df['Gold Answer'] != ""].reset_index(drop=True)
    elif variation == "unanswerable":
        df = df[df['Gold Answer'] == ""].reset_index(drop=True)
    else:
        raise ValueError(
            f"Unknown variation {variation!r}; expected 'all', 'answerable' or 'unanswerable'"
        )

    # Keyed by (QA answer is right, QA answer accepted by NLI).
    counts = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }

    rows = zip(df['Gold Answer'], df['Rec. Pred Answer'], df['Prediction Answer'])
    for gold_answer, recorded_answers, nli_validated_answer in rows:
        # SECURITY: this text is downloaded from the network. It used to be
        # passed to eval(), which executes arbitrary code; literal_eval only
        # parses Python literals (assumes the cell holds a list literal — confirm).
        qa_generated_answer = ast.literal_eval(recorded_answers)[0]

        qa_is_right = bool(gold_answer == qa_generated_answer)
        accepted_by_nli = bool(qa_generated_answer == nli_validated_answer)
        counts[(qa_is_right, accepted_by_nli)] += 1

    metric = compute_metrics_from_df(df)
    EM = metric['exact_match']
    F1 = metric['f1']

    return EM, F1, counts[(True, True)], counts[(True, False)], \
            counts[(False, True)], counts[(False, False)]

In [7]:
def extract_values_nli_validator(url, data):
    """Parse the run configuration encoded in a model-repository URL.

    The URL must contain
    ``FilteringNLI-<model>-<data>-TQ<n>-TS<n>-MS<n>-VA<n>-TH<x>``, where
    ``<model>`` is ``indonli``, ``indonli_mnli`` or ``indonli_mnli_<data>-nli``.

    Args:
        url: URL to parse.
        data: Dataset name expected inside the run name. It is matched
            literally (regex metacharacters are escaped).

    Returns:
        Tuple ``(msc, data, TQ, TS, MSI, VA, TH)`` with TQ/TS/MSI/VA as ints
        and TH as a float, or ``None`` when the URL does not match.
    """
    # Escape the dataset name so characters such as "." cannot act as wildcards.
    data_literal = re.escape(data)

    # Longest alternative first so the "-nli" model name is preferred when present.
    msc_list = f"indonli_mnli_{data_literal}-nli|indonli_mnli|indonli"
    pattern = re.compile(
        r'.*FilteringNLI-({})-{}-TQ(\d+)-TS(\d+)-MS([\d]+)-VA([\d]+)-TH([\d.]+).*'.format(
            msc_list, data_literal
        )
    )
    match = pattern.match(url)

    if match is None:
        return None

    msc = match.group(1)
    TQ = int(match.group(2))
    TS = int(match.group(3))
    MSI = int(match.group(4))
    VA = int(match.group(5))
    # float() already maps "0.00" and "0.0" to the same value; the former
    # string comparisons against the float could never be true and were removed.
    TH = float(match.group(6))

    return msc, data, TQ, TS, MSI, VA, TH

In [8]:
def create_table_nli_validator(url_list, url_list_df, data, variation):
    """Assemble the result table for every run of one dataset.

    ``url_list`` supplies the run configuration (parsed from each URL) and
    ``url_list_df`` supplies the scores (computed from each downloaded CSV).
    The two lists are paired by position, and row 0 is relabelled as the
    baseline.

    Args:
        url_list: URLs parsed with ``extract_values_nli_validator``.
        url_list_df: Output-CSV URLs, in the same order as ``url_list``.
        data: Dataset name contained in the URLs.
        variation: Row selection forwarded to
            ``take_accepted_and_rejected_qa_em_f1_with_nli``.

    Returns:
        DataFrame with columns MSC, data, TQ, TS, MSI, VA, TH, EM, F1, RA, RR,
        WA, WR. EM/F1 are floats, RA/RR/WA/WR are ints, and the configuration
        columns hold "Baseline"/"BL" in row 0.

    Raises:
        ValueError: If the two URL lists differ in length, or if a URL in
            ``url_list`` cannot be parsed.
    """
    url_list = list(url_list)
    url_list_df = list(url_list_df)
    if len(url_list) != len(url_list_df):
        raise ValueError(
            f"url_list has {len(url_list)} entries but url_list_df has {len(url_list_df)}; "
            "they must describe the same runs in the same order"
        )

    config_columns = ['MSC', 'data', 'TQ', 'TS', 'MSI', 'VA', 'TH']
    metric_columns = ['EM', 'F1', 'RA', 'RR', 'WA', 'WR']

    config_rows = []
    for url in tqdm(url_list, desc="Processing first URLs", unit="URL"):
        extracted_values = extract_values_nli_validator(url, data)
        if extracted_values is None:
            raise ValueError(f"URL does not follow the expected run naming scheme: {url}")
        config_rows.append(tuple(extracted_values))

    metric_rows = []
    for url_df in tqdm(url_list_df, desc="Processing second URLs", unit="URL"):
        metric_rows.append(tuple(take_accepted_and_rejected_qa_em_f1_with_nli(url_df, variation)))

    # Build the frame once from complete records instead of enlarging it row by row.
    records = [config + metrics for config, metrics in zip(config_rows, metric_rows)]
    df = pd.DataFrame(records, columns=config_columns + metric_columns)

    df = df.astype({'RA': int, 'RR': int, 'WA': int, 'WR': int, 'EM': float, 'F1': float})

    # The baseline labels are strings; cast the numeric configuration columns
    # to object first so no string is written into an int/float column.
    label_columns = ['TQ', 'TS', 'MSI', 'VA', 'TH']
    df = df.astype({column: object for column in label_columns})

    if len(df) > 0:
        df.loc[0, 'MSC'] = "Baseline"
        for column in label_columns:
            df.loc[0, column] = "BL"

    return df

# All

In [9]:
# Score every run on the complete evaluation set.
data, variation = "idkmrc", "all"

# Same runs twice: once as metric-file URLs (parsed for the configuration),
# once as output-CSV URLs (downloaded for the scores).
url_list = take_all_url_nli_validator(data, df=False)
url_list_df = take_all_url_nli_validator(data, df=True)

table_all = pd.DataFrame(
    create_table_nli_validator(url_list, url_list_df, data, variation)
)
# Optional export, disabled:
#table_all.to_excel(f'{data}-nli-validator.xlsx', index=False)
table_all

Amount of url: 49
Amount of url: 49


Processing first URLs: 100%|████████████████████████████████████████████████████████| 49/49 [00:00<00:00, 68279.37URL/s]
Processing second URLs: 100%|██████████████████████████████████████████████████████████| 49/49 [00:32<00:00,  1.50URL/s]


Unnamed: 0,MSC,data,TQ,TS,MSI,VA,TH,EM,F1,RA,RR,WA,WR
0,Baseline,idkmrc,BL,BL,BL,BL,BL,75.829384,84.251283,640,0,204,0
1,indonli,idkmrc,1,4,1,1,0.0,68.483412,75.024827,567,73,157,47
2,indonli,idkmrc,1,4,2,1,0.0,54.620853,62.089464,450,190,137,67
3,indonli,idkmrc,1,4,3,1,0.0,43.957346,51.956509,361,279,133,71
4,indonli,idkmrc,2,4,1,1,0.0,75.35545,83.421899,633,7,195,9
5,indonli,idkmrc,2,4,2,1,0.0,73.933649,82.221935,621,19,193,11
6,indonli,idkmrc,2,4,3,1,0.0,72.630332,80.936171,610,30,193,11
7,indonli_mnli,idkmrc,1,4,1,1,0.0,71.327014,78.022626,589,51,153,51
8,indonli_mnli,idkmrc,1,4,2,1,0.0,61.966825,69.508752,509,131,136,68
9,indonli_mnli,idkmrc,1,4,3,1,0.0,52.606635,60.545604,431,209,132,72


# Answerable only

In [10]:
# Score every run on the rows that have a non-empty gold answer.
data, variation = "idkmrc", "answerable"

# Same runs twice: once as metric-file URLs (parsed for the configuration),
# once as output-CSV URLs (downloaded for the scores).
url_list = take_all_url_nli_validator(data, df=False)
url_list_df = take_all_url_nli_validator(data, df=True)

table_answerable = pd.DataFrame(
    create_table_nli_validator(url_list, url_list_df, data, variation)
)
# Optional export, disabled:
#table_answerable.to_excel(f'{data}-nli-validator.xlsx', index=False)
table_answerable

Amount of url: 49
Amount of url: 49


Processing first URLs: 100%|███████████████████████████████████████████████████████| 49/49 [00:00<00:00, 196107.73URL/s]
Processing second URLs: 100%|██████████████████████████████████████████████████████████| 49/49 [00:35<00:00,  1.38URL/s]


Unnamed: 0,MSC,data,TQ,TS,MSI,VA,TH,EM,F1,RA,RR,WA,WR
0,Baseline,idkmrc,BL,BL,BL,BL,BL,59.004739,75.848537,249,0,173,0
1,indonli,idkmrc,1,4,1,1,0.0,41.706161,54.788991,176,73,137,36
2,indonli,idkmrc,1,4,2,1,0.0,42.180095,57.117317,176,73,117,56
3,indonli,idkmrc,1,4,3,1,0.0,42.180095,58.178422,176,73,113,60
4,indonli,idkmrc,2,4,1,1,0.0,57.345972,73.478868,242,7,167,6
5,indonli,idkmrc,2,4,2,1,0.0,57.345972,73.922543,242,7,165,8
6,indonli,idkmrc,2,4,3,1,0.0,57.345972,73.957649,242,7,165,8
7,indonli_mnli,idkmrc,1,4,1,1,0.0,46.919431,60.310655,198,51,135,38
8,indonli_mnli,idkmrc,1,4,2,1,0.0,47.630332,62.714186,198,51,118,55
9,indonli_mnli,idkmrc,1,4,3,1,0.0,47.630332,63.508269,198,51,114,59


# Unanswerable only

In [11]:
# Score every run on the rows whose gold answer is empty.
data, variation = "idkmrc", "unanswerable"

# Same runs twice: once as metric-file URLs (parsed for the configuration),
# once as output-CSV URLs (downloaded for the scores).
url_list = take_all_url_nli_validator(data, df=False)
url_list_df = take_all_url_nli_validator(data, df=True)

table_unanswerable = pd.DataFrame(
    create_table_nli_validator(url_list, url_list_df, data, variation)
)
# Optional export, disabled:
#table_unanswerable.to_excel(f'{data}-nli-validator.xlsx', index=False)
table_unanswerable

Amount of url: 49
Amount of url: 49


Processing first URLs: 100%|███████████████████████████████████████████████████████| 49/49 [00:00<00:00, 155579.79URL/s]
Processing second URLs: 100%|██████████████████████████████████████████████████████████| 49/49 [00:32<00:00,  1.49URL/s]


Unnamed: 0,MSC,data,TQ,TS,MSI,VA,TH,EM,F1,RA,RR,WA,WR
0,Baseline,idkmrc,BL,BL,BL,BL,BL,92.654028,92.654028,391,0,31,0
1,indonli,idkmrc,1,4,1,1,0.0,95.260664,95.260664,391,0,20,11
2,indonli,idkmrc,1,4,2,1,0.0,67.061611,67.061611,274,117,20,11
3,indonli,idkmrc,1,4,3,1,0.0,45.734597,45.734597,185,206,20,11
4,indonli,idkmrc,2,4,1,1,0.0,93.364929,93.364929,391,0,28,3
5,indonli,idkmrc,2,4,2,1,0.0,90.521327,90.521327,379,12,28,3
6,indonli,idkmrc,2,4,3,1,0.0,87.914692,87.914692,368,23,28,3
7,indonli_mnli,idkmrc,1,4,1,1,0.0,95.734597,95.734597,391,0,18,13
8,indonli_mnli,idkmrc,1,4,2,1,0.0,76.303318,76.303318,311,80,18,13
9,indonli_mnli,idkmrc,1,4,3,1,0.0,57.582938,57.582938,233,158,18,13


In [None]:
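# Optional: rank the runs by F1. Uses table_all; substitute table_answerable
# or table_unanswerable to rank one of the other tables.
#print(f"{data} sorted by F1")
#sorted_table = table_all.sort_values(by='F1', ascending=False)
#sorted_table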