In [1]:
!nvidia-smi

Wed Sep 13 04:11:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   33C    P0    43W / 300W |      3MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |      3MiB / 32510MiB |      0%      Default |
|       

In [2]:
import os
# Restrict this process to the GPU with index 6. This is set before `import torch`
# runs in the following cell.
# NOTE(review): presumably the host has at least 7 GPUs — confirm index 6 exists,
# otherwise the later `torch.cuda.is_available()` check will fall back to CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '6'

In [3]:
import torch
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import pandas as pd
import numpy as np
import requests, string, re, collections, math
from tqdm import tqdm
from transformers import pipeline

# Download the baseline model outputs

In [None]:
def remove_punctuation(text):
    """Trim ASCII punctuation from both ends of ``text``.

    Only leading and trailing characters found in ``string.punctuation`` are
    removed; punctuation inside the string is left untouched.
    """
    edge_chars = string.punctuation
    trimmed = text.strip(edge_chars)
    return trimmed

def return_overlap_between_gold_and_pred(data):
    """Flag rows whose prediction and gold answer partially overlap.

    A row is flagged (``is_overlap`` is True) when the cleaned prediction and
    the cleaned gold answer share at least one word but are not identical.
    Identical answers and answers with no word in common are flagged False.

    Args:
        data: DataFrame with 'Prediction Answer' and 'Gold Answer' columns.
            Missing values are treated as empty answers.

    Returns:
        The same DataFrame, mutated in place with a boolean 'is_overlap' column.
    """

    def clean(answer):
        # Missing answers become "" so they can never overlap with anything.
        text = "" if pd.isna(answer) else str(answer)
        return remove_punctuation(text.lower().strip())

    def word_set(answer_text):
        stripped = (remove_punctuation(token) for token in answer_text.split())
        # Drop tokens that were pure punctuation (e.g. "-"): after stripping they
        # are "" and would otherwise make two unrelated answers look overlapping.
        return {token for token in stripped if token}

    overlap_flags = []

    # Iterate over values rather than `data[col][i]` so the result does not
    # depend on the frame having a 0..n-1 index.
    answer_pairs = zip(data['Prediction Answer'], data['Gold Answer'])
    for raw_pred, raw_gold in tqdm(answer_pairs, total=len(data)):
        pred_answer = clean(raw_pred)
        gold_answer = clean(raw_gold)

        shares_a_word = not word_set(pred_answer).isdisjoint(word_set(gold_answer))
        overlap_flags.append(shares_a_word and pred_answer != gold_answer)

    data['is_overlap'] = overlap_flags
    return data

In [None]:
def convert_question_and_answer_to_hypothesis(data):
    """Build NLI hypotheses by joining each question with its answers.

    Adds two columns: 'Prediction Hypothesis' (question + ' ' + predicted
    answer) and 'Gold Hypothesis' (question + ' ' + gold answer). A missing
    answer yields a missing hypothesis.

    The concatenation is column-wise, so it is done once instead of once per
    row, and the columns are also created for an empty frame.

    Args:
        data: DataFrame with 'Question', 'Prediction Answer' and 'Gold Answer'.

    Returns:
        The same DataFrame, mutated in place with the two hypothesis columns.
    """
    question_prefix = data['Question'] + ' '
    data['Prediction Hypothesis'] = question_prefix + data['Prediction Answer']
    data['Gold Hypothesis'] = question_prefix + data['Gold Answer']
    return data

In [None]:
def normalize_text(s):
    """Normalize an answer string before token-level comparison.

    Steps, in order: lowercase, delete every ASCII punctuation character,
    replace the standalone words "a", "an" and "the" with a space, then
    collapse all whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()

    # Delete (not replace) punctuation, so "an-apple" becomes "anapple".
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = lowered.translate(punctuation_table)

    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    without_articles = article_pattern.sub(" ", without_punctuation)

    return " ".join(without_articles.split())

def return_acc_and_f1(data, message):
    """Print exact-match accuracy and mean token-level F1 for a result frame.

    Exact match compares the raw answer strings. F1 is computed per row on the
    tokens of ``normalize_text(answer)`` and then averaged over all rows.

    Args:
        data: DataFrame with 'Prediction Answer' and 'Gold Answer' columns.
        message: Title printed (upper-cased) above the metrics.

    Returns:
        None. The metrics are printed only.
    """

    def as_text(value):
        # A missing answer is an empty answer. Stringifying NaN would produce
        # the literal token "nan", which can falsely match a gold answer that
        # contains the word "nan".
        return "" if pd.isna(value) else str(value)

    exact_matches = 0
    f1_scores = []

    # Iterate over values rather than `data[col][i]` so the result does not
    # depend on the frame having a 0..n-1 index.
    answer_pairs = zip(data['Prediction Answer'], data['Gold Answer'])
    for raw_pred, raw_gold in tqdm(answer_pairs, total=len(data)):

        pred = as_text(raw_pred)
        gold = as_text(raw_gold)

        # Exact match on the raw (un-normalized) strings.
        if pred == gold:
            exact_matches += 1

        # Token-level F1 on normalized text.
        pred_tokens = normalize_text(pred).split()
        gold_tokens = normalize_text(gold).split()

        # If either side has no tokens, the score is 1 only when both are empty.
        if not gold_tokens or not pred_tokens:
            f1_scores.append(int(gold_tokens == pred_tokens))
            continue

        # Multiset intersection: each shared token counts at most as often as
        # it appears on both sides (true positives).
        common = collections.Counter(pred_tokens) & collections.Counter(gold_tokens)
        num_same = sum(common.values())

        if num_same == 0:
            f1_scores.append(0)
            continue

        precision = 1.0 * num_same / len(pred_tokens)
        recall = 1.0 * num_same / len(gold_tokens)
        f1_scores.append((2.0 * precision * recall) / (precision + recall))

    # Guard the empty frame: avoid ZeroDivisionError and a NaN mean.
    em_final = exact_matches / len(data) if len(data) else 0.0
    f1_final = np.mean(f1_scores) if f1_scores else 0.0

    print(message.upper())
    print("Final exact match:", round(em_final, 3))
    print("Average F1 score:", round(f1_final, 3))

In [None]:
# Load the SQuAD-ID output from the local CSV, keep the four columns used
# below, build the hypotheses, drop rows without a gold answer and flag
# partially overlapping prediction/gold pairs.
df_squadid = (
    pd.read_csv("output_squadid_df.csv")
    .loc[:, ["Context", "Question", "Prediction Answer", "Gold Answer"]]
    .pipe(convert_question_and_answer_to_hypothesis)
    .dropna(subset=['Gold Answer'])
    .reset_index(drop=True)
    .pipe(return_overlap_between_gold_and_pred)
)

# Number of rows whose prediction is missing.
print(df_squadid['Prediction Answer'].isna().sum())
df_squadid

In [None]:
# Print exact match and average F1 for the SQuAD-ID predictions.
return_acc_and_f1(df_squadid, "squad-id")

In [None]:
# Download the IDK-MRC output CSV from the Hugging Face Hub.
url = "https://huggingface.co/muhammadravi251001/fine-tuned-DatasetQAS-IDK-MRC-with-xlm-roberta-large-without-ITTL-without-freeze-LR-1e-05/raw/main/results/output/output_df.csv"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)

# Fail loudly. Just printing a message would let the cell continue into a
# NameError, or silently reuse a stale `df_idkmrc` from an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Download failed! (HTTP {response.status_code}) {url}")

with open("data.csv", "wb") as file:
    file.write(response.content)
df_idkmrc = pd.read_csv("data.csv")

# Keep the columns used below, build the hypotheses, drop rows without a gold
# answer and flag partially overlapping prediction/gold pairs.
df_idkmrc = df_idkmrc[["Context", "Question", "Prediction Answer", "Gold Answer"]]
df_idkmrc = convert_question_and_answer_to_hypothesis(df_idkmrc)
df_idkmrc = df_idkmrc.dropna(subset=['Gold Answer']).reset_index(drop=True)
df_idkmrc = return_overlap_between_gold_and_pred(df_idkmrc)

# Number of rows whose prediction is missing.
print(df_idkmrc['Prediction Answer'].isna().sum())
df_idkmrc

In [None]:
# Print exact match and average F1 for the IDK-MRC predictions.
return_acc_and_f1(df_idkmrc, "idk-mrc")

In [None]:
# Download the TyDi-QA-ID output CSV from the Hugging Face Hub.
url = "https://huggingface.co/muhammadravi251001/fine-tuned-DatasetQAS-TYDI-QA-ID-with-xlm-roberta-large-without-ITTL-without-freeze-LR-1e-05/raw/main/results/output/output_df.csv"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)

# Fail loudly. Just printing a message would let the cell continue into a
# NameError, or silently reuse a stale `df_tydiqaid` from an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Download failed! (HTTP {response.status_code}) {url}")

with open("data.csv", "wb") as file:
    file.write(response.content)
df_tydiqaid = pd.read_csv("data.csv")

# Keep the columns used below, build the hypotheses, drop rows without a gold
# answer and flag partially overlapping prediction/gold pairs.
df_tydiqaid = df_tydiqaid[["Context", "Question", "Prediction Answer", "Gold Answer"]]
df_tydiqaid = convert_question_and_answer_to_hypothesis(df_tydiqaid)
df_tydiqaid = df_tydiqaid.dropna(subset=['Gold Answer']).reset_index(drop=True)
df_tydiqaid = return_overlap_between_gold_and_pred(df_tydiqaid)

# Number of rows whose prediction is missing.
print(df_tydiqaid['Prediction Answer'].isna().sum())
df_tydiqaid

In [None]:
# Print exact match and average F1 for the TyDi-QA-ID predictions.
return_acc_and_f1(df_tydiqaid, "tydi-qa-id")

# Count rows where the predicted answer and the gold answer overlap

In [None]:
# Show how many rows of each dataset were flagged as partial overlaps.
overlap_reports = [
    ("SQuAD-ID", df_squadid),
    ("IDK-MRC", df_idkmrc),
    ("TyDI-QA-ID", df_tydiqaid),
]

for position, (title, frame) in enumerate(overlap_reports):
    # Blank line between reports, but not before the first or after the last.
    if position > 0:
        print()
    print(title)
    print(frame['is_overlap'].value_counts())

## Delete rows where is_overlap is True

In [None]:
# Keep only the rows that were not flagged as partial overlaps and renumber
# the index from 0.
df_squadid = df_squadid.loc[lambda frame: frame['is_overlap'] == False].reset_index(drop=True)
df_idkmrc = df_idkmrc.loc[lambda frame: frame['is_overlap'] == False].reset_index(drop=True)
df_tydiqaid = df_tydiqaid.loc[lambda frame: frame['is_overlap'] == False].reset_index(drop=True)

# Test the hypotheses above with the newest NLI model

In [None]:
# Tokenizer options forwarded with every pipeline call: truncate inputs to at
# most 512 tokens.
tokenizer_kwargs = {'truncation': True, 'max_length': 512}
model_nli_name = "muhammadravi251001/fine-tuned-NLI-idk-mrc-nli-keep-with-xlm-roberta-large"
#model_nli_name = "muhammadravi251001/fine-tuned-IndoNLI-Augmented-with-xlm-roberta-large-LR-1e-05"
#model_nli_name = "muhammadravi251001/fine-tuned-NLI-multilingual-with-xlm-roberta-large"

# Pass `device` explicitly: without it the pipeline is placed on the CPU even
# when a GPU was selected in the setup cells.
nli_model = pipeline(
    "text-classification",
    model=model_nli_name,
    tokenizer=model_nli_name,
    device=device,
    **tokenizer_kwargs,
)

In [None]:
# Smoke test. 'text' (premise): "Bambang Pamungkas is a football player from Jakarta";
# 'text_pair' (hypothesis): "Bambang Pamungkas comes from Jakarta".
# NOTE(review): presumably expected to come back as entailment — confirm.
nli_model({'text': "Bambang Pamungkas seorang pemain bola asal Jakarta", 
           'text_pair': "Bambang Pamungkas berasal dari Jakarta"})

In [None]:
# Smoke test. Same premise; hypothesis: "Bambang Pamungkas is not a football player".
# NOTE(review): presumably expected to come back as contradiction — confirm.
nli_model({'text': "Bambang Pamungkas seorang pemain bola asal Jakarta", 
           'text_pair': "Bambang Pamungkas bukan seorang pemain bola"})

In [None]:
# Smoke test. Same premise; hypothesis: "Bambang Pamungkas comes from Bandung".
# NOTE(review): the expected label (contradiction vs. neutral) is not stated in the notebook — confirm.
nli_model({'text': "Bambang Pamungkas seorang pemain bola asal Jakarta", 
           'text_pair': "Bambang Pamungkas berasal dari Bandung"})

In [None]:
def add_label(data, message, nli_model=nli_model):
    """Label each row's prediction and gold hypotheses with the NLI model.

    For every row the context is passed to ``nli_model`` as ``text`` and the
    hypothesis as ``text_pair``. The returned ``label`` is stored in
    'Label from Prediction Answer' and 'Label from Gold Answer'. A summary of
    the label distribution is then printed.

    Args:
        data: DataFrame with 'Context', 'Prediction Hypothesis' and
            'Gold Hypothesis' columns. Missing hypotheses are sent as "".
        message: Dataset name printed (upper-cased) in the summary headers.
        nli_model: Callable taking ``{'text': ..., 'text_pair': ...}`` and
            returning a mapping with a 'label' key. Defaults to the
            module-level pipeline as it exists when this cell is run.

    Returns:
        The same DataFrame, mutated in place with the two label columns.
    """
    nli_labels = ("entailment", "neutral", "contradiction")

    def as_text(value):
        # A hypothesis is missing when its answer was missing; send "" instead.
        return "" if pd.isna(value) else value

    def percentage(part, whole):
        # Scale before rounding so the output has no float noise such as
        # 56.99999999999999, and guard against an empty denominator.
        return round(100.0 * part / whole, 2) if whole else 0.0

    labels_pred_answer = []
    labels_gold_answer = []

    # Iterate over values rather than `data[col][i]` so the result does not
    # depend on the frame having a 0..n-1 index.
    rows = zip(data['Context'], data['Prediction Hypothesis'], data['Gold Hypothesis'])
    for premise, pred_hypo, gold_hypo in tqdm(rows, total=len(data)):
        pred_input = {'text': premise, 'text_pair': as_text(pred_hypo)}
        gold_input = {'text': premise, 'text_pair': as_text(gold_hypo)}

        labels_pred_answer.append(nli_model(pred_input)['label'])
        labels_gold_answer.append(nli_model(gold_input)['label'])

    data["Label from Prediction Answer"] = labels_pred_answer
    data["Label from Gold Answer"] = labels_gold_answer

    def print_summary(title, column):
        # Count by exact label value.
        counts = {label: int((data[column] == label).sum()) for label in nli_labels}
        # Rows labelled "NULL" are left out of the denominator.
        # NOTE(review): nothing in this function produces "NULL" — presumably a
        # sentinel from an earlier version; confirm.
        total_labelled = int((data[column] != "NULL").sum())

        print(f"{title} {message.upper()}")
        for label in nli_labels:
            print(f"Total {label}: {counts[label]} ({percentage(counts[label], total_labelled)} %)")
        print()

    print_summary("PREDICTION", "Label from Prediction Answer")
    print_summary("GOLD", "Label from Gold Answer")
    print(f"Total data: {len(data)}")

    return data

In [None]:
# Add the NLI labels to the IDK-MRC frame (prints the label distribution), then display it.
df_idkmrc = add_label(df_idkmrc, "idk-mrc")
df_idkmrc

In [None]:
# Add the NLI labels to the TyDi-QA-ID frame (prints the label distribution), then display it.
df_tydiqaid = add_label(df_tydiqaid, "tydi-qa-id")
df_tydiqaid

In [None]:
#df_squadid = add_label(df_squadid, "squad-id")
#df_squadid

In [None]:
def count_by_answer_and_label(data, message):
    """Cross-tabulate answer correctness against the prediction's NLI label.

    A row counts as "Right" when 'Prediction Answer' equals 'Gold Answer'
    exactly, otherwise as "Wrong" (a missing prediction never equals
    anything). Each row gets a description in the 'properties' column, e.g.
    "Right answer and entailment label", and the share of each label among
    right and among wrong answers is printed.

    Args:
        data: DataFrame with 'Prediction Answer', 'Gold Answer' and
            'Label from Prediction Answer' columns.
        message: Dataset name printed (upper-cased) above the summary.

    Returns:
        The same DataFrame, mutated in place with the 'properties' column.
    """
    nli_labels = ("entailment", "neutral", "contradiction")

    # Column-wise comparison, so the result does not depend on the frame
    # having a 0..n-1 index. NaN compares unequal, i.e. "Wrong".
    answers_match = (data['Prediction Answer'] == data['Gold Answer']).tolist()
    verdicts = ["Right" if matched else "Wrong" for matched in answers_match]
    labels = data['Label from Prediction Answer'].astype(str).tolist()

    # Every row gets a description, even with an unexpected label, so the
    # column always has the same length as the frame.
    data['properties'] = [
        f"{verdict} answer and {label} label"
        for verdict, label in zip(verdicts, labels)
    ]

    counts = collections.Counter(zip(verdicts, labels))

    print(message.upper())

    for verdict in ("Right", "Wrong"):
        # Only the three known labels go into the denominator.
        total = sum(counts[(verdict, label)] for label in nli_labels)

        for label in nli_labels:
            hits = counts[(verdict, label)]
            # Scale before rounding to avoid float noise; print 0 when the
            # group is empty.
            share = round(100.0 * hits / total, 2) if total else 0
            print(f"{verdict} answer and Prediction hypothesis {label} label: {hits} ({share}) %")

        # Blank line between the "Right" and the "Wrong" group.
        if verdict == "Right":
            print()

    return data

In [None]:
# Add the 'properties' column to the IDK-MRC frame (prints the right/wrong vs. label breakdown), then display it.
df_idkmrc = count_by_answer_and_label(df_idkmrc, "idk-mrc")
df_idkmrc

In [None]:
# Add the 'properties' column to the TyDi-QA-ID frame (prints the right/wrong vs. label breakdown), then display it.
df_tydiqaid = count_by_answer_and_label(df_tydiqaid, "tydi-qa-id")
df_tydiqaid

In [None]:
#df_squadid = count_by_answer_and_label(df_squadid, "squad-id")
#df_squadid