# Inference with NLI validation

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from io import StringIO
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

In [2]:
# Common beginning of the Hugging Face repository URLs used below; the
# dataset/configuration part of the repository name is appended to it.
prefix = "https://huggingface.co/muhammadravi251001/fine-tuned-FilteringNLI-"
# Path of the results CSV inside a repository, served through the "raw" endpoint.
suffix = "/raw/main/results/output/output_df.csv"
# Same CSV path through the "resolve" endpoint; show_nli_of_gold_answer uses
# it when called with squadid=True.
suffix_squadid = "/resolve/main/results/output/output_df.csv"

In [3]:
# Dataset name; it selects the third checkpoint below.
data = "idkmrc"

# Hub identifiers of the three sequence-classification checkpoints.
MODEL_SC_NAME_1 = "muhammadravi251001/fine-tuned-NLI-indonli-with-xlm-roberta-large"
MODEL_SC_NAME_2 = "muhammadravi251001/fine-tuned-NLI-indonli_mnli-with-xlm-roberta-large"
MODEL_SC_NAME_3 = (
    "muhammadravi251001/fine-tuned-NLI-indonli_mnli_"
    + data
    + "-nli-with-xlm-roberta-large"
)

# Load one tokenizer/model pair per checkpoint; nlp_sc picks a pair through
# these module-level names.
tokenizer_sc_1 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_1)
model_sc_1 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_1)

tokenizer_sc_2 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_2)
model_sc_2 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_2)

tokenizer_sc_3 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_3)
model_sc_3 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_3)

In [4]:
def take_dataframe(url):
    """Download a CSV file and load it into a DataFrame.

    Args:
        url: Address of the CSV file to fetch with an HTTP GET request.

    Returns:
        A pandas DataFrame parsed from the response body, with the first CSV
        column used as the index.

    Raises:
        RuntimeError: If the server does not answer with HTTP status 200.
    """
    response = requests.get(url)

    if response.status_code != 200:
        # Previously this path only printed a message and then crashed on
        # `return df` with an UnboundLocalError; fail with the real cause.
        raise RuntimeError(
            f"Failed to download CSV from {url} (HTTP {response.status_code})"
        )

    return pd.read_csv(StringIO(response.text), index_col=0)

In [5]:
def nlp_sc(text_dict, model_num):
    """Classify a text pair with one of the three loaded NLI models.

    Args:
        text_dict: Mapping with the keys ``'text'`` and ``'text_pair'``; both
            values are handed to the tokenizer as first and second sequence.
        model_num: 1, 2 or 3; selects ``tokenizer_sc_N`` / ``model_sc_N``.

    Returns:
        A dict with ``'label'`` (looked up in the model's ``config.id2label``)
        and ``'score'`` (softmax probability of that label).

    Raises:
        ValueError: If ``model_num`` is not 1, 2 or 3.
    """
    if model_num == 1:
        tokenizer_sc = tokenizer_sc_1
        model_sc = model_sc_1
    elif model_num == 2:
        tokenizer_sc = tokenizer_sc_2
        model_sc = model_sc_2
    elif model_num == 3:
        tokenizer_sc = tokenizer_sc_3
        model_sc = model_sc_3
    else:
        # Without this branch an unknown number left the locals unbound and
        # surfaced later as a confusing UnboundLocalError.
        raise ValueError(f"model_num must be 1, 2 or 3, got {model_num!r}")

    tokenizer_kwargs = {'truncation': True, 'max_length': 512}

    inputs = tokenizer_sc(text_dict['text'], text_dict['text_pair'],
                          return_tensors="pt",
                          **tokenizer_kwargs)

    # Pure inference: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        logits = model_sc(**inputs).logits

    label_id = torch.argmax(logits).item()
    label = model_sc.config.id2label[label_id]
    score = logits.softmax(dim=-1)[0][label_id].item()

    return {'label': label, 'score': score}

In [6]:
def gold_label(df, model_num):
    """Count the NLI labels predicted for every row of ``df``.

    Each row's ``'Context'`` value is classified against its
    ``'Gold Hypothesis'`` value through ``nlp_sc``.

    Args:
        df: DataFrame with the columns ``'Context'`` and ``'Gold Hypothesis'``.
        model_num: Model selector forwarded unchanged to ``nlp_sc``.

    Returns:
        A tuple ``(entailment, neutral, contradiction, total)`` where the
        first three are label counts and ``total`` is ``len(df)``. Labels
        other than those three are not counted.
    """
    counts = {"entailment": 0, "neutral": 0, "contradiction": 0}

    # Walk the rows by position: `df[col][i]` is a label lookup and fails (or
    # picks the wrong row) when the frame's index is not exactly 0..n-1.
    pairs = zip(df['Context'], df['Gold Hypothesis'])

    for context, hypothesis in tqdm(pairs, total=len(df)):

        label = nlp_sc({'text': context, 'text_pair': hypothesis}, model_num)["label"]

        if label in counts:
            counts[label] += 1

    return counts["entailment"], counts["neutral"], counts["contradiction"], len(df)

In [7]:
def show_nli_of_gold_answer(data, squadid=False):
    """Build a table of NLI label counts for the gold hypotheses of a dataset.

    The results CSV for ``data`` is downloaded with ``take_dataframe`` and
    labelled by each of the three models through ``gold_label``.

    Args:
        data: Dataset name inserted into the repository URL and written to
            the ``'Dataset'`` column.
        squadid: When true, fetch the CSV through ``suffix_squadid`` instead
            of ``suffix``.

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``Dataset``, ``Entailment``, ``Neutral``, ``Contradiction``, ``Total``.
    """
    # Choose the endpoint first so the CSV is downloaded exactly once; the
    # earlier version always fetched the `suffix` URL and then discarded it
    # when squadid was set.
    url_suffix = suffix_squadid if squadid else suffix

    # Take output_df.csv from Hugging Face
    df = take_dataframe(f"{prefix}indonli_mnli-{data}-TQ2-TS4-MS3-VA0-TH0.0{url_suffix}")

    # One (entailment, neutral, contradiction, total) tuple per model, in
    # the same order as the 'Model' column below.
    results = [gold_label(df, model_num) for model_num in (1, 2, 3)]

    # Show table to Latex
    df_nli_label_gold_answer = pd.DataFrame(
        {
            'Model': ["INLI", "INLI_MNLI", "INLI_MNLI_AUG"],
            'Dataset': [data] * 3,
            'Entailment': [result[0] for result in results],
            'Neutral': [result[1] for result in results],
            'Contradiction': [result[2] for result in results],
            'Total': [result[3] for result in results],
        },
        columns=['Model', 'Dataset', 'Entailment', 'Neutral', 'Contradiction', 'Total']
    )

    # Uncomment this to get latex code
    #print("NLI of gold answer")
    #print(df_nli_label_gold_answer.to_latex(index=False))

    return df_nli_label_gold_answer

In [None]:
# Label the gold hypotheses of the "idkmrc" results with the three loaded
# models; the trailing bare `table` shows the resulting frame as cell output.
data = "idkmrc"
table = pd.DataFrame(show_nli_of_gold_answer(data))
table

100%|█████████████████████████████████████████████████████████████████████████████████| 844/844 [12:51<00:00,  1.09it/s]
 32%|█████████████████████████▊                                                       | 269/844 [03:49<07:13,  1.33it/s]

In [None]:
data = "tydiqaid"

MODEL_SC_NAME_1 = f"muhammadravi251001/fine-tuned-NLI-indonli-with-xlm-roberta-large"
MODEL_SC_NAME_2 = f"muhammadravi251001/fine-tuned-NLI-indonli_mnli-with-xlm-roberta-large"
MODEL_SC_NAME_3 = f"muhammadravi251001/fine-tuned-NLI-indonli_mnli_{data}-nli-with-xlm-roberta-large"

tokenizer_sc_1 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_1)
model_sc_1 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_1)

tokenizer_sc_2 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_2)
model_sc_2 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_2)

tokenizer_sc_3 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_3)
model_sc_3 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_3)

In [None]:
# Label the gold hypotheses of the "tydiqaid" results with the three loaded
# models; the trailing bare `table` shows the resulting frame as cell output.
data = "tydiqaid"
table = pd.DataFrame(show_nli_of_gold_answer(data))
table

In [None]:
data = "squadid"

MODEL_SC_NAME_1 = f"muhammadravi251001/fine-tuned-NLI-indonli-with-xlm-roberta-large"
MODEL_SC_NAME_2 = f"muhammadravi251001/fine-tuned-NLI-indonli_mnli-with-xlm-roberta-large"
MODEL_SC_NAME_3 = f"muhammadravi251001/fine-tuned-NLI-indonli_mnli_{data}-nli-with-xlm-roberta-large"

tokenizer_sc_1 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_1)
model_sc_1 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_1)

tokenizer_sc_2 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_2)
model_sc_2 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_2)

tokenizer_sc_3 = AutoTokenizer.from_pretrained(MODEL_SC_NAME_3)
model_sc_3 = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME_3)

In [None]:
# Label the gold hypotheses of the "squadid" results with the three loaded
# models. squadid=True makes show_nli_of_gold_answer fetch the CSV through
# `suffix_squadid`; the trailing bare `table` shows the frame as cell output.
data = "squadid"
table = pd.DataFrame(show_nli_of_gold_answer(data, squadid=True))
table

In [None]:
# Bare arithmetic expression; it assigns nothing and has no side effects.
1+1