In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import transformers
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Input data files are available in the read-only "../input/" directory
# For example, the files under the input directory can be listed with os.walk (this cell only imports os; it does not print the listing itself)

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# SECURITY: a Hugging Face access token used to be hard-coded on this line. Any token that
# has been saved in a shared/committed notebook must be treated as leaked and revoked in the
# Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable; the value is None when the
# variable is unset, and that None is what gets passed as `token=` when the model is loaded.
access_token = os.environ.get('HF_TOKEN')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): the tokenizer and the model weights are loaded from two different Hub
# repositories - presumably the model repo was fine-tuned from the tokenizer's checkpoint;
# confirm that the vocabularies match.
TOKENIZER_CHECKPOINT = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
MODEL_CHECKPOINT = "huudung141434/deberta-legal-nli"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
# Only the model download is authenticated with `access_token`.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    token=access_token,
)


In [None]:
# Load the test set and add a placeholder `label` column (the literal string 'None');
# the inference step later writes the predicted labels into a copy of this frame.
tdf = pd.read_excel('testset_NLI_LegalLens.xlsx').assign(label='None')
tdf.head()

In [None]:
# Class index (argmax over the model logits) -> label string written to the predictions.
# NOTE(review): the index order is assumed to match the label ids the model was trained
# with - confirm against the model's config (id2label).
label_mapping = dict(enumerate(('Contradict', 'Entailed', 'Neutral')))

In [None]:
class LegalLensDataset(Dataset):
    """Map-style dataset that tokenizes (premise, hypothesis) pairs stored in a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with `premise` and `hypothesis` columns; rows are read positionally (`.iloc`).
    tokenizer : callable
        Called as ``tokenizer(premise, hypothesis, padding='max_length', truncation=True,
        max_length=max_len, return_tensors='pt')``. It must return a mapping of tensors
        whose first axis is a batch axis of size 1.
    max_len : int, default 512
        Length every pair is padded/truncated to.
    num_labels : int, default 3
        Accepted for signature compatibility; not used by this class.
    """

    # Features forwarded to the model. Keys the tokenizer does not emit are skipped.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, data, tokenizer, max_len = 512, num_labels = 3):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the tokenized features of row `idx`, or None if tokenization fails.

        Index and schema errors (bad `idx`, missing column) propagate to the caller.
        In particular IndexError must not be swallowed: plain iteration over a dataset
        that only defines ``__getitem__`` relies on it to stop.
        """
        item = self.data.iloc[idx]
        pre = item.premise
        hypo = item.hypothesis
        try:
            encoded_input = self.tokenizer(pre, hypo, padding = 'max_length',
                                           truncation = True, max_length = self.max_len,
                                           return_tensors = 'pt')
            # squeeze(0) drops only the batch axis; a bare squeeze() would also drop the
            # sequence axis whenever max_len == 1.
            return {key: encoded_input[key].squeeze(0)
                    for key in self._FEATURE_KEYS if key in encoded_input}
        except Exception:
            # Best effort per row: log the failure and signal it with None.
            # Imported here because the notebook never imports traceback at the top.
            import traceback
            traceback.print_exc()
            return None

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

In [None]:
from tqdm import tqdm
def infer_deberta(model, tdf, dataset, device, max_length=512):
    """Predict an NLI label for every example in `dataset` and return a labeled copy of `tdf`.

    Relies on two notebook-level globals: `tokenizer` (encodes each pair) and
    `label_mapping` (class index -> label string).

    Parameters
    ----------
    model : sequence-classification model whose output exposes `.logits`.
    tdf : pandas.DataFrame
        Frame to annotate; it is copied, never modified in place.
    dataset : iterable of mappings with 'premise' and 'hypothesis' keys, in the same
        order and of the same length as the rows of `tdf`.
    device : torch.device the model and the inputs are moved to.
    max_length : int, default 512
        Padding/truncation length passed to the tokenizer.

    Returns
    -------
    pandas.DataFrame
        Copy of `tdf` whose `label` column holds the predicted label strings.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in `tdf`.
    """
    model = model.to(device)
    model.eval()
    all_preds = []
    # Inference only: without no_grad every forward pass would record an autograd graph.
    with torch.no_grad():
        for item in tqdm(dataset):
            # NOTE(review): examples are encoded one at a time yet still padded to
            # `max_length`; kept as in the original so predictions are unchanged.
            inputs = tokenizer(item['premise'], item['hypothesis'], padding = 'max_length',
                               truncation = True, max_length = max_length,
                               return_tensors = 'pt')
            inputs = inputs.to(device)
            logits = model(**inputs).logits
            all_preds.append(torch.argmax(logits, dim=-1).item())
    df = tdf.copy()
    # Whole-column assignment is positional, so it does not depend on the frame's index
    # labels, and it avoids the chained `df.label.at[i] = ...` write.
    df['label'] = [label_mapping[pred] for pred in all_preds]
    return df
        
def save_output(df, name):
    """Write `df` to ``<name>.csv`` with the pandas defaults (the index is written too)."""
    csv_path = '{}.csv'.format(name)
    df.to_csv(csv_path)

In [None]:
# Imported under an alias: a plain `from datasets import Dataset` would rebind `Dataset`
# and shadow torch.utils.data.Dataset (imported at the top of the notebook), so re-running
# the LegalLensDataset cell afterwards would silently subclass the wrong base class.
from datasets import Dataset as HFDataset

test_dataset = HFDataset.from_pandas(tdf)

In [None]:
# Run inference over the whole test set; `out_df` is a copy of `tdf` with predicted labels.
out_df = infer_deberta(
    model=model,
    tdf=tdf,
    dataset=test_dataset,
    device=device,
)

In [None]:
# Preview the first five predictions.
out_df.head(n=5)

In [None]:
# Write the predictions without the pandas index column.
out_df.to_csv(path_or_buf='predictions_NLILens.csv', index=False)

In [None]:
def check_nli_format(predictions_file_path, test_file_path):
    """
    Validate the layout of an NLI predictions CSV against the test CSV.

    The predictions file must be readable as CSV, contain the columns
    premise, hypothesis and label (extra columns are allowed), and have
    exactly as many rows as the test file.

    Returns a ``(is_valid, message)`` tuple; problems are reported through
    the message rather than raised.
    """
    required = ['premise', 'hypothesis', 'label']

    try:
        predictions = pd.read_csv(predictions_file_path)
    except Exception as err:
        return False, f"Error reading predictions CSV file: {err}"

    try:
        reference = pd.read_csv(test_file_path)
    except Exception as err:
        return False, f"Error reading test CSV file: {err}"

    # Every required column has to be present.
    found = list(predictions.columns)
    if any(column not in found for column in required):
        return False, f"Incorrect columns. Expected: {required}, Found: {found}"

    # One prediction per test example.
    n_expected, n_found = len(reference), len(predictions)
    if n_found != n_expected:
        return False, f"Incorrect number of predictions. Expected: {n_expected}, Found: {n_found}"

    return True, "NLI prediction file format is correct."

In [None]:
# NOTE(review): the test set was loaded earlier from 'testset_NLI_LegalLens.xlsx', while this
# check reads 'testset_NLI_LegalLens.csv' - confirm that the CSV copy exists, otherwise the
# check reports a read error instead of validating the predictions.
is_valid, message = check_nli_format(
    predictions_file_path='predictions_NLILens.csv',
    test_file_path='testset_NLI_LegalLens.csv',
)
print(f"NLI File Check: {message}")