In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, BertForMultipleChoice
tqdm.pandas()

In [16]:
# Load the fine-tuned tokenizer and model from the checkpoint that reached
# 88.38% accuracy on Coveo's data. The checkpoint directory is spelled once so
# the tokenizer and the model are guaranteed to come from the same place.
CHECKPOINT_DIR = "bert-base-uncased-finetuned-coveo/checkpoint-1198"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR, use_fast=True)
model = BertForMultipleChoice.from_pretrained(CHECKPOINT_DIR)

In [2]:
# Read the multiple-choice test set from CSV
df_test = pd.read_csv('multiple_choice/test.csv')

In [13]:
def predict_choices(row):
    """Score the answer choices of one row with the multiple-choice model.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``'text'`` and ``'choice1'`` .. ``'choice6'``. A choice
        that is missing (NaN/None per ``pd.notna``) is skipped.

    Returns
    -------
    list of float
        Six values, one per ``choice1`` .. ``choice6`` slot, in order. Slots
        whose choice is missing hold 0.0; the other slots hold the softmax of
        the model's logits over the choices that are present. When every
        choice is missing, six zeros are returned and the model is not called.

    Notes
    -----
    Uses the notebook-level ``tokenizer`` and ``model`` objects.
    """
    text = row['text']
    choices = [row[f'choice{i}'] for i in range(1, 7)]
    present = [pd.notna(choice) for choice in choices]
    non_nan_choices = [choice for choice, ok in zip(choices, present) if ok]

    # Nothing to score: every slot gets probability 0
    if not non_nan_choices:
        return [0.0] * len(choices)

    # Pair the same text with each available choice
    encoding = tokenizer(
        [text] * len(non_nan_choices),
        non_nan_choices,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    # Add a leading batch dimension of size 1 to every encoded tensor
    batch = {k: v.unsqueeze(0) for k, v in encoding.items()}

    # Inference only: no autograd graph is needed, and no labels are passed
    # because the loss is never used (only the logits are).
    with torch.no_grad():
        outputs = model(**batch)

    probabilities = iter(outputs.logits.softmax(dim=1)[0].tolist())

    # Hand probabilities back to the slots they came from; missing slots get 0
    return [next(probabilities) if ok else 0.0 for ok in present]

In [None]:
# Score every row of the test set loaded above
input_df = df_test

choice_columns = [f'choice{i}' for i in range(1, 7)]

# predict_choices returns one 6-element list per row; spread them into one
# column per choice. Reusing input_df's index keeps every row aligned with its
# 'idx' value in the concat below, even if input_df's index is not 0..n-1.
result_df = pd.DataFrame(
    input_df.progress_apply(predict_choices, axis=1).tolist(),
    columns=choice_columns,
    index=input_df.index,
)

# Put the 'idx' column from the input DataFrame in front of the probabilities
result_df = pd.concat([input_df['idx'], result_df], axis=1)

 36%|███▋      | 5353/14729 [3:47:34<3:53:05,  1.49s/it] 

In [None]:
# Write the predictions to CSV. The output directory is created first so that
# a missing 'results/' folder cannot make the save fail after a long run.
file_path = "results/result-checkpoint-1198.csv"
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(file_path, index=False)

In [None]:
# Show the final frame: the 'idx' column followed by one probability column per choice
result_df