### Produce contradiction (C), neutral (N), and entailment (E) scores using RoBERTa

In [None]:
import torch
from fairseq.data.data_utils import collate_tokens
import pandas as pd

# Load the 'roberta.large.mnli' checkpoint from the 'pytorch/fairseq' hub repo.
# The hub directory is set to a project-relative path first, so torch.hub uses
# that location for the load below (set_dir must run before load).
torch.hub.set_dir('../../roberta/hub')
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')  # loading this way works.
# Put the model in evaluation mode for inference. Being the last expression of
# the cell, the returned model is also what the notebook displays.
roberta.eval()

In [24]:
# Maps a predicted class index to its label name. The position of each label
# in the tuple is the class index used as the dictionary key.
output_map = dict(enumerate(('contradiction', 'neutral', 'entailment')))

In [4]:
# Interactive run parameters: the input table, its delimiter, and the batch size.
data_dir = input("enter path to csv/tsv file:\n")
delim = input("enter delimiter\n")

# Read the table of sentence pairs using the delimiter the user supplied.
with open(data_dir) as table_file:
    data_df = pd.read_csv(table_file, sep=delim)

# Show the first rows so the user can sanity-check the file before continuing.
print(data_df.head(5))

# The batch size is asked for last, after the preview has been printed.
batch_size_text = input("enter batch size (integer):\n")
batch_size = int(batch_size_text)

enter path to csv/tsv file:
 ../data/Astro1_decomposed_clean_pairs.csv
enter delimiter
 ,
enter save path

 ../data/roberta/Astro1_decomposed_clean_pairs_classes.csv


   Unnamed: 0               first  \
0           0  You never give up.   
1           1  You never give up.   
2           2  You never give up.   
3           3  You never give up.   
4           4  You never give up.   

                                              second  
0  You never find it difficult to change your min...  
1  Whatever you have set your sights on, you refu...  
2  You are patient unless someone takes you too far.  
3  You are usually slow to anger unless someone t...  
4                                  You are reliable.  


enter batch size (integer):
 5


In [33]:
import numpy as np
from math import exp

def _predict_logprobs(sentence_pairs):
    """Return the 'mnli' head's log-probabilities for each ordered sentence pair.

    Args:
        sentence_pairs: sequence of (sentence_a, sentence_b) tuples; each pair
            is encoded in the given order.

    Returns:
        A list with one entry per pair; each entry is a list of floats
        (log-probabilities per class index).
    """
    batch = collate_tokens(
        [roberta.encode(sentence_a, sentence_b) for sentence_a, sentence_b in sentence_pairs],
        pad_idx=1,
    )
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logprobs = roberta.predict('mnli', batch)
    # Move to CPU and plain Python floats so the rest of the cell works
    # regardless of which device the model lives on.
    return logprobs.detach().cpu().tolist()


def _score_batch(sentence_pairs):
    """Score a batch of (first, second) pairs in both directions.

    Each pair is classified as first->second and as second->first. The
    direction whose top class has the higher log-probability is kept (the
    first->second direction wins ties), and its three class log-probabilities
    are converted to probabilities.

    Args:
        sentence_pairs: sequence of (first, second) sentence tuples.

    Returns:
        A list of rows ``[predicted_label, p_class0, p_class1, p_class2]``,
        one per input pair, in input order.
    """
    forward = _predict_logprobs(sentence_pairs)
    backward = _predict_logprobs([(second, first) for first, second in sentence_pairs])

    rows = []
    for forward_logprobs, backward_logprobs in zip(forward, backward):
        # Keep the direction in which the model is most confident.
        if max(forward_logprobs) >= max(backward_logprobs):
            chosen_logprobs = forward_logprobs
        else:
            chosen_logprobs = backward_logprobs

        probs = [exp(logprob) for logprob in chosen_logprobs[:3]]
        predicted_classnum = int(np.argmax(probs))
        rows.append([output_map[predicted_classnum], *probs])
    return rows


if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

sentence_pairs = list(zip(data_df['first'], data_df['second']))

results = []
# Stepping by batch_size covers every row, including a final short batch,
# so no separate remainder pass is needed.
for start in range(0, len(sentence_pairs), batch_size):
    results.extend(_score_batch(sentence_pairs[start:start + batch_size]))

assert len(results) == len(data_df), "every input pair should yield exactly one result row"
# Nothing is written to disk in this cell; saving happens in a later cell.
print(f'scored {len(results)} sentence pairs')

results_df = pd.DataFrame(data=results, columns=["predicted_class", "C", "N", "E"])
display(results_df.head(), results_df.tail())

saved 1 predictions to ../data/Astro1_decomposed_clean_pairs_roberta.csv


Unnamed: 0,predicted_class,C,N,E
0,neutral,0.022798,0.880442,0.09676
1,entailment,0.001746,0.039801,0.958454
2,neutral,0.004882,0.990273,0.004845
3,neutral,0.05205,0.944104,0.003846
4,entailment,0.001493,0.161846,0.836661


Unnamed: 0,predicted_class,C,N,E
1076,entailment,0.042239,0.253214,0.704547
1077,contradiction,0.65543,0.302836,0.041734
1078,neutral,0.102161,0.781847,0.115993
1079,entailment,0.000774,0.081088,0.918138
1080,contradiction,0.599679,0.19691,0.203411


In [35]:
# Ask the user where to write the scores, then save the results table as CSV.
# to_csv is called with its defaults, so the DataFrame index is written as well.
save_path = input("enter save path for csv file:\n")
results_df.to_csv(save_path)

enter save path for csv file:
 ../data/roberta/Astro1_decomposed_clean_pairs_roberta.csv
