In [1]:
import os,random,math,sys
import multiprocessing
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm import tqdm
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device='cuda'
else:
    device='cpu'

# Folder holding the raw data files; TRAINING_DIR is its `Holmes_Training_Data` sub-folder.
parentdir='data/raw_data'
TRAINING_DIR=parentdir+'/Holmes_Training_Data'

  return f(*args, **kwds)


In [3]:
# Read the questions and the reference answers; the first CSV column becomes the index of each.
test_data_path=os.path.join(parentdir,'testing_data.csv')
test_answer_path=os.path.join(parentdir,'test_answer.csv')
test_data=pd.read_csv(test_data_path,index_col=0)
# Keep only the first remaining column of the answer file, as a Series.
test_answer_frame=pd.read_csv(test_answer_path,index_col=0)
test_answer=test_answer_frame.iloc[:,0]

In [4]:
# Preview the first rows: a `question` column followed by the candidate columns `a)` to `e)`.
test_data.head()

Unnamed: 0_level_0,question,a),b),c),d),e)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,I have it from the same source that you are bo...,crying,instantaneously,residing,matched,walking
2,It was furnished partly as a sitting and partl...,daintily,privately,inadvertently,miserably,comfortably
3,"As I descended , my old ally , the _____ , cam...",gods,moon,panther,guard,country-dance
4,"We got off , _____ our fare , and the trap rat...",rubbing,doubling,paid,naming,carrying
5,"He held in his hand a _____ of blue paper , sc...",supply,parcel,sign,sheet,chorus


# Pre-trained masked language models (no fine-tuning)

In [5]:
# Maps the position of an option (0-4) to the letter that labels it (a-e).
answers=dict(enumerate('abcde'))

def get_options_indices(tokenizer,prefix,options):
    """
    Converting options' text into id of the tokenizer

    Arguments:
    - tokenizer: PretrainedTokenizer
    - prefix: string
        marker put in front of an option before it is looked up as a single
        vocabulary entry (the caller in this file passes 'Ġ' for RoBERTa
        checkpoints and '' otherwise)
    - options: list of string

    Return: one list of token ids per option (list of list).
        An option whose prefixed form is a vocabulary entry maps to that single id.
        Any other option maps to the ids obtained by tokenizing it, with the first
        and last id dropped (they are assumed to be the special tokens the
        tokenizer adds around the text).
    """
    # Fetch the vocabulary once. On fast tokenizers `tokenizer.vocab` is a property
    # that rebuilds the whole dictionary on every access, so reading it inside the
    # loop repeats that work for every option.
    vocab=tokenizer.get_vocab()
    indices=[]
    for option in options:
        token=prefix+option
        if token in vocab:
            # the option is a single token of the model
            indices.append([vocab[token]])
        else:
            # fall back to the sub-word pieces, without the surrounding special tokens
            indices.append(tokenizer(option)['input_ids'][1:-1])
    return indices

# Cosine similarity computed along dim 0, i.e. between two 1-D vectors; eps avoids division by zero.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

class pretrained_model_tester():
    """
    Answer fill-in-the-blank multiple choice questions with a pre-trained
    masked language model, without any fine-tuning.

    Attributes set by `__init__`:
    - tokenizer / model: loaded from the checkpoint, model in eval mode on `device`
    - test_data: copy of the question dataframe with the blank replaced by the mask token
    - embedding_weight: input word-embedding matrix of the model
    - prefix: string prepended to an option when looking it up in the vocabulary
    """

    # Ways of collapsing the embeddings of a multi-token option (rows) into one vector.
    _POOLERS={
        'sum': lambda embeddings: embeddings.sum(dim=0),
        'mean': lambda embeddings: embeddings.mean(dim=0),
        'max': lambda embeddings: embeddings.max(dim=0)[0],
        'min': lambda embeddings: embeddings.min(dim=0)[0],
    }

    def __init__(self,test_df,model_checkpoint):
        """
        Load the checkpoint and prepare the questions for it.

        Arguments:
        - test_df: pandas dataframe
            first column `question` (blank written as `_____`), remaining columns are the options
        - model_checkpoint: string
            passed to `from_pretrained`; a name containing `roberta` selects the
            RoBERTa conventions, anything else is treated as a BERT model
        """
        self.tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)
        self.model=AutoModelForMaskedLM.from_pretrained(model_checkpoint).to(device)
        self.model.eval()
        # work on a copy so the caller's dataframe keeps its `_____` placeholders
        self.test_data=test_df.copy()
        if 'roberta' in model_checkpoint:
            mask_token='<mask>'
            self.embedding_weight=self.model.roberta.embeddings.word_embeddings.weight
            # RoBERTa's vocabulary marks a word that follows a space with a leading 'Ġ'
            self.prefix='Ġ'
        else:
            mask_token='[MASK]'
            self.embedding_weight=self.model.bert.embeddings.word_embeddings.weight
            self.prefix=''
        # literal (non-regex) replacement of the blank by the model's mask token
        self.test_data['question']=self.test_data['question'].str.replace('_____',mask_token,regex=False)

    def predict(self,question,options,result_method,pooling_method):
        """
        Perform tokenizing, applying model and get result.

        Arguments:
        - question: string
            text with masked token
        - options: list of string
            option to be chosen
        - result_method: string
            `base_only` (compare the model scores of the first token of each option) or
            `all` (compare the embedding of the model's top predicted token with the
            pooled embedding of every token of each option, by cosine similarity)
        - pooling_method: string
            `sum`, `mean`, `max` or `min`: how the token embeddings of one option are
            combined. Only used when result_method is `all`.

        Return: index of the option chosen (int)

        Raises:
        - TypeError: unknown result_method, or unknown pooling_method with `all`
        - IndexError: the question contains no mask token
        """
        # validate first so a bad argument does not cost a forward pass
        if result_method not in ('base_only','all'):
            raise TypeError('result_method must be either `base_only` or `all`')
        if result_method=='all' and pooling_method not in self._POOLERS:
            raise TypeError('pooling_method must be `sum`, `mean`, `max` or `min` when result_method is `all`')
        # tokenizing
        inputs=self.tokenizer(question,return_tensors='pt')
        # move to gpu if available
        inputs={key: value.to(device) for key, value in inputs.items()}
        # position of the first mask token in the (single) encoded sequence
        mask_positions=(inputs['input_ids'][0]==self.tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
        if mask_positions.numel()==0:
            raise IndexError('question does not contain the mask token of the tokenizer')
        masked_index=mask_positions[0].item()
        options_indices=get_options_indices(self.tokenizer,self.prefix,options)
        # get result
        with torch.no_grad():
            logits=self.model(**inputs)[0]
            if result_method=='base_only':
                first_token_ids=[indices[0] for indices in options_indices]
                return torch.argmax(logits[0,masked_index,first_token_ids]).item()
            # `all`: embedding of the token the model ranks highest at the blank
            predicted_index=torch.argmax(logits[0,masked_index]).item()
            predicted_embedding=self.embedding_weight[predicted_index,:]
            pool=self._POOLERS[pooling_method]
            similarity=[
                cos(predicted_embedding,pool(self.embedding_weight[indices,:])).item()
                for indices in options_indices
            ]
        # plain int, like the `base_only` branch
        return int(np.argmax(similarity))

    def batch_predict(self,result_method,pooling_method='sum'):
        """
        Perform prediction on the whole test df

        Arguments:
        - result_method: string
            `base_only` (use only the first token of the option) or `all` (use every token)
        - pooling_method: string
            `sum`, `mean`, `max` or `min`; ignored when result_method is `base_only`

        Return: answers of the questions (pandas series), indexed like the
            dataframe given to the constructor. Each predicted position is
            translated to a letter with the module-level `answers` mapping.
        """
        result=[]
        for _, row in self.test_data.iterrows():
            # positional access: first column is the question, the rest are the options
            question=row.iloc[0]
            options=row.iloc[1:].tolist()
            predicted_ind=self.predict(question,options,result_method,pooling_method)
            result.append(answers[predicted_ind])
        # index by this instance's own data, not by a notebook-level global
        return pd.Series(result,index=self.test_data.index)

def get_accuracy(prediction,labels):
    """
    Share of predictions that equal their label.

    Arguments:
    - prediction: predicted answers (e.g. pandas series)
    - labels: reference answers, compared element-wise with `prediction`

    Return: number of matching entries divided by the number of labels
    """
    is_correct=prediction==labels
    n_correct=sum(is_correct)
    n_total=len(labels)
    return n_correct/n_total

In [None]:
# Evaluation grid: each checkpoint is scored with every way of turning model output into an answer.
params={
    'model_checkpoints':['bert-base-uncased','bert-base-cased','bert-large-uncased','bert-large-cased','roberta-base','roberta-large'],
    'result_method':['base_only','all'],
    'pooling_method':['max','min','mean']
}

col_names=['model_checkpoints','result_method','pooling_method','accuracy']

result=[]
for model_checkpoint in params['model_checkpoints']:
    # one model load per checkpoint, shared by all scoring variants
    pretrained_model=pretrained_model_tester(test_data,model_checkpoint)
    for result_method in params['result_method']:
        # `base_only` does not pool, so it is evaluated once with no pooling method
        pooling_choices=[None] if result_method=='base_only' else params['pooling_method']
        for pooling_method in pooling_choices:
            predictions=pretrained_model.batch_predict(result_method,pooling_method)
            accuracy=get_accuracy(predictions,test_answer)
            result.append([model_checkpoint,result_method,pooling_method,accuracy])

# one row per (checkpoint, result_method, pooling_method) combination
result=pd.DataFrame(result,columns=col_names)
display(result)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model f

Unnamed: 0,model_checkpoints,result_method,pooling_method,accuracy
0,bert-base-uncased,base_only,,0.75
1,bert-base-uncased,all,max,0.490385
2,bert-base-uncased,all,min,0.451923
3,bert-base-uncased,all,mean,0.475
4,bert-base-cased,base_only,,0.701923
5,bert-base-cased,all,max,0.526923
6,bert-base-cased,all,min,0.549038
7,bert-base-cased,all,mean,0.55
8,bert-large-uncased,base_only,,0.788462
9,bert-large-uncased,all,max,0.539423
