# Import Packages

In [1]:
import pandas as pd
import nltk
import torch
import re
from transformers import BertTokenizer, BertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM

# Load Models

In [7]:
#each model is loaded as a (tokenizer, masked-language-model) pair from the same checkpoint
bert_base_tokenizer, bert_base_model = (
    BertTokenizer.from_pretrained("bert-base-uncased"),
    BertForMaskedLM.from_pretrained("bert-base-uncased"),
)

bert_large_tokenizer, bert_large_model = (
    BertTokenizer.from_pretrained("bert-large-uncased"),
    BertForMaskedLM.from_pretrained("bert-large-uncased"),
)

roberta_base_tokenizer, roberta_base_model = (
    RobertaTokenizer.from_pretrained("roberta-base"),
    RobertaForMaskedLM.from_pretrained("roberta-base"),
)

roberta_large_tokenizer, roberta_large_model = (
    RobertaTokenizer.from_pretrained("roberta-large"),
    RobertaForMaskedLM.from_pretrained("roberta-large"),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mode

# Load Data

In [27]:
#read the transformed sentences and append the (still empty) result columns
data = pd.read_csv("Data/transformed_data.csv", sep = ",").assign(
    masked_token="",
    predicted_token="",
    accuracy="",
    was_position="",
    were_position="",
)

In [28]:
#retrieve the auxiliary that agrees with the grammatical number of the sentence
#("singular" -> "was", "plural" -> "were"); rows with any other value keep their current entry
data["masked_token"] = (
    data["number"]
    .map({"singular": "was", "plural": "were"})
    .fillna(data["masked_token"])
)

In [11]:
#preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,id,original_sentence,transformed_sentence,relative_pronoun,number,N1,N2,exchanged_N2,masked_token,predicted_token,accuracy,was_position,were_position
0,0,A client looked at the clerk of the solicitor who,A client looked at the clerk of the offices wh...,who,singular,clerk,solicitor,office,was,,,,
1,1,A client noticed the hairdresser of the actres...,A client noticed the hairdresser of the salons...,who,singular,hairdresser,actress,salon,was,,,,
2,2,A customer frowned at the assistant of the pha...,A customer frowned at the assistant of the lab...,who,singular,assistant,pharmacist,laboratory,was,,,,
3,3,A fan looked at the guitarist of the singer who,A fan looked at the guitarist of the bands who...,who,singular,guitarist,singer,band,was,,,,
4,4,A man asked for the apprentice of the builder who,A man asked for the apprentice of the colleges...,who,singular,apprentice,builder,college,was,,,,


# Mask the Auxiliary

In [29]:
#insert masked token in the position of the auxiliary
#BERT and RoBERTa require different masked tokens
def mask_token(sent, model_type):
    """Replace the auxiliary that follows a relative pronoun with a mask token.

    The sentence is split with ``nltk.word_tokenize``. Every "was"/"were" token
    that is directly preceded by "who"/"which" is replaced by the mask token of
    the given model type, and the tokens are joined back with single spaces.

    Parameters
    ----------
    sent : str
        Sentence to mask.
    model_type : str
        "BERT" inserts "[MASK]", "RoBERTa" inserts "<mask>". For any other
        value no token is replaced.

    Returns
    -------
    str
        The space-joined tokens, with the matching auxiliaries masked.
    """
    mask_by_model = {"BERT": "[MASK]", "RoBERTa": "<mask>"}
    mask = mask_by_model.get(model_type)

    tokens = nltk.word_tokenize(sent)

    if mask is not None:
        #start at 1: the first token has no predecessor, and index -1 would
        #wrap around and compare against the LAST token of the sentence
        for i in range(1, len(tokens)):
            if tokens[i] in ("was", "were") and tokens[i-1] in ("who", "which"):
                tokens[i] = mask

    return " ".join(tokens)

In [30]:
#create separate datasets for BERT and RoBERTa,
#each an independent copy of the data tagged with its model type
bert_data = data.assign(model="BERT")
roberta_data = data.assign(model="RoBERTa")

In [31]:
#mask the auxiliary in every transformed sentence, using the mask token of the respective model
bert_data["transformed_sentence"] = bert_data["transformed_sentence"].apply(
    lambda sentence: mask_token(sentence, model_type = "BERT"))
roberta_data["transformed_sentence"] = roberta_data["transformed_sentence"].apply(
    lambda sentence: mask_token(sentence, model_type = "RoBERTa"))

# Get Prediction of the Models for the Masked Token

In [45]:
#source: https://gist.github.com/yuchenlin/a2f42d3c4378ed7b83de65c7a2222eb2
#function to retrieve the top 500 predictions of the model for the masked token
def predict_masked_token(text, model_type, model, tokenizer, top_k=500):
    """Rank the model's predictions for the masked token and locate "was"/"were".

    Parameters
    ----------
    text : str
        Sentence containing the mask token of the given model type.
    model_type : str
        "BERT" (mask token "[MASK]") or "RoBERTa" (mask token "<mask>").
    model : callable
        Called with a tensor of token ids; the first element of its output is
        indexed as ``[0, masked_index]`` to get the scores over the vocabulary.
    tokenizer : object
        Must provide ``tokenize``, ``convert_tokens_to_ids`` and
        ``convert_ids_to_tokens``.
    top_k : int, default 500
        Number of highest-ranked predictions to inspect. Capped at the number
        of scores the model returns.

    Returns
    -------
    pandas.Series
        Three strings: the auxiliary ("was"/"were") ranked first, the 1-based
        rank of "was", and the 1-based rank of "were". Each is "" when the
        auxiliary is not among the inspected predictions.

    Raises
    ------
    ValueError
        If ``model_type`` is unknown or the text contains no mask token.
    """
    # Tokenize input depending on the model type
    if model_type == "BERT":
        text = "[CLS] %s [SEP]"%text
        mask_symbol = "[MASK]"
    elif model_type == "RoBERTa":
        text = "<s>%s</s>"%text
        mask_symbol = "<mask>"
    else:
        raise ValueError("Unknown model_type %r (expected 'BERT' or 'RoBERTa')" % (model_type,))

    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index(mask_symbol)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        scores = outputs[0]

    probs = torch.nn.functional.softmax(scores[0, masked_index], dim=-1)
    #never ask for more candidates than there are vocabulary entries
    top_k = min(top_k, probs.shape[-1])
    _, top_k_indices = torch.topk(probs, top_k, sorted=True)

    #retrieve the top predicted tokens, deleting Ġ from each of them
    predictions = [
        re.sub("Ġ", "", predicted_token)
        for predicted_token in tokenizer.convert_ids_to_tokens(top_k_indices.tolist())
    ]

    def first_position(auxiliary):
        #1-based rank of the first occurrence (prevents the position from being 0).
        #Only the first occurrence counts: once Ġ is removed, two different
        #vocabulary entries can become the same string, and appending the rank
        #for each of them would yield e.g. "22" instead of "2".
        if auxiliary in predictions:
            return str(predictions.index(auxiliary) + 1)
        return ""

    #retrieve the position of "was" and "were" in the predictions of the models
    position_was = first_position("was")
    position_were = first_position("were")

    #check which of the auxiliaries the models predict first
    highest_prediction = next(
        (token for token in predictions if token in ("was", "were")), "")

    return pd.Series([highest_prediction, position_was, position_were])

In [46]:
#create one independent dataset per model size, each tagged with
#its size and with the combined "<type>-<size>" label
bert_base = bert_data.assign(size="base", model_and_size="BERT-base")
bert_large = bert_data.assign(size="large", model_and_size="BERT-large")

roberta_base = roberta_data.assign(size="base", model_and_size="RoBERTa-base")
roberta_large = roberta_data.assign(size="large", model_and_size="RoBERTa-large")

In [47]:
#BERT-base: first-ranked auxiliary and the positions of "was"/"were" within the top 500 predictions
bert_base[["predicted_token", "was_position", "were_position"]] = bert_base["transformed_sentence"].apply(
    lambda sentence: predict_masked_token(
        sentence, model_type = "BERT", model = bert_base_model, tokenizer = bert_base_tokenizer))

In [49]:
#BERT-large: first-ranked auxiliary and the positions of "was"/"were" within the top 500 predictions
bert_large[["predicted_token", "was_position", "were_position"]] = bert_large["transformed_sentence"].apply(
    lambda sentence: predict_masked_token(
        sentence, model_type = "BERT", model = bert_large_model, tokenizer = bert_large_tokenizer))

In [52]:
#RoBERTa-base: first-ranked auxiliary and the positions of "was"/"were" within the top 500 predictions
roberta_base[["predicted_token", "was_position", "were_position"]] = roberta_base["transformed_sentence"].apply(
    lambda sentence: predict_masked_token(
        sentence, model_type = "RoBERTa", model = roberta_base_model, tokenizer = roberta_base_tokenizer))

In [55]:
#RoBERTa-large: first-ranked auxiliary and the positions of "was"/"were" within the top 500 predictions
roberta_large[["predicted_token", "was_position", "were_position"]] = roberta_large["transformed_sentence"].apply(
    lambda sentence: predict_masked_token(
        sentence, model_type = "RoBERTa", model = roberta_large_model, tokenizer = roberta_large_tokenizer))

# Evaluate the Performance of the Models

In [57]:
#score a single prediction against the expected auxiliary
def evaluate_performance(predicted_token, masked_token):
    """Return 1 when the predicted token equals the masked token, otherwise 0."""
    return 1 if predicted_token == masked_token else 0

In [58]:
#source: https://stackoverflow.com/questions/13331698/how-to-apply-a-function-to-two-columns-of-pandas-dataframe
#score every row of every model: 1 if the predicted auxiliary matches the expected one, else 0
for model_results in (bert_base, bert_large, roberta_base, roberta_large):
    model_results["accuracy"] = model_results[["predicted_token","masked_token"]].apply(
        lambda row: evaluate_performance(row["predicted_token"], row["masked_token"]),
        axis=1)

# Preparation of the Data for the Analysis

In [63]:
#stack the four per-model datasets into one (row labels are kept as they are)
results = pd.concat(
    objs=[
        bert_base,
        bert_large,
        roberta_base,
        roberta_large,
    ]
)

In [64]:
#when model fails to make a prediction insert 0 to the position value
#the empty string is matched literally: read as a regular expression, an empty
#pattern matches every string, so regex mode must not be relied on here
for position_column in ("was_position", "were_position"):
    results[position_column] = results[position_column].replace("", 0).astype(int)

In [65]:
#placeholder columns for the position intervals
results["was_position_interval"] = ""
results["were_position_interval"] = ""

In [67]:
#define position intervals
def assign_position_interval(position):
    """Map a position to its interval number.

    1-5 -> 1, 6-10 -> 2, 11-15 -> 3, 16-20 -> 4; every other value
    (including 0 and anything above 20) -> 5.
    """
    for interval, upper_bound in enumerate((5, 10, 15, 20), start=1):
        if upper_bound - 5 < position <= upper_bound:
            return interval
    return 5

In [68]:
#translate each auxiliary's position into its position interval
for position_column, interval_column in (
    ("was_position", "was_position_interval"),
    ("were_position", "were_position_interval"),
):
    results[interval_column] = results[position_column].apply(assign_position_interval)

# Save the Data

In [69]:
#keep only the columns needed for the analysis, in their final order
column_order = [
    "id", "original_sentence", "transformed_sentence", "relative_pronoun", "number",
    "model", "size", "model_and_size",
    "N1", "N2", "exchanged_N2",
    "masked_token", "predicted_token", "accuracy",
    "was_position", "were_position",
    "was_position_interval", "were_position_interval",
]
results = results[column_order]

In [75]:
#preview the first five rows of the combined results
results.head(n=5)

Unnamed: 0,id,original_sentence,transformed_sentence,relative_pronoun,number,model,size,model_and_size,N1,N2,exchanged_N2,masked_token,predicted_token,accuracy,was_position,were_position,was_position_interval,were_position_interval
0,0,A client looked at the clerk of the solicitor who,A client looked at the clerk of the offices wh...,who,singular,BERT,base,BERT-base,clerk,solicitor,office,was,was,1,8,34,2,5
1,1,A client noticed the hairdresser of the actres...,A client noticed the hairdresser of the salons...,who,singular,BERT,base,BERT-base,hairdresser,actress,salon,was,was,1,6,22,2,5
2,2,A customer frowned at the assistant of the pha...,A customer frowned at the assistant of the lab...,who,singular,BERT,base,BERT-base,assistant,pharmacist,laboratory,was,was,1,6,19,2,4
3,3,A fan looked at the guitarist of the singer who,A fan looked at the guitarist of the bands who...,who,singular,BERT,base,BERT-base,guitarist,singer,band,was,was,1,11,13,3,3
4,4,A man asked for the apprentice of the builder who,A man asked for the apprentice of the colleges...,who,singular,BERT,base,BERT-base,apprentice,builder,college,was,were,0,16,10,4,2


In [75]:
#write the evaluated results to disk without the row labels
output_file_path = "Results/evaluated_data.csv"
results.to_csv(path_or_buf=output_file_path, index=False)