In [1]:
import string

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [2]:
def semanticScore(tokenizer, model, sentences):
    """Return one mean-pooled embedding per sentence as a numpy array.

    Each sentence is encoded with ``tokenizer.encode_plus`` (truncated and
    padded to 128 tokens), the batch is passed through ``model``, and the
    ``last_hidden_state`` is averaged over the token axis using the attention
    mask so that padding positions do not contribute.

    Parameters
    ----------
    tokenizer : object exposing ``encode_plus(...)`` that returns a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    model : callable accepting ``input_ids`` and ``attention_mask`` keyword
        arguments and returning an object with a ``last_hidden_state`` tensor.
    sentences : iterable of str
        Texts to embed. Must be non-empty (``torch.stack`` rejects an empty list).

    Returns
    -------
    numpy.ndarray
        Array with one row per sentence holding its mean-pooled embedding.
    """
    input_ids = []
    attention_masks = []
    for sentence in sentences:
        # encode each sentence to a fixed length so the results can be stacked
        encoded = tokenizer.encode_plus(sentence, max_length=128,
                                        truncation=True, padding='max_length',
                                        return_tensors='pt')
        # return_tensors='pt' yields a leading batch axis of size 1; drop it
        input_ids.append(encoded['input_ids'][0])
        attention_masks.append(encoded['attention_mask'][0])

    # reformat the lists of tensors into single batch tensors
    tokens = {'input_ids': torch.stack(input_ids),
              'attention_mask': torch.stack(attention_masks)}

    # Inference only: disabling autograd avoids building a graph and keeping
    # intermediate activations alive for a result that is detached anyway.
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state
        # broadcast the mask over the hidden axis so padded tokens are zeroed
        mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
        summed = torch.sum(embeddings * mask, 1)
        # clamp guards against division by zero when a mask is all zeros
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / token_counts

    # convert from PyTorch tensor to numpy array (.cpu() is a no-op on CPU
    # tensors and is required before .numpy() for tensors on another device)
    return mean_pooled.detach().cpu().numpy()

In [3]:
# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of on every textClean call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def textClean(text):
    """Return ``text`` lower-cased with ASCII punctuation removed.

    Only the characters in ``string.punctuation`` are stripped; they are
    deleted rather than replaced by a space, so ``"false-positive"`` becomes
    ``"falsepositive"``. Non-ASCII punctuation (for example the curly
    apostrophe) is left in place.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [4]:
def EM(sentences, answer):
    '''
    Bigram exact-match score of each sentence against the reference answer.

    Both the answer and every sentence are normalised with textClean and split
    on whitespace. For each sentence the score is the percentage of its
    consecutive word pairs (bigrams) that also occur among the answer's
    bigrams, rounded to 2 decimals.

    sentences: iterable of str, the candidate texts to score
    answer: str, the reference text
    return: list with one score per sentence; 0 for a sentence that has
            fewer than two tokens (it has no bigrams)
    '''
    answer_tokens = textClean(answer).split()
    # tuples are hashable, so membership tests are O(1) instead of a list scan
    answer_bigrams = set(zip(answer_tokens, answer_tokens[1:]))

    res = []
    for sentence in sentences:
        sentence_tokens = textClean(sentence).split()
        # iterate over token pairs (not over the characters of the string)
        sentence_bigrams = list(zip(sentence_tokens, sentence_tokens[1:]))
        if not sentence_bigrams:
            res.append(0)
            continue
        count = sum(1 for bigram in sentence_bigrams if bigram in answer_bigrams)
        res.append(round(100 * count / len(sentence_bigrams), 2))
    return res
        

In [5]:
def f1(sentences, answer):
    '''
    Token-set F1 score (as a percentage) of each sentence against the answer.

    The answer and each sentence are normalised with textClean, split on
    whitespace and reduced to sets of unique tokens. Precision is the shared
    tokens over the sentence's tokens, recall is the shared tokens over the
    answer's tokens; the harmonic mean is scaled to 0-100 and rounded to 2
    decimals. A sentence sharing no token with the answer scores 0.
    '''
    reference = set(textClean(answer).split())
    scores = []
    for sentence in sentences:
        candidate = set(textClean(sentence).split())
        overlap = len(candidate.intersection(reference))
        if overlap == 0:
            # also covers an empty sentence or an empty answer
            scores.append(0)
            continue
        precision = overlap / len(candidate)
        recall = overlap / len(reference)
        scores.append(round((2 * precision * recall/(precision + recall))*100,2))
    return scores

In [6]:
# Sample answers describing Type I / Type II errors, used as input to the scoring functions.
sentences = ['A Type I error is a false positive (claiming something has happened when it hasn’t), and a Type II error is a false negative (claiming nothing has happened when it actually has).',
            'A type I error (false-positive) occurs if an investigator rejects a null hypothesis that is actually true in the population; a type II error (false-negative) occurs if the investigator fails to reject a null hypothesis that is actually false in the population.',
            'In statistical hypothesis testing, a type I error is the mistaken rejection of an actually true null hypothesis (also known as a "false positive" finding or conclusion; example: "an innocent person is convicted"), while a type II error is the mistaken acceptance of an actually false null hypothesis (also known as a "false negative" finding or conclusion; example: "a guilty person is not convicted").',
            'A Type I error is a false positive, and a Type II error is a false negative.']
# The reference answer is sentences[0]; every entry (including the reference itself) is scored against it.

In [7]:
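# One-time setup, kept commented out: fetch the pretrained tokenizer and model by name
# so that the following cells can save them to ./bert_model for loading from disk.
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
# model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')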

In [8]:
# model.save_pretrained("./bert_model")

In [9]:
# tokenizer.save_pretrained("./bert_model")

In [13]:
# Load the locally saved model and tokenizer from a single directory constant
# so the two paths cannot drift apart.
MODEL_DIR = './bert_model'
model = AutoModel.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [11]:
# Embed every sample answer; each row of the result is the mean-pooled embedding of one sentence.
mean_pooled = semanticScore(tokenizer, model, sentences)
# Shown as the cell output to sanity-check the array dimensions.
mean_pooled.shape

(4, 768)

In [16]:
# Score every sample answer against the reference answer (sentences[0]) with
# three metrics and show them side by side; all scores are on a 0-100 scale.
pd.DataFrame(
    {
        'sample_answers': sentences,
        'Bigram Exact Match': EM(sentences, answer=sentences[0]),
        'F1 score': f1(sentences, answer=sentences[0]),
        # cosine similarity between the reference embedding and every embedding
        'semantic_score': [
            round(similarity*100, 2)
            for similarity in cosine_similarity(mean_pooled[:1], mean_pooled).ravel()
        ],
    }
)

Unnamed: 0,sample_answers,Bigram Exact Match,F1 score,semantic_score
0,A Type I error is a false positive (claiming s...,100.0,100.0,100.0
1,A type I error (false-positive) occurs if an i...,14.63,36.36,81.38
2,"In statistical hypothesis testing, a type I er...",19.05,37.04,81.12
3,"A Type I error is a false positive, and a Type...",93.75,68.97,89.17
