In [16]:
import nltk
from nltk.corpus import stopwords

In [17]:
# Sample inputs for the notebook: the reference ("model") answer and the candidate
# answer to be compared against it, both normalised to lower case.
original_model_answer, original_candidate_answer = (
    raw_text.lower()
    for raw_text in (
        "London is the capital city America rebuttal of England",
        "London is the capital city USA of England",
    )
)

In [18]:
# Load NLTK's English stop-word list. removing_stopwords() reads this module-level
# name to decide which tokens to drop.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download("stopwords")) — confirm on a fresh environment.
stop_words=stopwords.words("english")

In [19]:
# Tokenize the model and candidate answers into word-level tokens.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data installed
# locally — confirm on a fresh environment.
from nltk.tokenize import word_tokenize

model_answer_tokenized = word_tokenize(original_model_answer)
candidate_answer_tokenized = word_tokenize(original_candidate_answer)

In [20]:
# Remove stop words from a tokenized model or candidate answer.
def removing_stopwords(tokenized_answer, stopwords_to_remove=None):
    """Return the tokens of ``tokenized_answer`` that are not stop words.

    Args:
        tokenized_answer: Iterable of word tokens (strings).
        stopwords_to_remove: Optional iterable of words to filter out. When
            omitted (``None``), the notebook-level ``stop_words`` list is used,
            which matches the original behaviour.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stopwords_to_remove is None:
        # Fall back to the module-level English stop-word list.
        stopwords_to_remove = stop_words

    # A set gives constant-time membership tests instead of scanning a list per token.
    excluded = set(stopwords_to_remove)
    return [word for word in tokenized_answer if word not in excluded]

In [21]:
# Strip stop words from both tokenized answers with removing_stopwords(), then print
# the filtered candidate tokens followed by the filtered model tokens.
model_answer=removing_stopwords(model_answer_tokenized)
candidate_answer=removing_stopwords(candidate_answer_tokenized)
print(candidate_answer,model_answer)

['london', 'capital', 'city', 'usa', 'england'] ['london', 'capital', 'city', 'america', 'rebuttal', 'england']


In [22]:
# Percentage of the model answer's proper nouns that also occur in the candidate answer.
def proper_noun_match_percentage(candidate_answer,model_answer):
    """Score how many of the model answer's proper nouns the candidate reproduces.

    Both token lists are POS-tagged with ``nltk.pos_tag``; tokens tagged ``NNP``
    (singular proper noun) or ``NNPS`` (plural proper noun) are treated as
    proper nouns.

    Args:
        candidate_answer: List of word tokens from the candidate answer.
        model_answer: List of word tokens from the model (reference) answer.

    Returns:
        A float in ``[0, 100]``: the share of proper-noun tokens in the model
        answer that also appear as proper nouns in the candidate answer.
        Returns ``0.0`` when the model answer contains no proper nouns, since
        there is nothing to match against (this previously raised
        ``ZeroDivisionError``).

    NOTE(review): this notebook lower-cases both answers before tokenizing; the
    tagger may label few or no lower-cased tokens as proper nouns — confirm
    whether the original casing should be passed in instead.
    """
    proper_noun_tags = {"NNP", "NNPS"}

    # Extract the proper-noun tokens (in order, duplicates kept) from a token list.
    def getting_proper_noun(tokens):
        return [word for word, tag in nltk.pos_tag(tokens) if tag in proper_noun_tags]

    model_answer_proper_nouns = getting_proper_noun(model_answer)
    if not model_answer_proper_nouns:
        return 0.0

    candidate_answer_proper_nouns = set(getting_proper_noun(candidate_answer))

    # Count from the model side so a proper noun repeated in the candidate cannot
    # push the score above 100%.
    proper_noun_match_count = sum(
        1 for word in model_answer_proper_nouns if word in candidate_answer_proper_nouns
    )

    return (proper_noun_match_count / len(model_answer_proper_nouns)) * 100

In [23]:
from nltk.corpus import wordnet

# Percentage of model-answer words found in the candidate answer, either directly or
# through a WordNet synonym.
def word_match_percentage(candidate_answer,model_answer):
    """Score how many model-answer words the candidate answer covers.

    A model word counts as matched when it appears verbatim in the candidate
    answer, or when any WordNet lemma name of any of its synsets appears in the
    candidate answer (compared case-insensitively, because WordNet lemma names
    such as ``USA`` keep their capitalisation).

    Args:
        candidate_answer: List of word tokens from the candidate answer.
        model_answer: List of word tokens from the model (reference) answer.

    Returns:
        A float in ``[0, 100]``: matched model words divided by the number of
        model words, times 100. Returns ``0.0`` for an empty model answer
        (this previously raised ``ZeroDivisionError``).
    """
    if not model_answer:
        return 0.0

    candidate_words = set(candidate_answer)
    candidate_words_lower = {word.lower() for word in candidate_answer}

    # True if any WordNet synonym of `word` occurs in the candidate answer.
    def has_synonym_in_candidate(word):
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name().lower() in candidate_words_lower:
                    return True
        return False

    # Each model word contributes at most one match, so several synonym hits for
    # the same word cannot inflate the score past 100%.
    matched_words_count = sum(
        1
        for word in model_answer
        if word in candidate_words or has_synonym_in_candidate(word)
    )

    return (matched_words_count / len(model_answer)) * 100

In [25]:
from nltk.util import bigrams
# Bigram similarity: share of the model answer's bigrams that also occur in the candidate answer.
def bigram_similarity_percentage(candidate_answer,model_answer):
    """Score how many of the model answer's bigrams the candidate answer contains.

    A bigram is a pair of adjacent tokens, in order.

    Args:
        candidate_answer: Sequence of word tokens from the candidate answer.
        model_answer: Sequence of word tokens from the model (reference) answer.

    Returns:
        A float in ``[0, 100]``: the number of model bigrams present in the
        candidate, divided by the number of model bigrams, times 100. Returns
        ``0.0`` when the model answer has fewer than two tokens and therefore
        no bigrams (this previously raised ``ZeroDivisionError``).
    """
    # Pair each token with its successor; equivalent to nltk.util.bigrams for a
    # finite token sequence.
    def to_bigrams(tokens):
        tokens = list(tokens)
        return list(zip(tokens, tokens[1:]))

    model_answer_bigram_tokenized = to_bigrams(model_answer)
    if not model_answer_bigram_tokenized:
        return 0.0

    candidate_answer_bigrams = set(to_bigrams(candidate_answer))

    # Count from the model side so bigrams repeated in the candidate cannot push
    # the score above 100%.
    bigram_similarity_count = sum(
        1 for bigram in model_answer_bigram_tokenized if bigram in candidate_answer_bigrams
    )

    return (bigram_similarity_count / len(model_answer_bigram_tokenized)) * 100

In [41]:
# Cosine similarity between the keyword sets of the candidate and model answers.
def cosine_similarity(candidate_answer,model_answer):
    """Compute the cosine similarity of two answers viewed as binary keyword vectors.

    Each answer is reduced to its set of distinct tokens. With 0/1 vectors over
    the union vocabulary, the dot product is the number of shared keywords and
    each vector's norm is the square root of its keyword count.

    Args:
        candidate_answer: Iterable of word tokens from the candidate answer.
        model_answer: Iterable of word tokens from the model (reference) answer.

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when either answer has no
        tokens (this previously raised ``ZeroDivisionError``).

    Side effects:
        Prints the score, as the original implementation did; the value is now
        also returned so callers can use it like the other similarity metrics.
    """
    candidate_keywords = set(candidate_answer)
    model_keywords = set(model_answer)

    if not candidate_keywords or not model_keywords:
        # An empty vector has zero norm; define the similarity as 0 instead of dividing by zero.
        cosine = 0.0
    else:
        shared_count = len(candidate_keywords & model_keywords)
        cosine = shared_count / float((len(candidate_keywords) * len(model_keywords)) ** 0.5)

    print("similarity: ", cosine)
    return cosine

In [47]:
# Jaccard similarity between the keyword sets of the candidate and model answers.
def jaccard_similarity(candidate_answer,model_answer):
    """Compute the Jaccard index of the two answers' distinct tokens.

    Args:
        candidate_answer: Iterable of word tokens from the candidate answer.
        model_answer: Iterable of word tokens from the model (reference) answer.

    Returns:
        ``|intersection| / |union|`` as a float in ``[0, 1]``. Returns ``0.0``
        when both answers are empty (this previously raised
        ``ZeroDivisionError``).
    """
    candidate_keywords = set(candidate_answer)
    model_keywords = set(model_answer)

    keywords_union = candidate_keywords | model_keywords
    if not keywords_union:
        return 0.0

    keywords_intersection = candidate_keywords & model_keywords
    return len(keywords_intersection) / len(keywords_union)

In [57]:
# Dice similarity between the keyword sets of the candidate and model answers.
def dice_similarity(candidate_answer,model_answer):
    """Compute the Sorensen-Dice coefficient of the two answers' distinct tokens.

    Both the intersection and the denominator are taken over the sets of
    distinct tokens. The original divided by the raw list lengths, so repeated
    tokens lowered the score even when the two vocabularies were identical.

    Args:
        candidate_answer: Iterable of word tokens from the candidate answer.
        model_answer: Iterable of word tokens from the model (reference) answer.

    Returns:
        ``2 * |intersection| / (|candidate set| + |model set|)`` as a float in
        ``[0, 1]``. Returns ``0.0`` when both answers are empty (this
        previously raised ``ZeroDivisionError``).
    """
    candidate_keywords = set(candidate_answer)
    model_keywords = set(model_answer)

    total_keywords = len(candidate_keywords) + len(model_keywords)
    if total_keywords == 0:
        return 0.0

    keywords_intersection = candidate_keywords & model_keywords
    return (2 * len(keywords_intersection)) / total_keywords