In [1]:
import nltk

In [84]:
#getting percentage of proper nouns matched 
def proper_noun_match_percentage(candidate_answer,model_answer):
    #pos tagging both answers
    candidate_answer_POS=nltk.pos_tag(candidate_answer)
    model_answer_POS=nltk.pos_tag(model_answer)
    
    #function to get proper nouns from an answer
    def getting_proper_noun(answer):
        proper_nouns=[]
        for word in answer:
            if (word[1] == 'NNP' or word[1] == 'NNPS'):
                proper_nouns.append(word[0])
        return proper_nouns
    
    #calling above defined function to get proper nouns in both candidate and model answer
    candidate_answer_proper_nouns = getting_proper_noun(candidate_answer_POS)
    model_answer_proper_nouns = getting_proper_noun(model_answer_POS)
    
    #checking the number of proper nouns in model answer
    proper_noun_count = len(model_answer_proper_nouns)
    
    #variable that will count the number of proper nouns that match
    proper_noun_match_count = 0
    
    #finding the number of proper nouns that match
    for word in candidate_answer_proper_nouns:
        if word in model_answer_proper_nouns:
            proper_noun_match_count += 1
    
    #calculating percentage of proper nouns that match.
    if proper_noun_count != 0:
        proper_noun_match_percentage = ( proper_noun_match_count / proper_noun_count) * 100
    else:
        proper_noun_match_percentage = 100  
    return proper_noun_match_percentage

In [282]:
from nltk.corpus import wordnet

#calculating percentage of words that match
def word_match_percentage(candidate_answer,model_answer):
    unmatched_words=[]
    matched_words_count=0
    
    #checks for words that macth and increase matched_words_count by 1. The words that don't match are inserted into unmatched_words list 
    for word in model_answer:
        if word in candidate_answer:
            matched_words_count += 1
        else:
            unmatched_words.append(word)

    #find synonyms for unmatched words
    unmatched_words_synonyms = []
    for word in unmatched_words:
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                unmatched_words_synonyms.append(l.name())
    
    #checking if the synonyms of unmatched words match with words in candidate answer, and if it does increase matched_words_count
    for word in candidate_answer:
        if word in unmatched_words_synonyms:
            matched_words_count += 1
            
    #finding the precentage of similair words in candidate answer compared to model answer        
    model_answer_length = len(model_answer)
    word_match_percentage = ( matched_words_count / model_answer_length) * 100
    
    return word_match_percentage

In [26]:
from nltk.util import bigrams
#Bigram similarity
def bigram_similarity_percentage(candidate_answer,model_answer):
    candidate_answer_bigram_tokenized = []
    model_answer_bigram_tokenized = []
    
    #getting bigrams of both model and candidate answer
    candidate_bigram = bigrams(candidate_answer)
    model_bigram = bigrams(model_answer)
    
    for w in candidate_bigram:
        candidate_answer_bigram_tokenized.append(w)
        
    for w in model_bigram:
        model_answer_bigram_tokenized.append(w)

    
    #checking the number of bigrams in model answer
    bigrams_count = len(model_answer_bigram_tokenized)
    
    #variable that will count the number of bigrams that are present in both answers
    bigram_similarity_count = 0
    
    #finding the number of bigrams that match
    for bigram in candidate_answer_bigram_tokenized:
        if bigram in model_answer_bigram_tokenized:
            bigram_similarity_count += 1
    
    #calculating percentage of bigrams that match
    bigram_similarity_percentage = ( bigram_similarity_count / bigrams_count) * 100
    
    return bigram_similarity_percentage

In [27]:
#cosine similarity
def cosine_similarity(candidate_answer,model_answer):

    # form a list containing keywords of both strings. Basically union of candidate_answer and model_answer  
    keywords_union = list(set(candidate_answer) | set(model_answer))
    A =[];B =[] 
    for w in keywords_union: 
        if w in candidate_answer: A.append(1) # create a vector 
        else: A.append(0) 
        if w in model_answer: B.append(1) 
        else: B.append(0) 
    c = 0

    # cosine formula  
    for i in range(len(keywords_union)): 
            c+= A[i]*B[i] 
    cosine = c / float((sum(A)*sum(B))**0.5) 
    return cosine

In [28]:
#jaccard similarity
def jaccard_similarity(candidate_answer,model_answer):
    
    #getting intersection of candidate_answer and model_answer
    keywords_intersection = list(set(candidate_answer) & set(model_answer)) 
    
    #form a list containing keywords of both strings. Basically union of candidate_answer and model_answer  
    keywords_union = list(set(candidate_answer) | set(model_answer))
    
    return (len(keywords_intersection) / len(keywords_union))

In [7]:
#Dice similarity
def dice_similarity(candidate_answer,model_answer):
    
    #getting intersection of candidate_answer and model_answer
    keywords_intersection = list(set(candidate_answer) & set(model_answer))
    
    return ((2 * len(keywords_intersection)) / (len(candidate_answer) + len(model_answer)))

# Preparation of answers

In [158]:
#Getting candidate answer through voice input
import speech_recognition as sr

r = sr.Recognizer()
with sr.Microphone() as source:
    print("Give your answer!")
    audio = r.listen(source)
    
try:
    original_candidate_answer = r.recognize_google(audio) 
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Speech Recognition service; {0}".format(e))

Give your answer!


In [254]:
original_model_answer="Hertfordshire is one of the home counties in southern England. It is bordered by Bedfordshire and Cambridgeshire to the north, Essex to the east, Greater London to the south, and Buckinghamshire to the west"
original_candidate_answer

'Hertfordshire am one of the home counties on northern England. It am bordered by Bedfordshire and Oxfordshire to the north, Essex to the east, Greater London to the south, and Buckinghamshire to the west'

In [255]:
#getting all the stopwords in english
from nltk.corpus import stopwords
stop_words=stopwords.words("english")

In [256]:
#tokenize model and candiadte answer
from nltk.tokenize import word_tokenize

model_answer_tokenized = word_tokenize(original_model_answer)
candidate_answer_tokenized = word_tokenize(original_candidate_answer)

In [257]:
print("Tokenized candidate answer ->", candidate_answer_tokenized)
print("Tokenized model answer ->", model_answer_tokenized)

Tokenized candidate answer -> ['Hertfordshire', 'am', 'one', 'of', 'the', 'home', 'counties', 'on', 'northern', 'England', '.', 'It', 'am', 'bordered', 'by', 'Bedfordshire', 'and', 'Oxfordshire', 'to', 'the', 'north', ',', 'Essex', 'to', 'the', 'east', ',', 'Greater', 'London', 'to', 'the', 'south', ',', 'and', 'Buckinghamshire', 'to', 'the', 'west']
Tokenized model answer -> ['Hertfordshire', 'is', 'one', 'of', 'the', 'home', 'counties', 'in', 'southern', 'England', '.', 'It', 'is', 'bordered', 'by', 'Bedfordshire', 'and', 'Cambridgeshire', 'to', 'the', 'north', ',', 'Essex', 'to', 'the', 'east', ',', 'Greater', 'London', 'to', 'the', 'south', ',', 'and', 'Buckinghamshire', 'to', 'the', 'west']


In [258]:
#remove stop words from model and candiadte answer
def removing_stopwords(tokenized_answer):
    filtered_sentence=[]
    for word in tokenized_answer:
         if word not in stop_words:
                filtered_sentence.append(word)
    return filtered_sentence

In [259]:
#removing stopwords from both model and candidate answer using above created function
model_answer=removing_stopwords(model_answer_tokenized)
candidate_answer=removing_stopwords(candidate_answer_tokenized)
print("Candidate answer without stopwords ->", candidate_answer)
print("Model answer without stopwords ->", model_answer)

Candidate answer without stopwords -> ['Hertfordshire', 'one', 'home', 'counties', 'northern', 'England', '.', 'It', 'bordered', 'Bedfordshire', 'Oxfordshire', 'north', ',', 'Essex', 'east', ',', 'Greater', 'London', 'south', ',', 'Buckinghamshire', 'west']
Model answer without stopwords -> ['Hertfordshire', 'one', 'home', 'counties', 'southern', 'England', '.', 'It', 'bordered', 'Bedfordshire', 'Cambridgeshire', 'north', ',', 'Essex', 'east', ',', 'Greater', 'London', 'south', ',', 'Buckinghamshire', 'west']


# Calling functions to evaluate

In [283]:
proper_noun_match = proper_noun_match_percentage(candidate_answer,model_answer)
word_match = word_match_percentage(candidate_answer,model_answer)
bigram_match = bigram_similarity_percentage(candidate_answer,model_answer) 
cosine = cosine_similarity(candidate_answer,model_answer)
jaccard = jaccard_similarity(candidate_answer,model_answer) 
dice = dice_similarity(candidate_answer,model_answer)

In [284]:
proper_noun_match

87.5

In [285]:
word_match

90.9090909090909

In [286]:
bigram_match

80.95238095238095

In [287]:
cosine

0.9

In [288]:
jaccard

0.8181818181818182

In [289]:
dice

0.8181818181818182

## Assigning different weights to the different methods and calculating total mark
#### proper_noun_match_percentage -> 30% of total marks
#### word_match_percentage -> 30% of total marks
#### bigram_similarity_percentage -> 25% of total marks
#### cosine_similarity -> 5% of total marks
#### jaccard_similarity -> 5% of total marks
#### dice_similarity -> 5% of total marks

In [290]:
#final_mark for each question is given out of 100
final_mark = (proper_noun_match * 0.3) + (word_match * 0.3) + (bigram_match * 0.25) + ((cosine + jaccard + dice) * 5)

In [291]:
final_mark

86.4426406926407

## Grammar Check 

In [273]:
import requests

In [292]:

#checking the grammar of the original candidate answer using textgears API.
response = requests.get("https://api.textgears.com/grammar?text="+original_candidate_answer+"&language=en-GB&key=hRgDhKSuk5zLKmw4")

#grammatical errors present in answer
response.json()['response']['errors']

[{'id': 'e151357845',
  'offset': 14,
  'length': 2,
  'description': {'en': 'Consider using third-person verb forms for singular and mass nouns: "is".'},
  'bad': 'am',
  'better': ['is'],
  'type': 'grammar'},
 {'id': 'e1621268180',
  'offset': 66,
  'length': 2,
  'description': {'en': 'Did you mean "is"?'},
  'bad': 'am',
  'better': ['is'],
  'type': 'grammar'},
 {'id': 'e1427092122',
  'offset': 199,
  'length': 4,
  'description': {'en': 'Please add a punctuation mark at the end of paragraph'},
  'bad': 'west',
  'better': ['west.', 'west!', 'west?', 'west:', 'west,', 'west;'],
  'type': 'grammar'}]

In [293]:
#getting number of identified errors
no_of_errors = len(response.json()['response']['errors'])

no_of_errors

3

In [294]:
#List of all the words in the original answer
original_candidate_answer_wordlist = original_candidate_answer.split()

#number of words in the original answer
no_of_words = len(original_candidate_answer_wordlist)

no_of_words

34

In [295]:
#percentage of grammatical errors in original candidate answer
grammar_error_percentage = ( no_of_errors / no_of_words ) * 100

grammar_error_percentage

8.823529411764707

### Reducing grammar error percentage from total marks

In [296]:
final_mark -= grammar_error_percentage

final_mark


77.61911128087598

In [297]:
#If final mark is less than zero it is converted to zero
if final_mark < 0:
    final_mark = 0

In [298]:
final_mark

77.61911128087598