In [198]:
import numpy as np
import spacy
import re

# Load the spaCy pipeline once; the cells below pass this object around as `nlp`
# and also call it directly to obtain token/entity annotations and vectors.
nlp = spacy.load("en_core_web_md")

### process utils

In [42]:
# remove stopwords to avoid the same entities mapping differently
def preprocess(sentence, nlp):
    """Parse ``sentence`` and return its non-stopword tokens as merged tuples.

    Args:
        sentence: Raw text to analyse.
        nlp: Callable that turns text into an iterable of tokens exposing
            ``is_stop``, ``text`` and ``ent_type_`` (the spaCy pipeline in
            this notebook).

    Returns:
        Whatever ``merge_entities_after_stopwords`` produces for the tokens
        of ``sentence`` whose ``is_stop`` flag is false: a list of
        ``(text, entity_label)`` tuples.
    """
    doc = nlp(sentence)

    # Stopwords are removed before merging so that an entity written with or
    # without filler words (e.g. "25th of december") yields the same text.
    filtered_tokens = [token for token in doc if not token.is_stop]

    return merge_entities_after_stopwords(filtered_tokens)

# merges split entities after stopword removal
def merge_entities_after_stopwords(doc):
    """Collapse runs of consecutive same-label entity tokens into one entry.

    Args:
        doc: Iterable of tokens exposing ``text``, ``ent_type_`` and
            ``is_stop``. Tokens flagged as stopwords are skipped, so the
            function can be fed either a raw or an already filtered sequence.

    Returns:
        A list of ``(text, label)`` tuples in token order. Consecutive entity
        tokens that share a label are joined with a single space; tokens
        without an entity label are emitted as ``(text, "")``.
    """
    merged_tokens = []
    current_entity = None  # (text, label) of the entity run being built

    for token in doc:
        if token.is_stop:
            continue  # Skip stopwords

        label = token.ent_type_

        # Extend the open run only when the label matches; a different label
        # means a different entity, which must not inherit this token's type.
        if label and current_entity is not None and current_entity[1] == label:
            current_entity = (current_entity[0] + " " + token.text, label)
            continue

        # Any other token closes the open run (if there is one) ...
        if current_entity is not None:
            merged_tokens.append(current_entity)
            current_entity = None

        # ... and is then recorded itself, so the token that follows an
        # entity is no longer lost.
        if label:
            current_entity = (token.text, label)
        else:
            merged_tokens.append((token.text, ""))

    if current_entity is not None:
        merged_tokens.append(current_entity)

    return merged_tokens

# keeps only the answer entities that the question did not already contain
def new_information(extracted_ents, question_ents):
    """Return the items of ``extracted_ents`` that are absent from ``question_ents``.

    Order and duplicates of ``extracted_ents`` are preserved; membership is
    decided with the ``in`` operator on ``question_ents``.
    """
    fresh = []
    for candidate in extracted_ents:
        if candidate in question_ents:
            continue
        fresh.append(candidate)
    return fresh

In [297]:
def extract_closed_answer(answer, nlp):
    """Classify a free-text reply to a yes/no question as ``'yes'`` or ``'no'``.

    The reply counts as ``'no'`` when either

    * one of its tokens is the word "no" or "not" (case-insensitive), or
    * a token with dependency label ``neg`` is attached directly to a
      ``ROOT`` token (this is what catches contractions such as "isn't").

    The word test is done per token rather than as a substring search on the
    whole string, so words that merely contain the letters "no" ("know",
    "now", "Norway", ...) no longer flip the result.

    Args:
        answer: The reply text.
        nlp: Callable returning an iterable of tokens exposing ``lower_``,
            ``dep_`` and ``head`` (the spaCy pipeline in this notebook).

    Returns:
        ``'no'`` if a negation is detected, otherwise ``'yes'`` (including
        for empty input).
    """
    negation_words = {"no", "not"}

    for token in nlp(answer):
        if token.lower_ in negation_words:
            return 'no'
        # A negation modifier counts only when it hangs off a sentence root.
        if token.dep_ == "neg" and token.head.dep_ == "ROOT":
            return 'no'
    return 'yes'


def extract_open_answer(question, answer, nlp):
    """Pull the entities out of ``answer`` that respond to a who/where/when question.

    The question is scanned for the whole words "who", "where" and "when"
    (case-insensitive). For each one present, the answer entities carrying a
    matching label are collected, minus any entity tuple that already occurs
    in the question. Whole-word matching keeps words such as "whole",
    "somewhere" or "whenever" from triggering a question type.

    Args:
        question: The question text.
        answer: The answer text.
        nlp: Pipeline object forwarded to ``preprocess``.

    Returns:
        A list of ``(text, label)`` tuples, grouped in the order
        who -> where -> when. Empty when the question contains none of the
        three words or no suitable new entity is found.
    """
    # Entity labels accepted as an answer for each question word.
    labels_by_question_word = (
        ('who', ('PERSON',)),
        ('where', ('GPE', 'LOC', 'FAC', 'ORG')),
        ('when', ('DATE', 'TIME', 'EVENT')),
    )

    question_ents = preprocess(question, nlp)
    answer_ents = preprocess(answer, nlp)

    # Lower-case alphabetic words of the question, computed once.
    question_words = set(re.findall(r"[a-z]+", question.lower()))

    entity_extraction = []
    for question_word, labels in labels_by_question_word:
        if question_word not in question_words:
            continue
        candidates = [ent for ent in answer_ents if ent[1] in labels]
        entity_extraction += new_information(candidates, question_ents)

    return entity_extraction

In [321]:
# Free-text replies fed one by one to extract_closed_answer in the loop below.
closed_answers = [ "Managua is not the capital of Nicaragua",
                  "Paris isn't located in France",
                 "No.",
                  "It is thought that Managua is the capital, but this is not the case.",
                  "Yes, Managua is the capital of Nicaragua",
                  "Currently, the president of the US is Biden",
                  "Most people think Managua is the capital of Nicaragua. However, Managua is not the capital of Nicaragua."
                ]

# Questions for extract_open_answer; paired by position with `answers` via zip below.
questions = [
            "When can we expect the package?",
            "When is it Christmas?",
            "Where is The Netherlands located ?",
            "Who is the president of the US?",
            "What is the capital of Nicaragua?",
             "What is the capital of Nicaragua and its population?",
              "the capital of nicaragua is...",
             "Managua is not the capital of Nicaragua. Yes or no?",
            ]

# Answers matching `questions` index by index (both lists hold eight entries).
answers = [
            "The package is expected around Easter.",
            "Christmas is celebrated on 25th of december.",
            "The Netherlands is located in Europe.",
            "Joe Biden is the president of the US.",
            "Managua is the capital of Nicaragua.",
           "Managua is the capital of Nicaragua. The population of Managua is 1.3 million people.",
           "Prior to 1979, Nicaragua was known as the Republic of Nicaragua. It is a republic with a presidential system of government. The capital of Nicaragua is Managua.",
           "Most people think Managua is the capital of Nicaragua. However, Managua is not the capital of Nicaragua.",
          ]

# Closed (yes/no) replies: show each reply next to the label it receives.
for answer in closed_answers:
    label = extract_closed_answer(answer, nlp)
    print(f"{answer}: ", label)

print("\n")
# Open questions: show the entities extracted for every question/answer pair.
for question, answer in zip(questions, answers):
    extracted = extract_open_answer(question, answer, nlp)
    print(extracted)

Managua is not the capital of Nicaragua:  no
Paris isn't located in France:  no
No.:  no
It is thought that Managua is the capital, but this is not the case.:  no
Yes, Managua is the capital of Nicaragua:  yes
Currently, the president of the US is Biden:  yes
Most people think Managua is the capital of Nicaragua. However, Managua is not the capital of Nicaragua.:  no


[('Easter', 'DATE')]
[('25th december', 'DATE')]
[('Europe', 'LOC')]
[('Joe Biden', 'PERSON')]
[]
[]
[]
[]


## Vector utils

In [322]:
def vectorize_nouns(nouns):
    """Add up the ``vector`` attribute of every item in ``nouns``.

    Args:
        nouns: Iterable of objects exposing a ``vector`` attribute that
            supports ``+`` (e.g. numpy arrays).

    Returns:
        The element-wise sum of all vectors, or the integer ``0`` when
        ``nouns`` is empty.
    """
    # Iterate over the parameter itself; the earlier body looped over an
    # undefined name `tokens`, so it only ran if a stale global existed.
    return sum(token.vector for token in nouns)

def similarity(vec1, vec2):
    """Cosine similarity between two vectors.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the cosine is undefined; ``0.0`` is returned in that case
        instead of NaN, because a NaN score would be selected by
        ``np.argmax`` ahead of every real score.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator


In [323]:
question = "What is the capital of Nicaragua?"
answer = "Managua is the capital of Nicaragua. The population of Managua is 1.3 million people."

q = nlp(question)
a = nlp(answer)

# Distinct noun / proper-noun surface forms found in the question.
q_nouns_text = list(set([tok.text for tok in q if tok.pos_ in ["NOUN", "PROPN"]]))
q_nouns_text_lower = [noun.lower() for noun in q_nouns_text]

# Keep only answer entities that do not contain a question noun. A plain
# substring test is used instead of re.search(noun, ...) so that nouns with
# regex metacharacters are matched literally, and no variable named `filter`
# is created (that would shadow the builtin for the rest of the kernel).
a_ents = [ent.text for ent in a.ents]
a_ents = [
    ent for ent in a_ents
    if not any(noun in ent.lower() for noun in q_nouns_text_lower)
]

# Question representation: sum of the vectors of its nouns.
q_vectorized = np.sum([nlp(noun).vector for noun in q_nouns_text], axis=0)

# np.argmax raises ValueError on an empty sequence, so rank only when at
# least one candidate entity is left.
if a_ents:
    idx = np.argmax([similarity(q_vectorized, nlp(ent).vector) for ent in a_ents])
    best_entity = a_ents[idx]
else:
    idx = None
    best_entity = None

best_entity

'1.3 million'