In [1]:

!pip install spacy && python -m spacy download en_core_web_sm

import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy

# Load Spacy NER
nlp = spacy.load('en_core_web_sm')

# Preprocess the user's question
def preprocess(question):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(question)
    filtered_tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w in stop_words]
    return filtered_tokens

# Retrieve the Wikipedia page for the given question
def get_wiki_page(question):
    query = " ".join(preprocess(question))
    try:
        page = wikipedia.page(query)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        return wikipedia.page(e.options[0]).content

# Extract relevant data from the Wikipedia page
def extract_data(page_content):
    doc = nlp(page_content)
    entities = [entity.text for entity in doc.ents if entity.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'WORK_OF_ART', 'LANGUAGE']]
    sentences = sent_tokenize(page_content)
    return entities, sentences

# Answer the user's question using the extracted data
def answer_question(question, extracted_data):
    # Extract entities from the question
    doc = nlp(question)
    print(doc)
    question_entities = [entity.text for entity in doc.ents if entity.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'WORK_OF_ART', 'LANGUAGE']]
    print(question_entities)
    # Compare question with each sentence in the Wikipedia page to find the most relevant sentence
    max_sim = -1
    best_sentence = None
    for sentence in extracted_data[1]:
        sentence_entities = [entity.text for entity in nlp(sentence).ents if entity.label_ in ['PERSON', 'ORG', 'GPE']]
        combined_entities = list(set(question_entities + sentence_entities))
        sentence_tokens = [token.lemma_ for token in nlp(sentence) if not token.is_stop and not token.is_punct]
        combined_tokens = list(set(preprocess(question) + sentence_tokens))
        sentence_embedding = np.mean([nlp(token).vector for token in combined_tokens], axis=0).reshape(1,-1)
        question_embedding = np.mean([nlp(token).vector for token in combined_entities], axis=0).reshape(1,-1)
        sim = cosine_similarity(sentence_embedding, question_embedding)[0][0]
        if sim > max_sim:
            max_sim = sim
            best_sentence = sentence

    # If similarity is high enough, return the best sentence
    if max_sim > 0.2:
        return best_sentence

    # If no match is found, return "Sorry, I don't know the answer."
    return "Sorry, I don't know the answer."

[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl#egg=en_core_web_sm==3.3.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:

exit_conditions = (":q", "quit", "exit")
while True:
    query = input("> ")
    if query in exit_conditions:
        break
    else:
        # Get the Wikipedia page for the question
        page_content = get_wiki_page(query)

        # Extract the relevant data
        extracted_data = extract_data(page_content)
        # print(extracted_data)
        # Answer the user's question
        try:
            answer = answer_question(query, extracted_data)
            print(answer)
        except:
            print('Something went wrong')
        # Display the answer to the user

>  what is an apple?


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


what is an apple?
[]
Something went wrong


>  who owns apple?


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


who owns apple?
[]
Something went wrong


>  what is AI?




  lis = BeautifulSoup(html).find_all('li')


what is AI?
['AI']
The Xi Butterfly Master Trophy was for completing Xi with all 20 butterflies; and the Xi Butterfly Collector Trophy was for completing Xi with at least 10 butterflies.


KeyboardInterrupt: Interrupted by user