Question answering system using information retrieval

In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus: each string is one candidate passage that can be returned as an answer
documents = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog was too lazy to chase the fox.",
    "The quick brown fox is an agile animal.",
    "The lazy dog is a common household pet.",
]

# Shared text-normalization resources consumed by the preprocessing step:
# a WordNet lemmatizer and the English stop-word list as a set for O(1) lookups
lemmatizer = nltk.stem.WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [3]:
def preprocess(text):
    """Normalize ``text`` into a space-joined string of content-word lemmas.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic (punctuation, numbers) are dropped, stop
    words are removed, and the survivors are lemmatized with the module-level
    ``lemmatizer``.

    Stop words are filtered *before* lemmatization. With its default noun
    part of speech the WordNet lemmatizer strips the trailing "s" of words
    such as "was" or "has" (giving "wa" / "ha"), which are no longer found
    in ``stop_words`` and would otherwise leak into the TF-IDF vocabulary.
    A second filter after lemmatization still removes tokens whose lemma is
    itself a stop word.

    Args:
        text: Raw input string (a document or a question).

    Returns:
        A single string of the remaining lemmas separated by one space;
        the empty string when no token survives.
    """
    tokens = nltk.word_tokenize(text.lower())

    # Drop punctuation/numbers and stop words while tokens are still in surface form
    content_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

    lemmas = (lemmatizer.lemmatize(t) for t in content_tokens)

    # A non-stop token can lemmatize to a stop word, so filter once more
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [4]:
# Run every passage through the same normalization that questions receive
preprocessed_documents = list(map(preprocess, documents))

# Learn the vocabulary and IDF weights from the corpus, then project the
# passages into TF-IDF space (one row per document)
vectorizer = TfidfVectorizer()
vectorizer.fit(preprocessed_documents)
tfidf_matrix = vectorizer.transform(preprocessed_documents)

In [5]:
# Define a function to perform question answering
def answer_question(question):
    """Retrieve the corpus document most similar to ``question``.

    The question is normalized with ``preprocess``, projected into the TF-IDF
    space of the fitted module-level ``vectorizer``, and compared against
    every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        question: Natural-language question string.

    Returns:
        The original (un-preprocessed) document from ``documents`` with the
        highest cosine similarity to the question, or ``None`` when no
        document has a positive similarity (for example, when the question
        contains no term from the corpus vocabulary).
    """
    # Embed the normalized question with the vocabulary learned from the corpus
    question_vector = vectorizer.transform([preprocess(question)])

    # One similarity score per document; [0] selects the row for our single question
    similarities = cosine_similarity(question_vector, tfidf_matrix)[0]

    best_index = similarities.argmax()

    # With no lexical overlap every score is 0 and argmax() would pick index 0,
    # presenting an arbitrary document as the answer. Report "no match" instead.
    if similarities[best_index] <= 0:
        return None

    # Return the full text of the best-matching document
    return documents[best_index]

In [6]:
# Smoke-test the retrieval pipeline on a sample question
question = "What did the lazy dog do?"
answer = answer_question(question)

# Echo the question alongside the retrieved passage
for label, value in (("Question:", question), ("Answer:", answer)):
    print(label, value)

Question: What did the lazy dog do?
Answer: The dog was too lazy to chase the fox.
