In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus: each string is one "document" that the retriever can return.
corpus = [
    "The cat sat on the mat.",
    "The dog chased the cat.",
    "The cat and dog played together.",
    "The cat is sleeping.",
    "The dog barked loudly.",
]

# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the corpus and build its TF-IDF
# matrix. `vectorizer` and `X` are reused later to score queries against
# these same documents.
X = vectorizer.fit_transform(corpus)


# Function to retrieve documents based on TF-IDF similarity
def retrieve_documents(query, corpus, vectorizer, X, top_n=3):
    """Return the ``top_n`` documents of ``corpus`` most similar to ``query``.

    The query is transformed with ``vectorizer`` and compared to every row of
    ``X`` using cosine similarity; documents are returned from most to least
    similar.

    Args:
        query: The query text.
        corpus: Indexable collection of documents; ``corpus[i]`` is expected
            to be the document represented by row ``i`` of ``X``.
        vectorizer: Fitted vectorizer exposing ``transform``; expected to be
            the one that produced ``X`` so both share a feature space.
        X: Document-term matrix for ``corpus``.
        top_n: Maximum number of documents to return. Values larger than the
            number of documents return all of them; non-positive values
            return an empty list.

    Returns:
        A list of at most ``top_n`` documents, best match first. No minimum
        similarity is enforced, so documents that share no terms with the
        query can still appear when ``top_n`` is large enough.
    """
    # ``scores[-0:]`` is the whole array and a negative ``top_n`` turns into a
    # positive start index, so without this guard ``top_n=0`` would return
    # every document and ``top_n=-2`` would merely drop the two worst ones.
    if top_n <= 0:
        return []

    # Transform the query using the same vectorizer
    query_vector = vectorizer.transform([query])

    # Compute cosine similarity between query and documents
    cosine_similarities = cosine_similarity(query_vector, X).flatten()

    # argsort is ascending: keep the last ``top_n`` indices, then reverse so
    # the most similar document comes first.
    related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]

    # Retrieve the documents
    return [corpus[i] for i in related_docs_indices]


# Example query to run against the toy corpus
query = "What did the cat do?"

# Retrieve the most similar documents (uses the function's default top_n)
retrieved_docs = retrieve_documents(query, corpus, vectorizer, X)

# Display the query followed by one "- <document>" line per result,
# in the order returned by retrieve_documents
print("Query:", query)
print("Retrieved Documents:")
for doc in retrieved_docs:
    print("-", doc)


# Now, let's simulate a simple generation step
def generate_answer(query, retrieved_docs):
    """Build a placeholder "generated" answer for ``query``.

    The retrieved documents are joined with single spaces into one context
    string, which is embedded together with the query in a fixed template.
    The answer text at the end of the template is hard-coded: no language
    model is called, so it does not change with the retrieved documents.

    Args:
        query: The question being answered.
        retrieved_docs: Iterable of document strings to use as context.

    Returns:
        The formatted answer string.
    """
    # Concatenate the retrieved documents into a single context string.
    context = " ".join(retrieved_docs)

    # Stand-in for the generation step: a real RAG system would pass the
    # context and query to a language model here instead of using fixed text.
    answer = f"Based on the context: '{context}', the answer to '{query}' could be: The cat sat on the mat, chased by a dog, and played with the dog."
    return answer


# Produce the simulated answer from the query and the retrieved context
answer = generate_answer(query, retrieved_docs)

# Print the heading (preceded by a blank line) and the answer on separate lines
print("\nGenerated Answer:", answer, sep="\n")

Query: What did the cat do?
Retrieved Documents:
- The dog chased the cat.
- The cat sat on the mat.
- The cat is sleeping.

Generated Answer:
Based on the context: 'The dog chased the cat. The cat sat on the mat. The cat is sleeping.', the answer to 'What did the cat do?' could be: The cat sat on the mat, chased by a dog, and played with the dog.
