## Retrieval-Augmented Generation (RAG)

In [1]:
import pandas as pd
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### 1. TF-IDF Based RAG System

In [2]:
# Toy knowledge base: five short sentences that serve as the retrievable documents.
corpus: List[str] = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the cat and dog played together",
    "the cat is sleeping",
    "the dog barked loudly",
]

In [3]:
# Fit the TF-IDF vectorizer on the corpus and vectorize the corpus in one call.
# `vector_kb` holds the result and is later passed to `retrieve_documents` as `kb`.
vectorizer = TfidfVectorizer()
vector_kb = vectorizer.fit_transform(corpus)

In [4]:
def retrieve_documents(
    query: str,
    corpus: List[str],
    vectorizer,
    kb,
    top_n: int = 3,
) -> List[str]:
    """Return up to ``top_n`` documents from ``corpus`` ranked by similarity to ``query``.

    The query is vectorized with ``vectorizer.transform`` and compared against
    ``kb`` with cosine similarity. Documents are returned from the highest
    score to the lowest. No minimum score is applied, so documents with a
    similarity of zero can appear in the result when fewer than ``top_n``
    documents actually match.

    Args:
        query: Query text to search for.
        corpus: Documents to select results from. ``corpus[i]`` is looked up
            using the position of the i-th similarity score, so it is assumed
            to be in the same order as the rows of ``kb``.
        vectorizer: Object whose ``transform`` method turns a list of strings
            into vectors comparable with ``kb``.
        kb: Vectorized documents, as accepted by ``cosine_similarity``.
        top_n: Maximum number of documents to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of at most ``top_n`` documents, most similar first.
    """
    # Guard: without it, top_n == 0 turns the slice below into [-0:] == [0:],
    # which returns every document, and a negative top_n becomes [k:], which
    # silently drops the best matches.
    if top_n <= 0:
        return []

    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, kb).flatten()
    # argsort is ascending: keep the last top_n positions, then reverse them
    # so the highest-scoring document comes first.
    related_docs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [corpus[i] for i in related_docs_indices]

In [5]:
# Example query; note that "who" does not occur in any corpus document.
query = "the dog chased who ?"

In [6]:
# Show the best matches for the example query (top_n is left at its default).
print(retrieve_documents(query, corpus, vectorizer, kb=vector_kb))

['the dog chased the cat', 'the dog barked loudly', 'the cat and dog played together']
