In [6]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA 
import numpy as np
import os 

# The OpenAI API key is read from the OPENAI_API_KEY environment variable.
# Environment values must be strings, so the variable cannot be assigned None;
# instead, require that it was exported before this script runs and stop early
# with a clear message when it is missing or empty.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell (or set it in "
        "os.environ as a string) before running this script."
    )

# * 1. Convert document into embeddings (vectors) using an embedding model.
# * 2. Store vectors in a vector database (FAISS, ChromaDB, Pinecone, etc.).
# * 3. Perform similarity search: retrieve the top-k most similar vectors and return their indices and distances.
# * 4. Pass retrieved context into LLM (e.g., GPT-4) via a prompt.

# Read data.txt through the text loader.
loader = TextLoader("data.txt")
documents = loader.load()

# Chunking settings are kept together and unpacked into the splitter,
# which then breaks the loaded documents into overlapping chunks.
splitter_kwargs = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
docs = text_splitter.split_documents(documents)

# Embed every chunk with the OpenAI embedding model and index the
# resulting vectors in a FAISS vector store.
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(docs, embeddings)

# Define the query text
query = "Who is Tom ?"

# Convert query into an embedding vector
query_vector = embeddings.embed_query(query)

# Perform similarity search directly on the underlying FAISS index.
# FAISS operates on float32 matrices of shape (n_queries, dim), so the dtype
# is set explicitly instead of relying on NumPy's float64 default.
k = 3  # Number of top results to retrieve
distances, indices = vector_store.index.search(
    np.array([query_vector], dtype=np.float32), k
)

# Pair each distance with its chunk. When the index holds fewer than k
# vectors FAISS pads the result with label -1; those slots are skipped so
# that docs[-1] is never returned as a spurious match.
# NOTE(review): docs[idx] assumes index positions follow the order of `docs`
# as passed to FAISS.from_documents — confirm if the store is ever modified.
hits = [
    (dist, docs[idx])
    for dist, idx in zip(distances[0], indices[0])
    if idx != -1
]

# Retrieve matched documents
retrieved_docs = [doc for _, doc in hits]

for rank, (dist, doc) in enumerate(hits, start=1):
    print(f"Result {rank}:")
    print(f"Distance: {dist}")
    print(f"Document: {doc.page_content}\n")
"""
1. The query is embedded into a high-dimensional vector
2. FAISS performs a similarity search and returns:
    (1) indices: The indexes of the closest documents
    (2) distances: The distance from the query to each match (lower = more similar).
3. We retrieve the corresponding documents and display their distances.
"""

# Step 3: Pass Retrieved Chunks into LLM Prompt
from langchain.chat_models import ChatOpenAI

# Initialize GPT-4 model
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Format retrieved documents into a single string, one blank line between chunks
retrieved_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Construct the prompt with retrieved context
prompt = f"""
Use the following retrieved information to answer the query:
{retrieved_text}
Question: {query}
Answer:
"""

# Generate response using GPT-4.
# invoke() is used instead of the deprecated predict(); for a chat model it
# returns a message object, and .content holds the reply text.
response = llm.invoke(prompt).content
print("GPT-4 Response:", response)

Result 1:
Distance: 0.4170105755329132
Document: And what of Tom Lefroy? He returned to London and in 1796 became engaged to a Miss Mary Paul, the sister of a friend; they married two years later. He rose through the ranks to become Lord Chief Justice of Ireland. Years later, when asked about Jane Austen, he said that he had loved her, although he qualified this by saying it was ‘a boyish love’.

Result 2:
Distance: 0.42943987250328064
Document: Jane Austen famously never married, but she did have admirers. The best known of these is Tom Lefroy, a clever young Irishman whom she met in December 1795. He had moved to London to study law and was spending the Christmas holidays with his uncle and aunt who lived in Ashe, near Steventon.

Jane had just turned 20 and was a bright, lively, pretty girl. Like her much-loved heroine Elizabeth Bennet, she enjoyed music and dancing, wit, laughter and lively conversation.

Result 3:
Distance: 0.43448787927627563
Document: Although Tom only stayed in

  llm = ChatOpenAI(model_name="gpt-4", temperature=0)
  response = llm.predict(prompt)


GPT-4 Response: Tom Lefroy is a clever young Irishman who moved to London to study law. He is known as one of Jane Austen's admirers. He later became engaged to Miss Mary Paul and eventually rose through the ranks to become Lord Chief Justice of Ireland.
