In [33]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings 
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

## Data Indexing
- Read the source document `Transformer.pdf`
- Split the text into chunks
- Create the embeddings for the document chunks
- Store the embeddings in the Chroma DB


In [34]:
# Data indexing: PDF -> text chunks -> OpenAI embeddings -> persisted Chroma store.
DOC_PATH = "Transformer.pdf"
CHROMA_PATH = "chromadb"

# The key is read from the environment rather than hard-coded; a missing
# variable raises KeyError here, before any document work is done.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Load the PDF and split it into overlapping chunks. This always runs so that
# `pages` and `chunks` remain available to later cells in either branch below.
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)

# Embedding client from langchain_openai; used both for indexing and for
# embedding queries against a reopened store.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Calling Chroma.from_documents on a persist directory that already holds the
# collection adds the same chunks again (new ids each time), so every re-run of
# this cell would duplicate the index and re-pay for the embeddings. Reuse the
# persisted store when one is present and only build it on the first run.
# NOTE: delete CHROMA_PATH to force a rebuild after changing the PDF or the
# splitter settings -- the reopened store is not refreshed automatically.
if os.path.isdir(CHROMA_PATH) and os.listdir(CHROMA_PATH):
    db_chroma = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
    print("Existing vector store loaded from:", CHROMA_PATH)
else:
    db_chroma = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=CHROMA_PATH)
    print("Vector store created and automatically persisted at:", CHROMA_PATH)


Vector store created and automatically persisted at: chromadb
