In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Loader for the source PDF. Only the loader object is built here; this cell
# never calls a load method on it.
loader = UnstructuredFileLoader("./files/george_orwell.pdf")

# Splitter settings: break on newlines, chunk size 600 with an overlap of 100
# between neighbouring chunks (sizes are measured through the tiktoken-based
# constructor used below).
splitter_options = dict(separator="\n", chunk_size=600, chunk_overlap=100)
splitter = CharacterTextSplitter.from_tiktoken_encoder(**splitter_options)




In [36]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client used for the experiment below.
embedder = OpenAIEmbeddings()

# Single-query variant, kept for reference:
# vector = embedder.embed_query("Hi")
# print(len(vector))

# Embed several texts in one call; `vector` ends up holding one embedding per
# input text (4 embeddings of length 1536 each in the recorded run).
sample_texts = ["hi", "how", "are", "you longer sentences because"]
vector = embedder.embed_documents(sample_texts)

# First the number of embeddings returned, then the length of each one.
print(len(vector))
for embedding in vector:
    print(len(embedding))

4
1536
1536
1536
1536


In [45]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Local file store under ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines with chunk size 300 and an overlap of 50 between
# neighbouring chunks (smaller chunks than the earlier experiment cell).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=300,
    chunk_overlap=50,
)

loader = UnstructuredFileLoader("./files/george_orwell.pdf")

# Load the PDF and split it with the splitter above in a single call.
docs = loader.load_and_split(text_splitter=splitter)

print(docs)

embeddings = OpenAIEmbeddings()

# Wrap the embedder so results are stored in `cache_dir` and reused.
# The namespace keys cache entries by model name, so switching embedding models
# later cannot return vectors that were produced by a different model.
# NOTE: entries cached before the namespace was introduced live under different
# keys, so the first run after this change will embed the documents again.
# The variable name `cashed_embeddings` (sic, "cached") is kept unchanged in
# case other cells refer to it.
cashed_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Uncached alternative:
# vectorstore = Chroma.from_documents(docs, embeddings)
vectorstore = Chroma.from_documents(docs, cashed_embeddings)



[Document(page_content='george_orwell.md\n2024-07-31\nPart One\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor\ndisplay, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of\na man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for\nthe stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the\nelectric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate\nWeek. The flat was seven flights up, and Winston, who was thirty-nin

In [46]:
# Bare expression: the notebook displays the value of `docs`, the chunk list
# produced by `loader.load_and_split(...)` in the previous cell.
docs

[Document(page_content='george_orwell.md\n2024-07-31\nPart One\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor\ndisplay, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of\na man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for\nthe stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the\nelectric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate\nWeek. The flat was seven flights up, and Winston, who was thirty-nin

In [47]:
# Fetch the 2 chunks most similar to the query.
# The result-count keyword of `similarity_search` is `k`. The earlier
# `top_k=2` is not that parameter; the recorded run did not raise, so the
# limit was apparently not applied.
vectorstore.similarity_search("where is winston live", k=2)

[Document(page_content='george_orwell.md\n2024-07-31\nPart One\n1 It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into\nhis breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions,\nthough not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor\ndisplay, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of\na man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for\nthe stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the\nelectric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate\nWeek. The flat was seven flights up, and Winston, who was thirty-nin