In [49]:
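# One-time setup: uncomment and run these to install the dependencies into the kernel's environment.
# %pip install langchain qdrant_client openai tiktoken
# %pip install -U langchain-openai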

In [50]:
from dotenv import load_dotenv
import streamlit as st
from langchain.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from qdrant_client import QdrantClient, models
import qdrant_client
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

In [51]:
# Embedding handle configured for OpenAI's "text-embedding-3-large" model.
# NOTE(review): none of the cells visible here read `model`; the vector-store
# cell constructs its own OpenAIEmbeddings instance — confirm whether this is still needed.
model = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

In [52]:
# Connect to the Qdrant instance; the host and API key are read from Streamlit secrets
# so that no credentials are stored in the notebook itself.
client = QdrantClient(
    st.secrets["QDRANT_HOST"],
    api_key=st.secrets["QDRANT_API_KEY"],
)



In [53]:
# Drop the collection named in Streamlit secrets so the next cell can create it from scratch.
# This is destructive: it is issued unconditionally every time the cell runs.
client.delete_collection(
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
)

True

In [54]:
# Create the collection that will hold the document embeddings.
vectors_config = models.VectorParams(
    # Vector dimensionality. It has to match the embedding model that writes to this
    # collection (per the original note: 1536 for text-embedding-3-small,
    # possibly 3072 for text-embedding-3-large — TODO confirm against the model in use).
    size=1536,
    # Similarity metric Qdrant uses when searching.
    distance=models.Distance.COSINE,
    # Keep the original vectors on disk; wanted for the binary quantization experiment.
    on_disk=True,
)

client.create_collection(  # may need to be recreate_collection
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    vectors_config=vectors_config,
    # Binary quantization, with the quantized vectors always held in RAM.
    quantization_config=models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True),
    ),
    # indexing_threshold=0 switches indexing off so vectors and payloads upload faster;
    # it is turned back on further down in the notebook, after the upload.
    optimizers_config=models.OptimizersConfigDiff(
        default_segment_number=5,
        indexing_threshold=0,
    ),
)


True

In [55]:
# Create the LangChain vector store on top of the Qdrant collection.
openai_api_key = st.secrets["OPENAI_API_KEY"]

# Hand the key from Streamlit secrets to the embeddings client explicitly. Previously the
# key was read here but never used, so authentication silently depended on an
# OPENAI_API_KEY environment variable being set.
# No model name is passed, so the library's default embedding model is used; the
# collection's vector size must match that model's output dimension.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

vector_store = Qdrant(
    client=client,
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    embeddings=embeddings,
)

In [56]:
# Folder (relative to the working directory) that holds the source documents to ingest.
directory = "PDFs_and_TXT"

In [57]:
# Define function to load source documents from directory
def load_sources(directory):
    """Load every PDF located directly inside ``directory``.

    Entries are visited in sorted name order so the resulting document order is
    the same on every run (``os.listdir`` gives no ordering guarantee). The
    ``.pdf`` extension is matched case-insensitively, and entries that are not
    regular files (e.g. a sub-folder whose name ends in ``.pdf``) are skipped.
    Sub-directories are not searched.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A flat list containing, for each PDF in turn, everything returned by
        ``PyPDFLoader(path).load()``. Empty if the folder holds no PDFs.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be read.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            continue
        documents.extend(PyPDFLoader(file_path).load())
    return documents

In [58]:
# Exercise load_sources on the configured source folder and keep the result for chunking.
documents_loaded = load_sources(directory=directory)

In [59]:
# chunk pdfs

def chunk_pdfs(documents, chunk_size=10000, chunk_overlap=200):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        documents: The documents to split; passed unchanged to the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_size``. Defaults to 10000, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_overlap``. Defaults to 200, the value previously hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [60]:
# Split the loaded pages into chunks, then show a short summary rather than
# printing every chunk (the full dump floods the notebook output).
chunks = chunk_pdfs(documents_loaded)
print(f"{len(chunks)} chunks created")
chunks[:2]

[Document(page_content='Financial reporting developments  \nA comprehensive guide  \nLease \naccounting  \nAccounting Standards Codification 842, \nLeases  \nAugust 202 3', metadata={'source': 'PDFs_and_TXT\\EY-financial-reporting-developments-lease-accounting-08-31-2023-v2.pdf', 'page': 0}), Document(page_content='To our clients and other friends  \nAccounting Standard Codification (ASC) 842, Leases , requires most leases to be recognized on the balance \nsheet and requires enhanced disclosures. The Financial Accounting Standards Board (FASB or Board)  \nbelieves this result s in a faithful representation of lessees’ assets and liabilities and provides \ntransparency about the lessee’s obligations and leasing activities.  \nThe FASB held joint deliberations with the International Accounting Standards Board (IASB), which issued \na sim ilar standard (IFRS 16 , Leases ). However, there are significant differences between the FASB and \nIASB  standards ( e.g.,  lessees do not classify le

In [None]:
# Add the chunks to the vector store.
# NOTE(review): the original author believed this call also performs the embedding
# (using the `embeddings` object the store was built with) — confirm against the LangChain docs.
vector_store.add_documents(documents=chunks)

In [63]:
# Re-enable indexing now that the vectors are uploaded: the collection was created with
# indexing_threshold=0, and this sets the threshold back to a non-zero value.
# Uses `optimizers_config`, the same keyword create_collection takes, instead of the
# legacy `optimizer_config` spelling.
client.update_collection(
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    optimizers_config=models.OptimizersConfigDiff(
        indexing_threshold=20000,
    ),
)

True