In [None]:
# !pip install langchain qdrant_client openai tiktoken
# pip install -U langchain-openai

In [13]:
from dotenv import load_dotenv
import streamlit as st
from langchain.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from qdrant_client import QdrantClient, models
import qdrant_client
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

In [32]:
model = OpenAIEmbeddings(model="text-embedding-3-large")

In [4]:
# Create a Qdrant client

client = qdrant_client.QdrantClient(
    st.secrets["QDRANT_HOST"],
    api_key=st.secrets["QDRANT_API_KEY"]
    )



In [5]:
# delete collection

client.delete_collection(collection_name=st.secrets["QDRANT_COLLECTION_NAME"])

False

In [8]:
# create collection
vectors_config = models.VectorParams(
    size=1536, # maybe 3072 for vector size OpenAI text-embedding-3-large. 1536 OpenAI text-embeddings-3-small.
    distance=models.Distance.COSINE, #telling Qdrant which distance to find siliarity in search
    on_disk=True, # need this for the binary quantization attempt 
)

client.create_collection( # may need to be recreate_collection
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    vectors_config=vectors_config,
    optimizers_config = models.OptimizersConfigDiff( #setting indexing threshold to 0. Disabling the indexing to zero. 
                                                    # Allows faster vector uploads and paylods. Turn back on lower in code.
        default_segment_number=5,
        indexing_threshold=0,
    ),
    quantization_config = models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True),
    ),
)


True

In [47]:
#create vector store
openai_api_key = st.secrets["OPENAI_API_KEY"]

embeddings = OpenAIEmbeddings()

vector_store = Qdrant(
    client = client,
    collection_name = st.secrets["QDRANT_COLLECTION_NAME"],
    embeddings = embeddings,
)

In [14]:
# Define the directory where source documents are stored
directory = 'PDFs_and_TXT'

In [15]:
# Define function to load source documents from directory
def load_sources(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            doc = loader.load()
            documents.extend(doc)
    return documents

In [22]:
#test load_sources function

documents_loaded = load_sources(directory)

In [23]:
# chunk pdfs

def chunk_pdfs(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 10000,
        chunk_overlap = 200
    )
    
    chunks = text_splitter.split_documents(documents)
    return chunks

In [44]:
chunks = chunk_pdfs(documents_loaded)
print(chunks)

[Document(page_content='Charles Schwab  \nManager Treasury -  ALCO Reporting  – 3.10.24 \nhttps://www.linkedin.com/jobs/view/3851433982/?alternateChannel=search&refId=TIlrwC6qyBhGzHEH%2F81W\nXA%3D%3D&trackingId=GiWxPynaoAfEt6KbWﬂuug%3D%3D  \n \n \n \n \nPrologis  - Accounting/Banking Systems Function Lead –  Denver, \nCO \nAbout the job  \nPrologis, Inc. is the global leader in logistics real estate. In partnership with our customers and our communities, we \ndevelop modern, high-quality properties that set the standard for innovative building design and sustainability. \nPrologis owns or has investments in properties and development projects of ~1.2 billion square feet in 19 countries \nand enables 2.8% of the world’s GDP. We have committed to achieve net zero emissions by 2040. Beyond real estate, \nour Essentials platform optimizes the company’s g lobal asset portfolio to provide our customers solutions that \naddress today’s warehouse and shipping challenges. Prologis Ventures inve

In [48]:
# add embedded chunks to vector store. I think this process does the embedding?
vector_store.add_documents(chunks)

['d9e9404daafd4c24afd367ee792b8e7c',
 '19e0e7dd37c742e198a05a218e678764',
 'b4a89b4f50ad4ccab8d1a224c7c609ca',
 'fc7e1150db3a4399b1cc5536744fc4ce',
 '3e64c38af9f147e582a765d958995901',
 '47d8e37afefd4ed9bf313b01bb70e8f0',
 '6d6f1465e8f04feda3a73ce5fe976ab0',
 '0fe91c458bea49e2ac00033c847e1975',
 '7e8ffb0b7b184577b0b576a9713cc987',
 'af9ffec088c846eb91dac9bd84c7ce8c',
 'e258282860a549838e9795afbf4cf4ed',
 'a98e3dc1e8e442d387800596d8fd418d',
 'ae16e7a8203e4023bdade4c2274be91f',
 'a532a92018114c809b4e609da321e6bb']

In [None]:
# Enable indexing again after upload vectors

# client.update_collection(
#     collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
#     optimizer_config=models.OptimizersConfigDiff(
#         indexing_threshold=20000
#     )
# )