In [1]:
# !pip install langchain qdrant_client openai tiktoken
# pip install -U langchain-openai

In [18]:
from dotenv import load_dotenv
import re
import streamlit as st
from langchain.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAI
import openai
from qdrant_client import QdrantClient, models
import qdrant_client
from qdrant_client.models import VectorParams, Distance
from langchain_community.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client.models import PointStruct
import spacy
import os

In [7]:
# Define the directory where source documents are stored
directory = 'small_pdfs'

# Define function to load source documents from directory
def load_sources(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)
            try:
                reader = PdfReader(file_path)
                text = ''
                for page in reader.pages:
                    text += page.extract_text() + ' '  # Add a space between pages
                # Clean text: replace line breaks with spaces, then normalize whitespace
                cleaned_text = text.replace('\n', ' ').replace('\r', ' ')
                cleaned_text = ' '.join(cleaned_text.split())
                documents.append(cleaned_text)
            except Exception as e:
                print(f"Failed to process {filename}: {str(e)}")
    return documents

# Example usage
documents = load_sources(directory)
print(documents)

['Charles Schwab Manager Treasury - ALCO Reporting – 3.10.24 https://www.linkedin.com/jobs/view/3851433982/?alternateChannel=search&refId=TIlrwC6qyBhGzHEH%2F81W XA%3D%3D&trackingId=GiWxPynaoAfEt6KbWﬂuug%3D%3D Prologis - Accounting/Banking Systems Function Lead – Denver, CO About the job Prologis, Inc. is the global leader in logistics real estate. In partnership with our customers and our communities, we develop modern, high-quality properties that set the standard for innovative building design and sustainability. Prologis owns or has investments in properties and development projects of ~1.2 billion square feet in 19 countries and enables 2.8% of the world’s GDP. We have committed to achieve net zero emissions by 2040. Beyond real estate, our Essentials platform optimizes the company’s g lobal asset portfolio to provide our customers solutions that address today’s warehouse and shipping challenges. Prologis Ventures invests in logistics innovation and technology companies to moderniz

In [20]:
# Setup Open AI and Qdrant clients
openai_api_key = st.secrets["OPENAI_API_KEY"]
organization = st.secrets["OPEN_AI_ORGANIZATION_ID"]
project = st.secrets["PROJECT_NAME"]

# Create a OpenAI and Qdrant clients
openai_client = openai.Client(
    api_key = openai_api_key
)

client = qdrant_client.QdrantClient(":memory:")


In [15]:
# embedding a document

embedding_model = "text-embedding-3-large"

result = openai_client.embeddings.create(input=documents, model=embedding_model)

In [17]:
print(result)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.022324997931718826, -0.042278874665498734, -0.021394789218902588, 0.014573262073099613, 0.008992012590169907, 0.01572234369814396, -0.014254073612391949, 0.021248875185847282, -0.026447096839547157, 0.049939416348934174, 0.04009015113115311, 0.007551101967692375, 0.03168179839849472, -0.033943481743335724, -0.019351979717612267, 0.04001719504594803, -0.05114321410655975, 0.045744359493255615, -0.009575673379004002, -0.030715111643075943, -0.011153379455208778, -0.02878173626959324, -0.033907003700733185, 0.004869913682341576, 0.008335395716130733, -0.0029228602070361376, -0.013260027393698692, 0.027067236602306366, -0.021139437332749367, -0.024203654378652573, 0.014026081189513206, 0.02755969949066639, -0.017865469679236412, -0.01200150977820158, 0.0034426823258399963, -0.024805553257465363, 0.0013371743261814117, -0.027432022616267204, 0.03224721923470497, 0.07117004692554474, 0.015156922861933708, 0.05074194818735123, 0.0060873921

In [24]:
# convert the model outputs to Qdrant points.
# not sure if "text" is like bob, or needs to refer to my source embeddings, "result"

points = [
    PointStruct(
        id=idx,
        vector=data.embedding,
        payload={"text": text},
    )
    for idx, (data, text) in enumerate(zip(result.data, documents))
]

In [21]:
collection_name = st.secrets['QDRANT_COLLECTION_NAME_2']

client.create_collection(
    collection_name,
    vectors_config=VectorParams(
        size=3072,
        distance = Distance.COSINE,
    ),
)

client.upsert(collection_name, points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
client.search(
    collection_name=collection_name,
    query_vector=openai_client.embeddings.create(
        input=["What is Cloudflare?"],
        model=embedding_model,
    )
    .data[0]
    .embedding,
)

[ScoredPoint(id=0, version=0, score=0.12793568939101527, payload={'text': 'Charles Schwab Manager Treasury - ALCO Reporting – 3.10.24 https://www.linkedin.com/jobs/view/3851433982/?alternateChannel=search&refId=TIlrwC6qyBhGzHEH%2F81W XA%3D%3D&trackingId=GiWxPynaoAfEt6KbWﬂuug%3D%3D Prologis - Accounting/Banking Systems Function Lead – Denver, CO About the job Prologis, Inc. is the global leader in logistics real estate. In partnership with our customers and our communities, we develop modern, high-quality properties that set the standard for innovative building design and sustainability. Prologis owns or has investments in properties and development projects of ~1.2 billion square feet in 19 countries and enables 2.8% of the world’s GDP. We have committed to achieve net zero emissions by 2040. Beyond real estate, our Essentials platform optimizes the company’s g lobal asset portfolio to provide our customers solutions that address today’s warehouse and shipping challenges. Prologis Vent

In [None]:
# delete collection

client.delete_collection(collection_name=st.secrets["QDRANT_COLLECTION_NAME_2"])

In [None]:
# create collection
vectors_config = models.VectorParams(
    size=1536, # maybe 3072 for vector size OpenAI text-embedding-3-large. 1536 OpenAI text-embeddings-3-small.
    distance=models.Distance.COSINE, #telling Qdrant which distance to find siliarity in search
    on_disk=True, # need this for the binary quantization attempt 
)

client.create_collection( # may need to be recreate_collection
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    vectors_config=vectors_config,
    optimizers_config = models.OptimizersConfigDiff( #setting indexing threshold to 0. Disabling the indexing to zero. 
                                                    # Allows faster vector uploads and paylods. Turn back on lower in code.
        default_segment_number=5,
        indexing_threshold=0,
    ),
    quantization_config = models.BinaryQuantization(
        binary=models.BinaryQuantizationConfig(always_ram=True),
    ),
)


In [None]:
#create vector store
openai_api_key = st.secrets["OPENAI_API_KEY"]

embeddings = OpenAIEmbeddings()

vector_store = Qdrant(
    client = client,
    collection_name = st.secrets["QDRANT_COLLECTION_NAME"],
    embeddings = embeddings,
)

In [None]:
# Define the directory where source documents are stored
directory = 'PDFs_and_TXT'

In [None]:
# Define function to load source documents from directory
def load_sources(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)
            loader = PyPDFLoader(file_path)
            doc = loader.load()
            documents.extend(doc)
    return documents

In [None]:
#test load_sources function

documents_loaded = load_sources(directory)

In [None]:
# chunk pdfs

def chunk_pdfs(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 10000,
        chunk_overlap = 200
    )
    
    chunks = text_splitter.split_documents(documents)
    return chunks

In [None]:
chunks = chunk_pdfs(documents_loaded)
print(chunks)

In [None]:
# add embedded chunks to vector store. I think this process does the embedding?
vector_store.add_documents(chunks)

In [None]:
# Enable indexing again after upload vectors

client.update_collection(
    collection_name=st.secrets["QDRANT_COLLECTION_NAME"],
    optimizer_config=models.OptimizersConfigDiff(
        indexing_threshold=20000
    )
)