In [1]:
import os
import glob

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
# from langchain_community.embeddings import GPT4AllEmbeddings
# from langchain_community.llms import GPT4All
from langchain_community.vectorstores import Milvus, Chroma
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [2]:
# installed

# gpt4all
# chromadb
# pypdf

# langchain
# langchain_openai
# pymilvus
# pymupdf

In [3]:
# https://milvus.io/docs/integrate_with_langchain.md
# https://github.com/py-pdf/benchmarks
# Note: for best results upload .txt files
# TODO: custom Loader that stores uid in metadata
# TODO: limit upload size

# Compare:
# https://python.langchain.com/docs/integrations/vectorstores/chroma/ # no filtering
# https://python.langchain.com/docs/integrations/vectorstores/qdrant/ # has 1G free forever; poor deployment docs
# https://python.langchain.com/docs/integrations/vectorstores/milvus/ # good all round
# https://python.langchain.com/docs/integrations/vectorstores/redis/ # easy syntax for filtering but not sure if retriever can filter

# https://milvus.io/docs/integrate_with_langchain.md

# Document Processing

In [4]:
# TODO:
# custom one class loader
# custom loader for metadata

In [5]:
class CustomLoader(BaseLoader):
    def __init__(self, file_path: str, metadata: dict=None) -> None:
        self.file_path = file_path
        self.ext = file_path.split('.')[-1].lower()
        if not metadata:
            self.metadata = {'file_path': filepath}
        else:
            self.metadata = metadata

    # TODO: support more file types
    def load(self) -> list[Document]:
        if self.ext == 'txt':
            with open(self.file_path, encoding="utf-8") as f:
                doc = Document(
                    page_content=f.read(),
                    metadata=self.metadata
                )
        elif self.ext == 'pdf':
            loader = PyMuPDFLoader(self.file_path)
            doc = loader.load()[0]
            doc.metadata = self.metadata
        else:
            raise Exception(f'Unable to load {self.file_path}')
        return [doc]

In [6]:
def load_and_split(path, uid, category):
    documents = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    try:
        loader = CustomLoader(path, metadata={'file_path': path, 'category': category, 'uid': uid})
        documents.extend(loader.load_and_split(text_splitter=text_splitter))
        print(f'Loaded {path}')
    except Exception as e:
        print(e)
    return documents

In [7]:
paths = [
    './data/personal context/Interview Questions v2.txt',
    './data/personal context/resume.pdf',
    './data/domain knowledge/2402.03367.pdf',
    './data/domain knowledge/foo.bar'
]
uids = [1, 2, 3, 4]
categories = ['personal context'] * 2 + ['domain knowledge'] * 2

In [8]:
docs = []
for path, uid, category in zip(paths, uids, categories):
    docs += load_and_split(path, uid, category)

Loaded ./data/personal context/Interview Questions v2.txt
Loaded ./data/personal context/resume.pdf
Loaded ./data/domain knowledge/2402.03367.pdf
Unable to load ./data/domain knowledge/foo.bar


In [9]:
docs[-1]

Document(page_content='Document retrieval is a fundamental component of the algorithm. Traditional RAG virtual assistants rank documents\nin the order of relevance to the query, usually by vector distances. This means that the more relevant a document\nis in a query, the higher priority it takes being in the answer. Recently, however, developers and researchers have\nexplored implementing different reranking methods for documents. It has been found that reranking in retrieval-\naugmented generation plays a significant role in improving retrieval results and in increasing the accuracy, relevance,\nand comprehensiveness of answers. [6]\n∗Citation: Authors. Title. Pages.... DOI:000000/11111.\narXiv:2402.03367v1  [cs.IR]  31 Jan 2024', metadata={'file_path': './data/domain knowledge/2402.03367.pdf', 'category': 'domain knowledge', 'uid': 3})

# Vector Stores

## Embedding

In [10]:
# embedding = GPT4AllEmbeddings()
DIMENSION = 1024
embedding = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=DIMENSION)

## Chroma (for local testing)

In [11]:
# # Local Chroma
# vector_store = Chroma.from_documents(documents=[documents_pc, documents_dk], embedding=embedding)

## Milvus

In [12]:
# # run Milvus app in docker
# https://milvus.io/docs/install_standalone-docker-compose.md
# docker compose -f milvus-standalone-docker-compose.yml up

In [13]:
COLLECTION_NAME = 'lockedinai'
CONNECTION_ARGS = {'uri': 'http://localhost:19530'}

### Legacy ORM module

https://milvus.io/api-reference/pymilvus/v2.4.x/ORM/Connections/connect.md

In [14]:
from pymilvus import connections, db, utility, Collection, CollectionSchema, FieldSchema, DataType

connections.connect(alias='default', uri=CONNECTION_ARGS['uri'])

# info
connections_ = connections.list_connections()
dbs = db.list_database()
collections = utility.list_collections()
users = utility.list_users(True)
print(connections_)
print(dbs)
print(collections)
print(users)

try: # new collection
    schema = CollectionSchema([
        FieldSchema("id", DataType.INT64, is_primary=True),
        FieldSchema("vector", DataType.FLOAT_VECTOR, dim=5)
    ])
    collection = Collection(name='lockedinai', schema=schema)
    index_params = {
        "index_type": "AUTOINDEX",
        "metric_type": "COSINE",
    }
    collection.create_index(
        field_name="vector", 
        index_params=index_params, 
        timeout=None
    )
except: # exisitng collection
    collection = Collection(name='lockedinai')

collection.load()

print(collection.describe())

# clean
for database in dbs:
    try:
        db.drop_database(database)
    except Exception as e:
        print(e)
for collection in collections:
    utility.drop_collection(collection)
for connection in connections_:
    connections.disconnect(connection[0])
    connections.remove_connection(connection[0])

RPC error: [drop_database], <MilvusException: (code=65535, message=can not drop default database)>, <Time:{'RPC start': '2024-04-14 03:41:36.016048', 'RPC error': '2024-04-14 03:41:36.017150'}>


[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x15276aea0>)]
['default']
['lockedinai']
UserInfo groups:
- UserItem: <username:root>, <roles:()>
{'collection_name': 'lockedinai', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'uid', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_partition_key': True}, {'field_id': 101, 'name': 'category', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'field_id': 102, 'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'field_id': 103, 'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 104, 'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}], 'aliases': [], 'collection_id': 449067066124558708, 'consistency_level': 1, 'properties': {}, 'num_partitions': 64, 'enabl

### Newer MilvusClient module (hides ORM complexity)

https://milvus.io/api-reference/pymilvus/v2.4.x/MilvusClient/Client/MilvusClient.md

In [15]:
from pymilvus import MilvusClient, DataType

client = MilvusClient(uri=CONNECTION_ARGS['uri']) # WARNING: remember to close
client.create_collection(
    COLLECTION_NAME,
    DIMENSION,
    primary_field_name="id",
    id_type=DataType.INT64,
    vector_field_name="vector",
    metric_type="COSINE",
    auto_id=False,
)

# info
collections = client.list_collections()
indexes = client.list_indexes(collections[0])
partitions = client.list_partitions(collections[0])
print(collections)
print(indexes)
print(partitions)

# clean
for collection in collections:
    client.drop_collection(collection_name=collections[0])

client.close()

['lockedinai']
['vector']
['_default']


### LangChain Milvus module

https://python.langchain.com/docs/integrations/vectorstores/milvus/

In [16]:
# if collection has never been initialized
docs_seed = [
    Document(page_content='', metadata={'uid': 00000000, 'category': 'domain knowledge'}),
    Document(page_content='', metadata={'uid': 00000000, 'category': 'personal context'})
]

vector_store = Milvus.from_documents(
    docs_seed, # seed
    embedding,
    collection_name=COLLECTION_NAME,
    connection_args=CONNECTION_ARGS,
    drop_old=True, # WARNING: deletes old collection
    partition_key_field="uid",
)

In [17]:
# if collections has been initialized
vector_store = Milvus(
    embedding,
    collection_name=COLLECTION_NAME,
    connection_args=CONNECTION_ARGS,
    drop_old=False, 
    auto_id=True,
)

In [18]:
vector_store.upsert(documents=docs)

[449067066123093172,
 449067066123093173,
 449067066123093174,
 449067066123093175,
 449067066123093176,
 449067066123093177,
 449067066123093178,
 449067066123093179,
 449067066123093180,
 449067066123093181,
 449067066123093182,
 449067066123093183,
 449067066123093184,
 449067066123093185,
 449067066123093186,
 449067066123093187,
 449067066123093188,
 449067066123093189,
 449067066123093190,
 449067066123093191,
 449067066123093192,
 449067066123093193,
 449067066123093194,
 449067066123093195,
 449067066123093196,
 449067066123093197]

In [19]:
connections.connect(alias='default', uri=CONNECTION_ARGS['uri'])
print(utility.list_collections())
print(connections.list_connections())

['lockedinai']
[('31ddd023f68d443795bfa7c21ab7c537', None), ('7f00684f04f04d4985962b6cc0d188aa', <pymilvus.client.grpc_handler.GrpcHandler object at 0x15532f710>), ('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x15532e810>)]


In [20]:
vector_store.as_retriever(
    search_kwargs={
        'expr': 'category == "domain knowledge" and uid == 3',
        'k': 2,
    }
).get_relevant_documents("explain ragfusion")

[Document(page_content='RAG-FUSION: A NEW TAKE ON RETRIEVAL-AUGMENTED\nGENERATION ∗\nZackary Rackauckas\nInfineon Technologies\nSan Jose, CA\nzackary.rackauckas@infineon.com\nABSTRACT\nInfineon has identified a need for engineers, account managers, and customers to rapidly obtain\nproduct information. This problem is traditionally addressed with retrieval-augmented generation\n(RAG) chatbots, but in this study, I evaluated the use of the newly popularized RAG-Fusion method.\nRAG-Fusion combines RAG and reciprocal rank fusion (RRF) by generating multiple queries,\nreranking them with reciprocal scores and fusing the documents and scores. Through manually\nevaluating answers on accuracy, relevance, and comprehensiveness, I found that RAG-Fusion was\nable to provide accurate and comprehensive answers due to the generated queries contextualizing the\noriginal query from various perspectives. However, some answers strayed off topic when the generated', metadata={'uid': 3, 'category': 'domai

In [21]:
vector_store.as_retriever(
    # search_type="similarity_score_threshold", NotImplemented
    search_kwargs={
        'expr': 'category == "personal context" and uid == 2',
        'k': 2,
        # 'score_threshold': 0.1, # NotImplemented
    }
).get_relevant_documents("explain ragfusion")

[Document(page_content='Deep Learning Research Scientist\nJan 2020 - Sept 2021\nBoston Fusion | Lexington, MA\nJuggled several SBIR and DoD research projects. Technical mentor for new hires. Gave company-wide technical presentations on\ntopics such as Reinforcement Learning and Graph Convolutional Networks.\nKey Accomplishments\n●\nDevised a reinforcement learning OpenAI-Gym-like framework for ROS (Robotics Operating System); trained a Dueling\nDouble DQN with Prioritized Experience Replay agent to successfully navigate a UAV to a target destination while\nsimultaneously avoiding moving obstacles.\n●\nImproved acoustic classification of spectrograms and MFCCs with ResNet architecture increasing 5-fold cross validation\naccuracy by 5%; upgraded the framework from single-label to multi-label classification; developed a Convolutional\nAutoencoder to detect anomalous acoustic signals; developed a Siamese Network to compare in-distribution data with\nout-of-distribution new acoustic signals

In [22]:
# vector_store.col.search??

In [23]:
# vector_store.as_retriever??

# LLM

In [24]:
# llm = GPT4All(
#     model="./mistral-7b-openorca.gguf2.Q4_0",
#     max_tokens=2048,
#     temp=1, # 1) exp(x_i/T) / sum(exp(x_j/T))
#     # top_k=100, # 2) after temp, sort then select top k, normalize
#     # top_p=.5, # 3) after top_k, select until cum prob is reached, normalize
#     repeat_penalty=1.18,
#     repeat_last_n=64,
#     n_batch=8,
#     n_predict=None,
#     streaming=False,
# )

llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [25]:
# without RAG
question = "Teach me how RAG fusion works."
llm.invoke(f"You are in an interview.\nQuestion: {question}\nAnswer:")

AIMessage(content='RAG fusion is a gene fusion technique used in molecular biology to study gene expression and regulation. It involves combining the regulatory region of one gene with the coding region of another gene to create a chimeric gene with altered expression patterns. This technique can help researchers understand how genes are regulated and how changes in gene expression can lead to disease. The RAG fusion process typically involves cloning the regulatory region of one gene and the coding region of another gene into a plasmid vector, which is then transfected into cells for expression studies. By studying the expression patterns of the chimeric gene, researchers can gain insights into the functions of the individual genes and how they work together in various biological processes.', response_metadata={'token_usage': {'completion_tokens': 140, 'prompt_tokens': 25, 'total_tokens': 165}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason':

In [26]:
uid = 3
question = "Teach me about RAG fusion."

prompt = PromptTemplate.from_template(
"""You are being interviewed.

For technical questions, feel free to refer to "Domain Knowledge".
For behavioral questions, feel free to refer to "Personal Context"; use STAR if there is an opportunity.

Domain Knowledge: {domain_knowledge}

Personal Context: {personal_context}

Question: {question}

Answer:
"""
)

retriever_dk = vector_store.as_retriever(
    # search_type="similarity_score_threshold", # NotImplemented,
    search_kwargs={
        'expr': f'category == "domain knowledge" and uid == {uid}',
        'k': 4,
        # 'score_threshold': 0.1, # NotImplemented
    }
)

retriever_pc = vector_store.as_retriever(
    # search_type="similarity_score_threshold", # NotImplemented
    search_kwargs={
        'expr': f'category == "personal context" and uid == {uid}',
        'k': 4,
        # 'score_threshold': 0.1, # NotImplemented
    }
)

retrieval_chain = (
    {"personal_context": retriever_pc, "domain_knowledge": retriever_dk, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

retrieval_chain.invoke(question)

'RAG-Fusion is a new approach that combines retrieval-augmented generation (RAG) with reciprocal rank fusion (RRF) to improve the accuracy and comprehensiveness of answers provided by virtual assistants or chatbots. In traditional RAG methods, documents are ranked by relevance to the query based on vector distances. RAG-Fusion, on the other hand, generates multiple queries, reranks them using reciprocal scores, and then fuses the documents and scores to provide more contextualized answers.\n\nThrough manual evaluation of answers in terms of accuracy, relevance, and comprehensiveness, it has been found that RAG-Fusion can provide more accurate and comprehensive answers compared to traditional methods. By generating multiple queries and considering different perspectives, RAG-Fusion helps to contextualize the original query, leading to more informative responses.\n\nOverall, RAG-Fusion represents a significant advancement in artificial intelligence and natural language processing applica