# Create a Vector DB with LangChain

## Import data

In [1]:
import pandas as pd

# Load the stashed table: feature columns plus the 'title', 'y' and 'c' columns.
Xy = pd.read_csv('./stash/Xytc.csv')

# Everything except the three named columns is kept as the feature matrix.
X = Xy.drop(columns=['title', 'y', 'c'])
t, y, c = Xy['title'], Xy['y'], Xy['c']

# Quick sanity check of the shapes.
Xy.shape, X.shape, t.shape, y.shape

((90, 1539), (90, 1536), (90,), (90,))

In [2]:
# One record per row of Xy: a sequential id, the title as the document text,
# the y/c values as metadata, and the feature row as a list of Python floats.
df = pd.DataFrame({
    'id': list(range(len(Xy))),
    'document': t,
    'metadata': [dict(subject=subject, topic=topic) for subject, topic in zip(y, c)],
    'embedding': [list(map(float, row)) for row in X.values],
})
df.shape

(90, 4)

In [3]:
# Preview the first rows to check the id/document/metadata/embedding layout.
df.head()

Unnamed: 0,id,document,metadata,embedding
0,0,The Hundred-Page Machine Learning Book,"{'subject': 'machine_learning', 'topic': 7}","[-0.0195070374757051, 0.0134451435878872, 0.01..."
1,1,Programming Python,"{'subject': 'python', 'topic': 11}","[-2.161007796530612e-05, -0.012101643718779, 0..."
2,2,The Java Language Specification,"{'subject': 'java', 'topic': 12}","[0.0095693012699484, 0.0100918794050812, 0.015..."
3,3,Artificial Intelligence: A Modern Approach,"{'subject': 'computer_science', 'topic': 7}","[-0.0194937139749526, -0.0022757838014513, 0.0..."
4,4,Speaking JavaScript: An In-Depth Guide for Pro...,"{'subject': 'javascript', 'topic': 5}","[0.008368537761271, 0.0069584059529006, 0.0078..."


In [4]:
# Title -> embedding lookup. When a title occurs more than once, the last row
# with that title wins, so the mapping can be smaller than the frame.
t2e = dict(zip(df['document'], df['embedding']))
len(t2e)

88

## Create vector database

In [5]:
import pathlib
import shutil
from langchain_core.documents.base import Document
from langchain.vectorstores import Chroma
from openai import OpenAI

class MockEmbedding:
    """Embedding function that serves precomputed vectors before calling OpenAI.

    Texts found in ``t2e`` are answered directly from that mapping. Any other
    text is embedded through the OpenAI embeddings endpoint using
    ``embedding_model``.

    Parameters
    ----------
    t2e : mapping
        Precomputed embeddings keyed by the exact text.
    embedding_model : str
        Model name passed to the OpenAI embeddings endpoint on a lookup miss.
    """

    def __init__(self, t2e, embedding_model='text-embedding-ada-002'):
        self.t2e = t2e
        self.embedding_model = embedding_model
        # The OpenAI client is created on first use (see ``client``) so that
        # lookups fully served from ``t2e`` never need to construct one.
        self._client = None

    @property
    def client(self):
        """OpenAI client, instantiated lazily on first access."""
        if self._client is None:
            self._client = OpenAI()
        return self._client

    @client.setter
    def client(self, value):
        # Allows injecting a preconfigured (or fake) client.
        self._client = value

    def _embed(self, t):
        """Return the embedding for ``t``: cached if known, else from OpenAI."""
        if t in self.t2e:
            return self.t2e[t]

        # Newlines are replaced by spaces before the text is sent to the API.
        docs = [t.replace('\n', ' ')]
        res = self.client.embeddings.create(input=docs, model=self.embedding_model)
        return res.data[0].embedding

    def embed_documents(self, texts):
        """Return one embedding per text in ``texts``, in the same order."""
        return [self._embed(t) for t in texts]

    def embed_query(self, query):
        """Return the embedding of a single query string."""
        return self._embed(query)
        
def get_documents():
    """Convert every row of the module-level ``df`` into a ``Document``.

    The row's 'document' value becomes ``page_content`` and its 'metadata'
    value becomes ``metadata``. Returns the documents as a list in row order.
    """
    docs = df.apply(
        lambda row: Document(page_content=row['document'], metadata=row['metadata']),
        axis=1,
    )
    return docs.tolist()

def get_db(db_path=pathlib.Path('./book_vdb')):
    """Open the Chroma store at ``db_path``, building it first if it is missing.

    Parameters
    ----------
    db_path : pathlib.Path or str
        Directory used as Chroma's ``persist_directory``.

    Returns
    -------
    Chroma
        A store backed by ``db_path`` that embeds through ``MockEmbedding``
        (precomputed vectors from the module-level ``t2e``).

    Notes
    -----
    An existing directory is opened as-is; documents are only built and
    indexed when the directory does not exist yet.
    """
    # Accept plain strings as well as Path objects.
    db_path = pathlib.Path(db_path)
    embedding_function = MockEmbedding(t2e=t2e)

    if db_path.exists():
        return Chroma(
            persist_directory=str(db_path),
            embedding_function=embedding_function
        )

    db_path.mkdir(parents=True, exist_ok=True)
    try:
        # Documents are only needed on this branch, so build them here.
        return Chroma.from_documents(
            documents=get_documents(),
            embedding=embedding_function,
            persist_directory=str(db_path)
        )
    except Exception:
        # The directory was created just above. Remove it so a failed build
        # is not mistaken for an existing store on the next call.
        shutil.rmtree(db_path, ignore_errors=True)
        raise

def get_retriever(db_path=pathlib.Path('./book_vdb'), retriever_params=None):
    """Return a retriever over the vector store at ``db_path``.

    Parameters
    ----------
    db_path : pathlib.Path
        Location of the store; forwarded to ``get_db``.
    retriever_params : dict, optional
        Keyword arguments for ``as_retriever`` (e.g. ``search_type`` and
        ``search_kwargs``). When omitted, an MMR retriever returning 5
        results is configured.

    Returns
    -------
    The retriever produced by ``get_db(db_path).as_retriever(...)``.
    """
    if retriever_params is None:
        # No 'score_threshold' here: LangChain applies it only for
        # search_type='similarity_score_threshold', so with 'mmr' it would be
        # silently ignored.
        retriever_params = {
            'search_type': 'mmr',
            'search_kwargs': {
                'k': 5,
                # A fetch_k above the collection size is reduced by Chroma
                # with a warning.
                'fetch_k': 100,
                'lambda_mult': 0.5
            }
        }

    return get_db(db_path).as_retriever(**retriever_params)
    
# Build (or open) the store with the default parameters and keep a handle on
# the underlying vector store for direct searches.
retriever = get_retriever()
vectorstore = retriever.vectorstore

## Query

In [6]:
# The number of results comes from the retriever's search_kwargs ('k'), not
# from keyword arguments passed to invoke().
retriever.invoke('java')

Number of requested results 100 is greater than number of elements in index 90, updating n_results = 90


[Document(page_content='The Java™ Programming Language', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Pro AngularJS', metadata={'subject': 'web', 'topic': 9}),
 Document(page_content='Node.js Design Patterns', metadata={'subject': 'web', 'topic': 13}),
 Document(page_content='Deep Learning', metadata={'subject': 'machine_learning', 'topic': 7}),
 Document(page_content='Security Analysis', metadata={'subject': 'finance', 'topic': 8})]

In [7]:
# Plain similarity search directly on the vector store, for comparison with
# the retriever result.
vectorstore.search('java', search_type='similarity')

[Document(page_content='The Java™ Programming Language', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Effective Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Thinking in Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Head First Java', metadata={'subject': 'java', 'topic': 12})]

In [8]:
# MMR search directly on the vector store; no search kwargs are passed here,
# so the retriever's search_kwargs do not apply.
vectorstore.search('java', search_type='mmr')

[Document(page_content='The Java™ Programming Language', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Effective Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='JavaScript: The Good Parts', metadata={'subject': 'javascript', 'topic': 13}),
 Document(page_content='Python Crash Course', metadata={'subject': 'python', 'topic': 11})]