# Create Vector Database

## Load data

In [4]:
import pandas as pd

# Read the stashed table that holds the title, the label `y`, and the
# remaining numeric columns side by side.
Xy = pd.read_csv('./stash/Xy.csv')

# Split it three ways: titles, labels, and everything else as the matrix X.
t, y = Xy['title'], Xy['y']
X = Xy.drop(columns=['title', 'y'])

Xy.shape, X.shape, t.shape, y.shape

((90, 1538), (90, 1536), (90,), (90,))

In [11]:
# One row per title: a running integer id, the title text, a metadata dict
# carrying the label under 'subject', and the X row as a list of floats.
df = pd.DataFrame({
    'id': list(range(len(Xy))),
    'document': t,
    'metadata': [dict(subject=label) for label in y],
    'embedding': [list(map(float, row)) for row in X.to_numpy()],
})

In [13]:
# Preview the first rows to sanity-check the id / document / metadata /
# embedding columns before loading them into the vector store.
df.head()

Unnamed: 0,id,document,metadata,embedding
0,0,Java: The Complete Reference,{'subject': 'java'},"[0.0022508783731609, -0.0042110048234462, 0.01..."
1,1,Core Java An Integrated Approach (Black Book),{'subject': 'java'},"[0.0083073740825057, -0.0205343309789896, 0.01..."
2,2,Head First Java,{'subject': 'java'},"[0.0109464712440967, 0.0101075824350118, -0.00..."
3,3,Effective Java,{'subject': 'java'},"[-0.0103182280436158, 0.0071102487854659, 0.00..."
4,4,Thinking in Java,{'subject': 'java'},"[-0.0012556132860481, 0.0025670316535979, 0.01..."


## Create vector database directory

In [14]:
import chromadb
from chromadb.config import Settings
import shutil

def delete_folder(path):
    """Remove the directory tree at ``path`` on a best-effort basis.

    A missing directory is the expected "nothing to clean up" case and is
    ignored silently. Any other filesystem error is reported as a
    ``RuntimeWarning`` instead of being raised, so the caller keeps running
    but can see that old content may still be on disk.

    Args:
        path: Directory to remove.

    Returns:
        None.
    """
    import warnings  # local import: only used on the failure path

    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        # Nothing to delete.
        pass
    except OSError as exc:
        # Narrower than a bare ``except:``: KeyboardInterrupt/SystemExit and
        # programming errors propagate, and real failures become visible.
        warnings.warn(
            f'could not remove {path!r}: {exc}',
            RuntimeWarning,
            stacklevel=2,
        )

# Clear out any database left in ./vdb by an earlier run.
delete_folder('./vdb')

# Open a persistent client on that same folder, with anonymized telemetry off.
client = chromadb.PersistentClient(settings=Settings(anonymized_telemetry=False), path='./vdb')

### Create collection

In [15]:
# Fetch the 'books' collection, creating it if it is not there yet.
collection = client.get_or_create_collection(name='books')

In [16]:
# Push every row of df into the collection; ids are converted to strings.
collection.add(
    ids=df['id'].map(str).tolist(),
    embeddings=df['embedding'].tolist(),
    metadatas=df['metadata'].tolist(),
    documents=df['document'].tolist(),
)

In [17]:
# Sanity check: the count should match the number of rows passed to add()
# (90 in the recorded output).
collection.count()

90

In [18]:
# NOTE(review): judging by the recorded output, this returns the schema of
# the Collection model itself (name, id, metadata, tenant, database), not a
# description of the stored records — confirm this is the intended check.
collection.schema()

{'title': 'Collection',
 'type': 'object',
 'properties': {'name': {'title': 'Name', 'type': 'string'},
  'id': {'title': 'Id', 'type': 'string', 'format': 'uuid'},
  'metadata': {'title': 'Metadata', 'type': 'object'},
  'tenant': {'title': 'Tenant', 'type': 'string'},
  'database': {'title': 'Database', 'type': 'string'}},
 'required': ['name', 'id']}

### Query

In [23]:
import pandas as pd
from openai import OpenAI

def embed_func(docs, model='text-embedding-ada-002'):
    """Embed a batch of texts through the OpenAI embeddings endpoint.

    Args:
        docs: Iterable of strings to embed. A single bare string is treated
            as a one-item batch instead of being split into characters.
        model: Name of the embedding model to request.

    Returns:
        A list with one embedding per input text, taken from ``res.data`` in
        the order the response provides them. An empty batch yields ``[]``
        without building a client or calling the API.
    """
    if isinstance(docs, str):
        # Iterating a str would embed every character separately.
        docs = [docs]

    # Newlines are flattened to spaces before the texts are sent.
    cleaned = [text.replace('\n', ' ') for text in docs]
    if not cleaned:
        # Nothing to embed: skip client construction and the network call.
        return []

    # NOTE(review): OpenAI() is built with no arguments on every call, so
    # credentials presumably come from the environment — confirm.
    openai_client = OpenAI()
    res = openai_client.embeddings.create(input=cleaned, model=model)
    return [item.embedding for item in res.data]

def get_query(txt):
    """Embed a single query string.

    Returns whatever ``embed_func`` produces for the one-element batch
    ``[txt]``.
    """
    single_item_batch = [txt]
    return embed_func(single_item_batch)

def query(txt, n_results=5, include=None):
    """Query the module-level ``collection`` with the embedding of ``txt``.

    Args:
        txt: Query text; it is embedded via ``get_query``.
        n_results: Number of results to request.
        include: Result fields to request. When omitted, the original set
            ``['metadatas', 'documents', 'distances', 'embeddings']`` is
            used. Pass a shorter list (for example without ``'embeddings'``)
            to keep the returned payload and the notebook output small.

    Returns:
        The value returned by ``collection.query``.
    """
    if include is None:
        include = ['metadatas', 'documents', 'distances', 'embeddings']
    return collection.query(
        query_embeddings=get_query(txt),
        n_results=n_results,
        include=list(include),
    )

# Smoke-test retrieval with a one-word query. The fields requested by
# `query` include 'embeddings', so the displayed result contains the full
# vectors and is very long.
query('java')

{'ids': [['6', '3', '4', '2', '5']],
 'distances': [[0.23774599455593265,
   0.24705099015966472,
   0.2807672522642387,
   0.28923322470975776,
   0.29458112753399557]],
 'metadatas': [[{'subject': 'java'},
   {'subject': 'java'},
   {'subject': 'java'},
   {'subject': 'java'},
   {'subject': 'java'}]],
 'embeddings': [[[0.0035116856452077,
    -0.0013005638029426,
    0.0069776801392436,
    -0.0181981027126312,
    -0.0171798449009656,
    0.0156655125319957,
    -0.0254172924906015,
    -0.0130284838378429,
    -0.0162660237401723,
    -0.0301300007849931,
    0.0056591662578284,
    0.0136289950460195,
    0.0133026307448744,
    -0.0053393286652863,
    -0.013563722372055,
    -0.0072387717664241,
    0.0162529684603214,
    -0.0209134593605995,
    0.016853479668498,
    0.0125911552459001,
    -0.0048367269337177,
    0.0310177132487297,
    0.010548111051321,
    0.0037368773482739,
    -0.003619385883212,
    0.0175062101334333,
    0.005270792171359,
    -0.0361090041697025,