# VectorSearch with ChromaDB

ChromaDB - Langchain - Shakespeare Hamlet Example

VectorSearch-ChromaDB
https://www.kaggle.com/code/toddgardiner/vectorsearch-chromadb/notebook

In [1]:
# #use this version of chromadb
# !pip install chromadb==0.5.3 

# #get langchain stuff
# !pip install langchain langchain-community langchain_huggingface

In [2]:
import os
import sys
import pandas as pd

from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings  
from langchain_core.documents import  Document

hf_key = open("/Users/mjack6/.secrets/hugginface_mjack.apikey", "r").read().strip()

os.environ['HF_TOKEN'] = hf_key 



In [3]:
#define some arguments for the embeddings models
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
modelsetter = "sentence-transformers/all-mpnet-base-v2"

#function to build the chromadb, default model is overwritten when we call this function below (w modelsetter var)
def create_vector_db(chunks, model_path="intfloat/multilingual-e5-base"):
    # define embeddings model call
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    
    # turn string chunks into docs
    docs = []
    for i in range(len(chunks)):
        docs.append(Document(page_content=chunks[i],metadata={'source':f'hamlet chunk {i}'}))
    
    # build the chromadb
    db = Chroma.from_documents(docs, embedding=embeddings, collection_name = 'hamlet' , persist_directory = '/Users/mjack6/chromadb/working',)
    return db

In [4]:
#read in the dataset
plays = pd.read_csv('data/Shakespeare_data.csv')

#get hamlet from the larger plays corpus
hamlet = plays.loc[plays['Play']=='Hamlet'].sort_values(by=['Dataline'], axis=0)
hamlet.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
32432,32433,Hamlet,138.0,,CYMBELINE,ACT I
32433,32434,Hamlet,138.0,,CYMBELINE,SCENE I. Elsinore. A platform before the castle.
32434,32435,Hamlet,138.0,,CYMBELINE,FRANCISCO at his post. Enter to him BERNARDO
32435,32436,Hamlet,1.0,1.1.1,BERNARDO,Who's there?
32436,32437,Hamlet,2.0,1.1.2,FRANCISCO,"Nay, answer me: stand, and unfold yourself."


In [5]:
#reset the index to 0->
hamlet.reset_index(inplace=True,drop=True)
hamlet.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,32433,Hamlet,138.0,,CYMBELINE,ACT I
1,32434,Hamlet,138.0,,CYMBELINE,SCENE I. Elsinore. A platform before the castle.
2,32435,Hamlet,138.0,,CYMBELINE,FRANCISCO at his post. Enter to him BERNARDO
3,32436,Hamlet,1.0,1.1.1,BERNARDO,Who's there?
4,32437,Hamlet,2.0,1.1.2,FRANCISCO,"Nay, answer me: stand, and unfold yourself."


In [6]:
#see inside the play. do characters speak multiple lines at once?
hamlet[['Player','PlayerLine']][100:120]

Unnamed: 0,Player,PlayerLine
100,HORATIO,"Whose image even but now appear'd to us,"
101,HORATIO,"Was, as you know, by Fortinbras of Norway,"
102,HORATIO,"Thereto prick'd on by a most emulate pride,"
103,HORATIO,"Dared to the combat, in which our valiant Haml..."
104,HORATIO,For so this side of our known world esteem'd h...
105,HORATIO,"Did slay this Fortinbras, who by a seal'd comp..."
106,HORATIO,"Well ratified by law and heraldry,"
107,HORATIO,"Did forfeit, with his life, all those his lands"
108,HORATIO,"Which he stood seized of, to the conqueror:"
109,HORATIO,"Against the which, a moiety competent"


In [7]:
# yes, they do. So we need to add character only once per oration...

#list for corpus
corpus = []

# dummy variable for character/player
lastplayer = ''

# loop the dataset into the list
for i in range(len(hamlet)):
    
    #don't repeat the player for every line
    if lastplayer == hamlet['Player'][i]:
        corpus.append(hamlet['PlayerLine'][i])
    
    #but do put the player in if there is a change
    else: 
        corpus.append(hamlet['Player'][i])
        corpus.append(hamlet['PlayerLine'][i])
    
    #set player for loop
    lastplayer = hamlet['Player'][i]

# join all the list and add spaces...
corp = " ".join(corpus)

In [8]:
#check the output
corp[0:500]

"CYMBELINE ACT I SCENE I. Elsinore. A platform before the castle. FRANCISCO at his post. Enter to him BERNARDO BERNARDO Who's there? FRANCISCO Nay, answer me: stand, and unfold yourself. BERNARDO Long live the king! FRANCISCO Bernardo? BERNARDO He. FRANCISCO You come most carefully upon your hour. BERNARDO 'Tis now struck twelve, get thee to bed, Francisco. FRANCISCO For this relief much thanks: 'tis bitter cold, And I am sick at heart. BERNARDO Have you had quiet guard? FRANCISCO Not a mouse sti"

In [9]:
# how long is the corp
len(corp)

174880

In [10]:
question = "I've to be mean to be nice, so bad starts and worse stays. And another thing good girl." #looking for "Thus bad begins, and worse remains behind"
print('Length of Question: ',len(question))

Length of Question:  87


In [11]:
#chunk the corpus into ~2x question size +/- 50% = ~4x sized chunks
setsize= 128 # size of chunks
setoffset = 56 #offset size ~50%

#even math or no?
if len(corp)%setsize == 0:
    chunks = len(corp)/setsize
else:
    chunks = len(corp)//setsize
    chunks += 1

print(f"Computed Chunks = {chunks}")

# chunk it out into the chus list
chus = []
for i in range(chunks):
    #set start position
    if i > 0:
        startpos = i * setsize - setoffset
    else:
        startpos = i * setsize
    #set endposition
    if i < chunks - 1:
        endpos = i * setsize + setsize + setoffset
    else: 
        endpos = i * setsize + setsize
        
    # catch errors that extend past corpus with offset
    if endpos > len(corp):
        endpos = len(corp)-1
    
    #add chunks to the chus list
    chus.append(corp[startpos:endpos])

Computed Chunks = 1367


In [12]:
#check our chunks --> how many? do they match?
len(chus)

1367

In [13]:
#lets embed them and stuff them in a chroma db (note the defined model override)
chrdb = ''
chrdb = create_vector_db(chus, model_path=modelsetter)

In [14]:
#make an embedding for the question 

qembeddings = HuggingFaceEmbeddings(
        
        model_name= modelsetter,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
qembed = qembeddings.embed_query(question)
qembed[0:4]

[0.05348237231373787,
 0.008560621179640293,
 -0.0029518732335418463,
 -0.0014147310284897685]

In [15]:
# query the db and find the top k results
results = chrdb.similarity_search_by_vector(    
    embedding = qembed,
    k = 3
)
for i in results:
    print("------------------------------",i)

------------------------------ page_content=' be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE What shall I do? HAMLET Not this, by no means, that I bid you do: Let the bloat king tempt you again to bed, Pinch wanton on your' metadata={'source': 'hamlet chunk 868'}
------------------------------ page_content=' be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE What shall I do? HAMLET Not this, by no means, that I bid you do: Let the bloat king tempt you again to bed, Pinch wanton on your' metadata={'source': 'hamlet chunk 868'}
------------------------------ page_content=' must be their scourge and minister. I will bestow him, and will answer well The death I gave him. So, again, good night. I must be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE W' metadata={'source': 'hamlet chunk 867'}


In [16]:
# the chunk numbers are in the results[i].metadata['source'] so let's get those out
# and then print the outputs neatly

for j in range(len(results)):
    #get the chunk number
    cn = "".join([x for x in results[j].metadata['source'] if x.isnumeric()])
    print(f"Chunk Number {cn}")
    print(chus[int(cn)])
    print('\n\n')

Chunk Number 868
 be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE What shall I do? HAMLET Not this, by no means, that I bid you do: Let the bloat king tempt you again to bed, Pinch wanton on your



Chunk Number 868
 be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE What shall I do? HAMLET Not this, by no means, that I bid you do: Let the bloat king tempt you again to bed, Pinch wanton on your



Chunk Number 867
 must be their scourge and minister. I will bestow him, and will answer well The death I gave him. So, again, good night. I must be cruel, only to be kind: Thus bad begins and worse remains behind. One word more, good lady. QUEEN GERTRUDE W



