# Search and Retrieval

## Setup

In [1]:
from curate_gpt.store import get_store

# Assumes the ChromaDB database at ../../db has already been populated
# (the cells below query its "ont_cl" collection).
store = get_store("chromadb", "../../db")

## Similarity Search

Searches the vector database for the objects whose embeddings are closest to the embedding of the query text.

In [2]:
# Each hit unpacks into (object, distance, info). In the recorded output the
# hits appear in order of increasing distance. Only the first 20 characters
# of info['document'] are printed to keep the listing short.
for obj, distance, info in store.search("forebrain neurons", collection="ont_cl"):
    print(f"* {obj['id']}\n    - Distance: {distance} Text: {info['document'][0:20]}...")

* ForebrainNeuronDevelopment
    - Distance: 0.2577448785305023 Text: forebrain neuron dev...
* ForebrainGenerationOfNeurons
    - Distance: 0.2579804062843323 Text: forebrain generation...
* ForebrainNeuronDifferentiation
    - Distance: 0.27386343479156494 Text: forebrain neuron dif...
* NeuronOfTheForebrain
    - Distance: 0.2808114290237427 Text: neuron of the forebr...
* ForebrainNeuroblastDifferentiation
    - Distance: 0.2961786091327667 Text: forebrain neuroblast...
* BasalForebrain
    - Distance: 0.3035440444946289 Text: basal forebrain A re...
* Forebrain
    - Distance: 0.30516189336776733 Text: forebrain The most a...
* NeuroblastDivisionInSubpallium
    - Distance: 0.3056851625442505 Text: neuroblast division ...
* ForebrainNeuralPlate
    - Distance: 0.3136727511882782 Text: forebrain neural pla...
* ForebrainDevelopment
    - Distance: 0.31519147753715515 Text: forebrain developmen...


## Retrieval

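Boolean (field-match) queries: retrieve objects by matching field values instead of ranking them by embedding similarity.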

In [35]:
# Select objects whose "id" field matches; list() collects the results so they
# can be displayed. As the output shows, each result is an
# (object, distance, info) triple, with distance 0.0 for this field match.
objs = list(store.find({"id": "NeuronOfTheForebrain"}, collection="ont_cl"))
objs

[({'id': 'NeuronOfTheForebrain',
   'label': 'neuron of the forebrain',
   'definition': 'A CNS neuron of the forebrain.',
   'aliases': None,
   'relationships': [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'},
    {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'},
    {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}],
   'logical_definition': [{'predicate': 'rdfs:subClassOf',
     'target': 'CNSNeuron_sensuVertebrata_'},
    {'predicate': 'HasSomaLocation', 'target': 'Forebrain'}],
   'original_id': 'CL:0012001'},
  0.0,
  {'document': "neuron of the forebrain A CNS neuron of the forebrain. [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'}, {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'}, {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}]"})]

## Including Vector Embeddings in results



In [14]:
# Ask the store to return the embedding vectors alongside metadata and documents.
objs = list(
    store.find(
        {"id": "NeuronOfTheForebrain"},
        collection="ont_cl",
        include=["metadatas", "documents", "embeddings"],
    )
)
# Each result is an (object, distance, info) triple. Index the info dict
# directly instead of unpacking into `_` / `__`: assigning those names at cell
# top level overrides IPython's output-history variables for the session.
info = objs[0][2]
# Show only the first 20 components of the embedding vector.
info["_embeddings"][0:20]

[-0.00770607078447938,
 0.009014262817800045,
 -0.0028122728690505028,
 0.014608148485422134,
 -0.014907942153513432,
 0.004258438479155302,
 0.007985424250364304,
 0.0027458411641418934,
 -0.011058313772082329,
 -0.018559979274868965,
 -0.0027458411641418934,
 0.024256067350506783,
 0.0012409089831635356,
 0.0037303923163563013,
 0.006833942607045174,
 0.009804628789424896,
 0.036438606679439545,
 -0.0029638733249157667,
 -0.00591411953791976,
 -0.00024549898807890713]

## Lookup by ID

Assumes that the objects in the collection have an identifier field (here, `id`).

In [6]:
# Fetch one object by its identifier. Unlike find(), the recorded output is the
# bare object dict rather than an (object, distance, info) triple.
store.lookup("NeuronOfTheForebrain", collection="ont_cl")

{'id': 'NeuronOfTheForebrain',
 'label': 'neuron of the forebrain',
 'definition': 'A CNS neuron of the forebrain.',
 'aliases': None,
 'relationships': [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'},
  {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'},
  {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}],
 'logical_definition': [{'predicate': 'rdfs:subClassOf',
   'target': 'CNSNeuron_sensuVertebrata_'},
  {'predicate': 'HasSomaLocation', 'target': 'Forebrain'}],
 'original_id': 'CL:0012001'}

## All by All

In [15]:
# An empty filter ({}) places no constraint on the match, so this presumably
# pulls the whole collection (16121 objects in the recorded run), including
# each object's embedding vector.
objs = list(store.find({}, collection="ont_cl", include=["metadatas", "documents", "embeddings"]))
len(objs)

16121

In [32]:
import numpy as np

def _unit_rows(vectors):
    """Return `vectors` as a float matrix with every row scaled to unit length."""
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # An all-zero row has no direction; dividing by its zero norm would yield
    # NaN (plus a RuntimeWarning). Divide by 1 instead so the row stays zero.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return matrix / safe_norms


def compute_cosine_similarity(list1, list2):
    """
    Compute the pairwise cosine similarity between two collections of vectors.

    :param list1: sequence of n1 equal-length numeric vectors (list of lists or 2-D array)
    :param list2: sequence of n2 numeric vectors with the same length as those in list1
    :return: numpy array of shape (n1, n2); entry [i][j] is the cosine
        similarity between list1[i] and list2[j]. A row that is entirely zero
        has similarity 0 with every vector instead of NaN.
    """
    # Once both sides are unit length, the dot product of two rows is exactly
    # their cosine similarity.
    return np.dot(_unit_rows(list1), _unit_rows(list2).T)

# Sample lists of vectors
# list1[2] and list2[0] are the same vector ([7, 8, 9]), so entry [2][0] of the
# result should be 1.0.
list1 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list2 = [[7, 8, 9], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

cosine_similarity_matrix = compute_cosine_similarity(list1, list2)
cosine_similarity_matrix


array([[0.95941195, 0.26726124, 0.53452248, 0.80178373],
       [0.99819089, 0.45584231, 0.56980288, 0.68376346],
       [1.        , 0.50257071, 0.57436653, 0.64616234]])

In [33]:
# Sanity check: row 2 of list1 against row 0 of list2 (identical vectors).
cosine_similarity_matrix[2][0]

1.0

In [17]:
# Collect the embedding vector from the info dict of every
# (object, distance, info) result.
vectors = [info["_embeddings"] for _, __, info in objs]

In [18]:
# All-by-all comparison: the same vectors are passed on both sides, giving an
# N x N matrix (N = len(vectors)) whose diagonal compares each vector with itself.
cosine_similarity_matrix = compute_cosine_similarity(vectors, vectors)

In [19]:
# Number of rows in the matrix, one per input vector.
len(cosine_similarity_matrix)

16121

In [20]:
def top_matches(cosine_similarity_matrix, exclude_self=False):
    """
    Find the best-scoring column for every row of a similarity matrix.

    :param cosine_similarity_matrix: 2-D array-like of similarity scores
        (one row per query vector, one column per candidate vector)
    :param exclude_self: if True, ignore the main diagonal (entry [i][i]) when
        choosing the best match. This is useful for an all-by-all matrix, where
        every vector's best match would otherwise be itself. Defaults to False,
        which considers every entry.
    :return: tuple ``(indices, values)`` where ``indices[i]`` is the column of
        the highest score in row i and ``values[i]`` is that score. With
        ``exclude_self=True`` the values are floats; a row with no
        off-diagonal entries yields ``-inf``.
    """
    scores = np.asarray(cosine_similarity_matrix)
    if exclude_self:
        # astype() returns a copy, so the caller's matrix is left untouched
        # (note: this temporarily doubles memory for very large matrices).
        # A float dtype is needed so the diagonal can hold -inf.
        scores = scores.astype(float)
        np.fill_diagonal(scores, -np.inf)

    # Column index of the maximum value in each row
    top_match_indices = np.argmax(scores, axis=1)

    # The maximum value itself in each row
    top_match_values = np.amax(scores, axis=1)

    return top_match_indices, top_match_values

In [34]:
# Toy check: each row of list1 is a unit axis vector that also appears in
# list2 (at positions 3, 2 and 1 respectively), so those are the expected top
# matches, each with similarity 1.
list1 = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
list2 = [[0.5, 0.5, 0.5], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Re-compute the cosine similarity matrix
test_matrix = compute_cosine_similarity(list1, list2)

# Find the top matches
top_indices, top_values = top_matches(test_matrix)
top_indices, top_values

(array([3, 2, 1]), array([1., 1., 1.]))

In [22]:
# NOTE(review): cosine_similarity_matrix compares the vectors with themselves,
# so each row's maximum is its own diagonal entry (similarity 1); the recorded
# outputs of the next two cells confirm this. To find the nearest *other*
# object, the diagonal would have to be excluded first.
tm_ix, tm_vals = top_matches(cosine_similarity_matrix)
len(tm_ix)

16121

In [24]:
# Best-match column for the first five rows; in the recorded output each row
# matches its own index (a self-match).
tm_ix[0:5]

array([0, 1, 2, 3, 4])

In [25]:
# Similarity of those best matches; the recorded values are all 1 (self-similarity).
tm_vals[0:5]

array([1., 1., 1., 1., 1.])

## Latency Check

In [74]:
# Re-open the same store and take its `client` attribute so the collection can
# be queried directly (presumably the raw ChromaDB client - TODO confirm).
store = get_store("chromadb", "../../db")
client = store.client

In [75]:
# Handle to the "ont_ecosim" collection, obtained from the client directly.
cxn = client.get_collection("ont_ecosim")

In [76]:
# Fetch with an empty `where` filter, requesting metadata, documents and embeddings.
# NOTE(review): nothing is timed here; consider %%time if latency is the point.
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])

In [77]:
# Number of embedding vectors returned.
len(x["embeddings"])

1690

In [78]:
# Length (dimensionality) of the first embedding vector.
len(x["embeddings"][0])

1536

In [69]:
# NOTE(review): identical to the earlier cxn.get(...) cell and executed out of
# order (In [69]); probably a leftover re-run that can be removed or timed.
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])

## DuckDBVSSAdapter

In [1]:
from curate_gpt.store import get_store
# Open a DuckDB-backed store ("duckdb_vss" adapter) at ./duckdb3.db.
duck_store = get_store("duckdb_vss", "./duckdb3.db")

In [2]:
# The recorded output is an empty list: no collections exist yet.
duck_store.list_collection_names()

[]

In [3]:
# Example record with the same fields as the "ont_cl" objects shown earlier
# (id, label, definition, aliases, relationships, logical_definition, original_id).
obj_to_insert = {
    'id': '10MinuteAPGARScoreOf0',
    'label': '10-minute APGAR score of 0',
    'definition': None,
    'aliases': None,
    'relationships': [{'predicate': 'subClassOf', 'target': 'Low10MinuteAPGARScore'}],
    'logical_definition': None,
    'original_id': 'HP:0033468'
}

In [4]:
# Insert the full example record into an explicitly named collection.
duck_store.insert([obj_to_insert], collection="test_collection")

# Insert six minimal records, one call per record and in this order. No
# collection is given here; later cells find these records in
# "test_collection", so that is presumably the store's default - TODO confirm.
for ident, label in (
    ("test7", "This is no test"),
    ("test2", "This is no test"),
    ("test3", "This is a test"),
    ("test4", "This is a test"),
    ("test5", "This is no test"),
    ("test6", "This is no test"),
):
    duck_store.insert([{"id": ident, "label": label}])



In [5]:
import itertools
# Take at most the first five results from matches() without consuming the rest.
# NOTE(review): the first recorded result is "10MinuteAPGARScoreOf0", not
# "test3" - confirm what matches() is expected to return for this query.
h = duck_store.matches({"id":"test3"}, collection="test_collection")
v = list(itertools.islice(h, 5))
v

[{
   "id": "10MinuteAPGARScoreOf0",
   "metadata": {
     "id": "10MinuteAPGARScoreOf0",
     "label": "10-minute APGAR score of 0",
     "definition": null,
     "aliases": null,
     "relationships": [
       {
         "predicate": "subClassOf",
         "target": "Low10MinuteAPGARScore"
       }
     ],
     "logical_definition": null,
     "original_id": "HP:0033468"
   },
   "embeddings": [
     -0.0467611625790596,
     0.09052881598472595,
     -0.034534912556409836,
     -0.019270261749625206,
     -0.03490646183490753,
     0.10417038947343826,
     0.03309764340519905,
     0.021112071350216866,
     0.028908856213092804,
     -0.06762091815471649,
     0.0714174211025238,
     -0.12092143297195435,
     -0.00558472378179431,
     -0.03554216027259827,
     -0.021002447232604027,
     0.09251294285058975,
     0.003853082424029708,
     -0.03027546964585781,
     -0.036024533212184906,
     -0.09964665025472641,
     0.040245719254016876,
     0.05557608976960182,
     0.01

In [6]:
# Peek at the collection. The recorded output starts with a "__metadata__"
# record (collection name and embedding model) before the data records.
# NOTE(review): embeddings appear in the output although `include` does not
# request them - confirm whether `include` is honoured here.
peeklist = list(duck_store.peek("test_collection", include=["metadatas", "documents"]))
print(type(peeklist))
peeklist

<class 'list'>


[{
   "id": "__metadata__",
   "metadata": {
     "name": "test_collection",
     "model": "all-MiniLM-L6-v2"
   },
   "embeddings": null,
   "documents": null,
   "distance": null
 },
 {
   "id": "10MinuteAPGARScoreOf0",
   "metadata": {
     "id": "10MinuteAPGARScoreOf0",
     "label": "10-minute APGAR score of 0",
     "definition": null,
     "aliases": null,
     "relationships": [
       {
         "predicate": "subClassOf",
         "target": "Low10MinuteAPGARScore"
       }
     ],
     "logical_definition": null,
     "original_id": "HP:0033468"
   },
   "embeddings": [
     -0.0467611625790596,
     0.09052881598472595,
     -0.034534912556409836,
     -0.019270261749625206,
     -0.03490646183490753,
     0.10417038947343826,
     0.03309764340519905,
     0.021112071350216866,
     0.028908856213092804,
     -0.06762091815471649,
     0.0714174211025238,
     -0.12092143297195435,
     -0.00558472378179431,
     -0.03554216027259827,
     -0.021002447232604027,
     0.09251

In [7]:
# Field-match query on "id"; list() collects the results for display.
o = list(duck_store.find({"id": "test3"}, collection="test_collection"))
o

[{
   "id": "test3",
   "metadata": {
     "id": "test3",
     "label": "This is a test"
   },
   "embeddings": [
     -0.05806167051196098,
     0.058285512030124664,
     -0.11568114161491394,
     0.041325975209474564,
     0.05392612889409065,
     0.04977672919631004,
     0.14893536269664764,
     -0.054948654025793076,
     0.03722340613603592,
     -0.09625576436519623,
     0.05694819614291191,
     -0.07849280536174774,
     0.0009167090174742043,
     -0.03686182200908661,
     0.014377027750015259,
     -0.006707538850605488,
     -0.021141791716217995,
     -0.03004765696823597,
     -0.05046338215470314,
     -0.06385001540184021,
     -0.02089228294789791,
     0.05747392773628235,
     0.05536378175020218,
     0.05975782498717308,
     -0.03910595551133156,
     0.020993299782276154,
     -0.018639154732227325,
     0.02366769313812256,
     0.022258255630731583,
     -0.02760941907763481,
     0.06982947885990143,
     0.06326280534267426,
     -0.007301160600036383,


In [8]:
# "test1" is not among the ids inserted above, and this cell has no recorded
# output - presumably lookup() returns None for an unknown id (TODO confirm).
x = duck_store.lookup("test1", collection="test_collection")
x

In [9]:
# Same field-match query for "test2"; here list() is applied at display time.
x = duck_store.find({"id": "test2"}, collection="test_collection")
list(x)

[{
   "id": "test2",
   "metadata": {
     "id": "test2",
     "label": "This is no test"
   },
   "embeddings": [
     -0.009685405530035496,
     0.0626065582036972,
     -0.07494169473648071,
     0.03725627809762955,
     0.04079523682594299,
     0.039620302617549896,
     0.07771033048629761,
     -0.01656809262931347,
     0.012735242955386639,
     -0.10979384183883667,
     0.07689767330884933,
     -0.0932522714138031,
     -0.011944085359573364,
     -0.0295057725161314,
     0.023117193952202797,
     0.013776317238807678,
     -0.022287830710411072,
     -0.06752121448516846,
     -0.007656003348529339,
     -0.05927116051316261,
     -0.045386068522930145,
     0.05243339389562607,
     0.06196647137403488,
     0.01444204617291689,
     -0.0518176443874836,
     0.014731470495462418,
     -0.06356256455183029,
     0.036190494894981384,
     -0.0042460085824131966,
     -0.04265131428837776,
     0.1007007509469986,
     0.07054686546325684,
     -0.0767286866903305,
   

In [10]:
# Empty filter: no constraint on the match. The recorded output begins with the
# "__metadata__" record, so that bookkeeping entry is returned along with the data.
x = list(duck_store.find({}, collection="test_collection"))
x

[{
   "id": "__metadata__",
   "metadata": {
     "name": "test_collection",
     "model": "all-MiniLM-L6-v2"
   },
   "embeddings": null,
   "documents": null,
   "distance": null
 },
 {
   "id": "10MinuteAPGARScoreOf0",
   "metadata": {
     "id": "10MinuteAPGARScoreOf0",
     "label": "10-minute APGAR score of 0",
     "definition": null,
     "aliases": null,
     "relationships": [
       {
         "predicate": "subClassOf",
         "target": "Low10MinuteAPGARScore"
       }
     ],
     "logical_definition": null,
     "original_id": "HP:0033468"
   },
   "embeddings": [
     -0.0467611625790596,
     0.09052881598472595,
     -0.034534912556409836,
     -0.019270261749625206,
     -0.03490646183490753,
     0.10417038947343826,
     0.03309764340519905,
     0.021112071350216866,
     0.028908856213092804,
     -0.06762091815471649,
     0.0714174211025238,
     -0.12092143297195435,
     -0.00558472378179431,
     -0.03554216027259827,
     -0.021002447232604027,
     0.09251

In [11]:
# Similarity search for the text "test"; list() collects the hits so they can
# be indexed in the following cells.
search_result = list(duck_store.search("test", collection="test_collection"))

In [12]:
# Number of hits: 7 in the recorded run, equal to the number of records inserted above.
len(search_result)

7

In [13]:
# First hit returned by the search.
search_result[0]

{
  "id": "10MinuteAPGARScoreOf0",
  "metadata": {
    "id": "10MinuteAPGARScoreOf0",
    "label": "10-minute APGAR score of 0",
    "definition": null,
    "aliases": null,
    "relationships": [
      {
        "predicate": "subClassOf",
        "target": "Low10MinuteAPGARScore"
      }
    ],
    "logical_definition": null,
    "original_id": "HP:0033468"
  },
  "embeddings": [
    -0.0467611625790596,
    0.09052881598472595,
    -0.034534912556409836,
    -0.019270261749625206,
    -0.03490646183490753,
    0.10417038947343826,
    0.03309764340519905,
    0.021112071350216866,
    0.028908856213092804,
    -0.06762091815471649,
    0.0714174211025238,
    -0.12092143297195435,
    -0.00558472378179431,
    -0.03554216027259827,
    -0.021002447232604027,
    0.09251294285058975,
    0.003853082424029708,
    -0.03027546964585781,
    -0.036024533212184906,
    -0.09964665025472641,
    0.040245719254016876,
    0.05557608976960182,
    0.01469405647367239,
    0.03041153959929943

In [13]:
# Second hit returned by the search.
# NOTE(review): the recorded output here shows "test3", whereas the summary
# loop at the end of the notebook prints "test2" for index 1 - the saved
# outputs appear to come from different runs; re-run all cells to refresh them.
search_result[1]

{
  "id": "test3",
  "metadata": {
    "id": "test3",
    "label": "This is a test"
  },
  "embeddings": [
    -0.05806167051196098,
    0.058285512030124664,
    -0.11568114161491394,
    0.041325975209474564,
    0.05392612889409065,
    0.04977672919631004,
    0.14893536269664764,
    -0.054948654025793076,
    0.03722340613603592,
    -0.09625576436519623,
    0.05694819614291191,
    -0.07849280536174774,
    0.0009167090174742043,
    -0.03686182200908661,
    0.014377027750015259,
    -0.006707538850605488,
    -0.021141791716217995,
    -0.03004765696823597,
    -0.05046338215470314,
    -0.06385001540184021,
    -0.02089228294789791,
    0.05747392773628235,
    0.05536378175020218,
    0.05975782498717308,
    -0.03910595551133156,
    0.020993299782276154,
    -0.018639154732227325,
    0.02366769313812256,
    0.022258255630731583,
    -0.02760941907763481,
    0.06982947885990143,
    0.06326280534267426,
    -0.007301160600036383,
    -0.0334455743432045,
    0.081818275

In [14]:
import json

# Print the first three hits: a header line with the hit's index and distance,
# followed by its metadata as indented JSON. Slicing the list replaces the
# manual break counter.
for i, res in enumerate(search_result[:3]):
    print(f"## {i} DISTANCE: {res.distance}")
    print(json.dumps(res.metadata, indent=2))

## 0 DISTANCE: 0.2383926659822464
{
  "id": "10MinuteAPGARScoreOf0",
  "label": "10-minute APGAR score of 0",
  "definition": null,
  "aliases": null,
  "relationships": [
    {
      "predicate": "subClassOf",
      "target": "Low10MinuteAPGARScore"
    }
  ],
  "logical_definition": null,
  "original_id": "HP:0033468"
}
## 1 DISTANCE: 0.4408803880214691
{
  "id": "test2",
  "label": "This is no test"
}
## 2 DISTANCE: 0.4502813220024109
{
  "id": "test5",
  "label": "This is no test"
}
