# Search and Retrieval

## Setup

In [1]:
from curate_gpt.store import get_store

# The on-disk database is expected to exist and to be populated already.
DB_PATH = "../../db"
store = get_store("chromadb", DB_PATH)

## Similarity Search

Embeds the query text and searches the vector database for the stored entries whose embeddings are closest to it.

In [2]:
# Query the ont_cl collection with free text, then list each hit's id, its
# distance, and the first 20 characters of the indexed document text.
results = store.search("forebrain neurons", collection="ont_cl")
for obj, distance, info in results:
    snippet = info["document"][0:20]
    print(f"* {obj['id']}\n    - Distance: {distance} Text: {snippet}...")

* ForebrainNeuronDevelopment
    - Distance: 0.2577448785305023 Text: forebrain neuron dev...
* ForebrainGenerationOfNeurons
    - Distance: 0.2579804062843323 Text: forebrain generation...
* ForebrainNeuronDifferentiation
    - Distance: 0.27386343479156494 Text: forebrain neuron dif...
* NeuronOfTheForebrain
    - Distance: 0.2808114290237427 Text: neuron of the forebr...
* ForebrainNeuroblastDifferentiation
    - Distance: 0.2961786091327667 Text: forebrain neuroblast...
* BasalForebrain
    - Distance: 0.3035440444946289 Text: basal forebrain A re...
* Forebrain
    - Distance: 0.30516189336776733 Text: forebrain The most a...
* NeuroblastDivisionInSubpallium
    - Distance: 0.3056851625442505 Text: neuroblast division ...
* ForebrainNeuralPlate
    - Distance: 0.3136727511882782 Text: forebrain neural pla...
* ForebrainDevelopment
    - Distance: 0.31519147753715515 Text: forebrain developmen...


## Retrieval

Boolean (exact-match) queries on object fields, for example matching on `id`.

In [35]:
# Filter on the `id` field and materialize the results so they can be displayed.
id_filter = {"id": "NeuronOfTheForebrain"}
objs = list(store.find(id_filter, collection="ont_cl"))
objs

[({'id': 'NeuronOfTheForebrain',
   'label': 'neuron of the forebrain',
   'definition': 'A CNS neuron of the forebrain.',
   'aliases': None,
   'relationships': [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'},
    {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'},
    {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}],
   'logical_definition': [{'predicate': 'rdfs:subClassOf',
     'target': 'CNSNeuron_sensuVertebrata_'},
    {'predicate': 'HasSomaLocation', 'target': 'Forebrain'}],
   'original_id': 'CL:0012001'},
  0.0,
  {'document': "neuron of the forebrain A CNS neuron of the forebrain. [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'}, {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'}, {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}]"})]

## Including Vector Embeddings in results



In [14]:
# Request the stored embedding vectors in addition to metadata and documents.
objs = list(
    store.find(
        {"id": "NeuronOfTheForebrain"},
        collection="ont_cl",
        include=["metadatas", "documents", "embeddings"],
    )
)
# Each result is an (object, distance, info) triple. Index the info dict directly
# rather than unpacking into `_` / `__`: binding those names at the top level
# shadows IPython's output-history variables for the rest of the session.
info = objs[0][2]
# The vector is exposed under the "_embeddings" key; show only its first 20 components.
info["_embeddings"][0:20]

[-0.00770607078447938,
 0.009014262817800045,
 -0.0028122728690505028,
 0.014608148485422134,
 -0.014907942153513432,
 0.004258438479155302,
 0.007985424250364304,
 0.0027458411641418934,
 -0.011058313772082329,
 -0.018559979274868965,
 -0.0027458411641418934,
 0.024256067350506783,
 0.0012409089831635356,
 0.0037303923163563013,
 0.006833942607045174,
 0.009804628789424896,
 0.036438606679439545,
 -0.0029638733249157667,
 -0.00591411953791976,
 -0.00024549898807890713]

## Lookup by ID

Assumes that objects in the collection have an identifier field (here `id`).

In [6]:
# Returns just the object dict (no distance or document info alongside it).
store.lookup("NeuronOfTheForebrain", collection="ont_cl")

{'id': 'NeuronOfTheForebrain',
 'label': 'neuron of the forebrain',
 'definition': 'A CNS neuron of the forebrain.',
 'aliases': None,
 'relationships': [{'predicate': 'HasSomaLocation', 'target': 'Forebrain'},
  {'predicate': 'subClassOf', 'target': 'CNSNeuron_sensuVertebrata_'},
  {'predicate': 'subClassOf', 'target': 'CentralNervousSystemNeuron'}],
 'logical_definition': [{'predicate': 'rdfs:subClassOf',
   'target': 'CNSNeuron_sensuVertebrata_'},
  {'predicate': 'HasSomaLocation', 'target': 'Forebrain'}],
 'original_id': 'CL:0012001'}

## All by All

In [15]:
# An empty filter is used to pull every object in the collection; embeddings are
# requested so that pairwise similarities can be computed from them.
include_fields = ["metadatas", "documents", "embeddings"]
objs = list(store.find({}, collection="ont_cl", include=include_fields))
len(objs)

16121

In [32]:
import numpy as np

def compute_cosine_similarity(list1, list2):
    """
    Compute the pairwise cosine similarity between two collections of vectors.

    :param list1: 2-D array-like of shape (n, d); one vector per row.
    :param list2: 2-D array-like of shape (m, d); one vector per row.
    :return: numpy array of shape (n, m) where entry [i, j] is the cosine
        similarity between list1[i] and list2[j].

    A zero-length (all-zero) vector has no direction; its similarity to every
    other vector is reported as 0 rather than NaN.
    """
    matrix1 = np.asarray(list1)
    matrix2 = np.asarray(list2)

    # Row-wise L2 norms, kept as column vectors so they broadcast across each row.
    norms1 = np.linalg.norm(matrix1, axis=1, keepdims=True)
    norms2 = np.linalg.norm(matrix2, axis=1, keepdims=True)

    # Avoid 0/0 -> NaN: dividing an all-zero row by 1 leaves it all-zero, so its
    # dot product (and therefore its similarity) with anything is 0.
    norms1[norms1 == 0] = 1.0
    norms2[norms2 == 0] = 1.0

    # Dot products of unit vectors are exactly the cosine similarities.
    return np.dot(matrix1 / norms1, (matrix2 / norms2).T)

# Sample lists of vectors: a small sanity check with hand-checkable values.
# Row i of the result corresponds to list1[i] and column j to list2[j].
list1 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list2 = [[7, 8, 9], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

cosine_similarity_matrix = compute_cosine_similarity(list1, list2)
cosine_similarity_matrix


array([[0.95941195, 0.26726124, 0.53452248, 0.80178373],
       [0.99819089, 0.45584231, 0.56980288, 0.68376346],
       [1.        , 0.50257071, 0.57436653, 0.64616234]])

In [33]:
# list1[2] and list2[0] are both [7, 8, 9], so their similarity should be 1.
cosine_similarity_matrix[2, 0]

1.0

In [17]:
# Pull the embedding vector out of the info dict (third element) of every result.
vectors = [result[2]["_embeddings"] for result in objs]

In [18]:
# All-by-all comparison: every embedding against every embedding (itself included).
# NOTE(review): with 16121 vectors this is a dense 16121 x 16121 matrix
# (roughly 2 GB if float64) — confirm it fits in memory before re-running.
# This rebinds `cosine_similarity_matrix`, replacing the small sample matrix.
cosine_similarity_matrix = compute_cosine_similarity(vectors, vectors)

In [19]:
# One row per input vector.
cosine_similarity_matrix.shape[0]

16121

In [20]:
def top_matches(cosine_similarity_matrix, exclude_self=False):
    """
    Find, for every row of a similarity matrix, the best-scoring column.

    :param cosine_similarity_matrix: 2-D array-like; entry [i, j] is the
        similarity between item i (row) and item j (column).
    :param exclude_self: when True, ignore the diagonal so that an item is
        never reported as its own best match. This is only meaningful for a
        square all-by-all matrix, where the diagonal otherwise always wins.
        Defaults to False, which considers every column.
    :return: tuple ``(indices, values)`` of 1-D arrays: the column index of
        the best match for each row, and the similarity at that index.
    :raises ValueError: if ``exclude_self`` is True and the matrix is not a
        square 2-D matrix.

    Note: with ``exclude_self=True`` the matrix is copied, which doubles peak
    memory for large inputs. A 1 x 1 matrix then has no candidate left and
    yields a value of ``-inf``.
    """
    if exclude_self:
        # Work on a float copy so the caller's matrix is left untouched and
        # the diagonal can hold -inf regardless of the input dtype.
        matrix = np.array(cosine_similarity_matrix, dtype=float)
        if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
            raise ValueError("exclude_self requires a square 2-D similarity matrix")
        # -inf can never be the row maximum while any other entry exists.
        np.fill_diagonal(matrix, -np.inf)
    else:
        matrix = np.asarray(cosine_similarity_matrix)

    # Column index of the maximum value in each row.
    top_match_indices = np.argmax(matrix, axis=1)

    # The maximum similarity value in each row.
    top_match_values = np.amax(matrix, axis=1)

    return top_match_indices, top_match_values

In [34]:
# Sanity check for top_matches: each row of list1 is a unit basis vector that
# also appears verbatim in list2 (at indices 3, 2 and 1 respectively), so those
# are the expected best-match indices, each with similarity 1.
# Note: this rebinds the notebook-level names list1 and list2.
list1 = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
list2 = [[0.5, 0.5, 0.5], [1, 0, 0], [0, 1, 0], [0, 0, 1]]

# Re-compute the cosine similarity matrix for the new sample vectors
test_matrix = compute_cosine_similarity(list1, list2)

# Find the top matches (best column per row) and display them
top_indices, top_values = top_matches(test_matrix)
top_indices, top_values

(array([3, 2, 1]), array([1., 1., 1.]))

In [22]:
# Best match per row of the all-by-all matrix.
# NOTE(review): every vector is also compared with itself here, so the top match
# is likely the row's own index with similarity 1 — confirm this is intended.
tm_ix, tm_vals = top_matches(cosine_similarity_matrix)
len(tm_ix)

16121

In [24]:
# Peek at the best-match column index for the first five rows.
tm_ix[:5]

array([0, 1, 2, 3, 4])

In [25]:
# Peek at the best-match similarity value for the first five rows.
tm_vals[:5]

array([1., 1., 1., 1., 1.])

## Latency Check

In [74]:
# Re-open the store and take its `client` attribute so the backend can be
# queried directly (presumably the raw ChromaDB client — confirm).
store = get_store("chromadb", "../../db")
client = store.client

In [75]:
# Handle to the ont_ecosim collection (not the ont_cl collection used earlier).
cxn = client.get_collection("ont_ecosim")

In [76]:
# Fetch records with an empty `where` filter (presumably matching everything),
# including metadata, documents and embeddings in a single call.
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])

In [77]:
# Number of embedding vectors returned.
len(x["embeddings"])

1690

In [78]:
# Length (dimensionality) of the first embedding vector.
len(x["embeddings"][0])

1536

In [69]:
# NOTE(review): identical to the earlier `cxn.get` cell and carries an older
# execution count — presumably a leftover re-run for timing; consider wrapping
# in %%time or removing.
x = cxn.get(where={}, include=['metadatas', 'documents', 'embeddings'])