Instantiate your embeddings model

In [None]:
# Hugging Face embedding model used for every chunk in this notebook.
model_name = "Alibaba-NLP/gte-large-en-v1.5"

# Loader options: run on CPU and allow the model repository's own code to execute.
model_kwargs = dict(device='cpu', trust_remote_code=True)

# Encoding options: vectors are left un-normalised.
encode_kwargs = dict(normalize_embeddings=False)

# NOTE: this model's embedding dimension is 1024, NOT 768.
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

Load your saved vectordb

In [None]:
persist_directory = 'fulldb_cliuserguide'
!unzip "/content/drive/MyDrive/Colab Notebooks/My NN stuff/alteon/chroma_db_cliuserguide.zip"
vectordb_cliguide = Chroma(persist_directory=persist_directory, embedding_function=hf)
print(vectordb_cliguide._collection.count())

Convert to pandas for easier processing

In [None]:

# Fetch every record stored in the collection, including the raw vectors.
top = vectordb_cliguide.get(include=['uris','documents','metadatas', 'embeddings'])

# Depending on the chromadb version, embeddings may come back as a list of lists
# or as a 2-D numpy array. pandas refuses a 2-D array as a single dict column, so
# normalise to plain Python lists of floats (a no-op for list-of-lists input).
embedding_rows = [[float(value) for value in vector] for vector in top['embeddings']]

newdict = {}
newdict['ids'] = top['ids']
newdict['metadatas'] = top['metadatas']
newdict['docs'] = top['documents']
newdict['embedding'] = embedding_rows

df = pd.DataFrame(newdict)
# The chunk text is exported under the column name "name".
df = df.rename(columns={"docs":"name"}, errors="raise")
# Sequential, 1-based ids; these are also used as the GCS object names of the chunks.
df['id'] = ["cliguide_chunks/"+str(i) for i in range(1, len(df) + 1)]
print (df.columns.to_list()) # ['ids', 'metadatas', 'name', 'embedding', 'id']
print (df.head(3))
print (df.tail(3))




Save to a JSON file on disk for later use

In [None]:
# Serialise the id, chunk text and vector of every row as one JSON object per line.
export_columns = ["id", "name", "embedding"]
jsonl_string = df[export_columns].to_json(orient="records", lines=True)

# The file carries a .json extension but its content is line-delimited JSON.
with open("cliguideembeddings.json", "w") as out_file:
    out_file.write(jsonl_string)

**Upload chunks for retrieval to GCS**

- GCS file names for the chunks should be the same as the ids of their embeddings

For example:
  
                                           embedding                 id  
0  [0.07530160993337631, -0.38302767276763916, 0....  cliguide_chunks/1  
1  [-0.7277048826217651, -0.15646065771579742, 0....  cliguide_chunks/2  
2  [-0.2633463442325592, -0.06908517330884933, 0....  cliguide_chunks/3  

*For the chunk whose id is cliguide_chunks/1, use cliguide_chunks/1 as the GCS file name.*

In [None]:

from google.cloud import storage


def upload_blob_from_memory(contents, bucket_name ="yourbucket", destination_blob_name = 'cliguide_chunks/1', storage_client=None):
    """Upload in-memory contents to an object in a Google Cloud Storage bucket.

    Args:
        contents: Data to store; handed unchanged to ``blob.upload_from_string``.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name to create inside the bucket.
        storage_client: Optional client to reuse. When omitted, a new
            ``storage.Client()`` is created for this call, which is wasteful when
            uploading many objects in a loop.

    Returns:
        None.
    """
    # Only build a client when the caller did not supply one.
    if storage_client is None:
        storage_client = storage.Client()

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_string(contents)


##### Upload each text chunk to GCS under the object name stored in its `id` #####

# Column-oriented dict: {'id': {row_label: id, ...}, 'name': {row_label: text, ...}}
newdictt =  df[["id","name"]].to_dict()

# Iterate over the actual row labels and read the blob name from the `id` column,
# rather than assuming labels 0..n-1 and rebuilding the name from a counter.
# bucket_name is not passed, so the function's default bucket is used.
for row_label, chunk_text in newdictt['name'].items():
    upload_blob_from_memory(chunk_text, destination_blob_name=newdictt['id'][row_label])

Upload the embeddings JSON file to GCS

In [None]:
# Select the active gcloud project, then copy the embeddings file to the bucket.
# {PROJECT_ID} and {GCS_BUCKET_URI} are interpolated from Python variables that are
# not defined in the cells shown here -- they must be set before running this cell.
!gcloud config set project {PROJECT_ID}
!gsutil cp cliguideembeddings.json {GCS_BUCKET_URI}

Create vector search index and endpoint

In [None]:
from google.cloud import aiplatform
DIMENSIONS = 1024  # Dimensions
index_id = "index_dims1024_stream"
index_desc = "index_dims1024_stream"

# All tree-AH index settings gathered in one place before the create call.
index_settings = dict(
    display_name=index_id,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    leaf_node_embedding_count=500,   # The default value is 1000
    leaf_nodes_to_search_percent=7,  # The default value is 10 (means 10%)
    description=index_desc,
    index_update_method="stream_update",
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)
index_stream = aiplatform.MatchingEngineIndex.create_tree_ah_index(**index_settings)

# Once you run this cell, the index id is returned, for example:
## projects/930596367380/locations/us-central1/indexes/5647386389373779968

Setup endpoint

The endpoint creation might take more than an hour, so you can cancel the cell

In [None]:
# Create an index endpoint with a public endpoint enabled.
index_endpoint_2 = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="RAG_endpoint_2",
    description="RAG endpoint 2",
    public_endpoint_enabled=True,
)

# Id of the index created earlier; replace it with the id returned for your index.
my_index_id_2 = "5647386389373779968" # your index id

# Handle to the existing index, looked up by id.
index = aiplatform.MatchingEngineIndex(index_name=my_index_id_2)

# Deploy the index onto the endpoint; the result is assigned back to the same name.
index_endpoint_2 = index_endpoint_2.deploy_index(
    index=index, deployed_index_id="index_dims1024_stream"
)

**Once the endpoint is ready, go to edit index and upload the JSON file "cliguideembeddings.json"**