# Creating the database

In [1]:
from util.run_go_term_embedding_api import load_output_jsonl
import json

import numpy as np
import pandas as pd

import pandas as pd
from pymilvus import (
    connections,
    FieldSchema, CollectionSchema, DataType,
    Collection
)

# Connect to Milvus (assumes Milvus is running on localhost at port 19530)
connections.connect(alias="default", host="localhost", port="19530")

# Define schema for the embeddings collection using string IDs.
id_field = FieldSchema(
    name="id", 
    dtype=DataType.VARCHAR, 
    max_length=150,
    is_primary=True, 
    auto_id=False, 
    description="Unique identifier for each node (string)"
)
embedding_field = FieldSchema(
    name="embedding", 
    dtype=DataType.FLOAT_VECTOR, 
    dim=1536,
    description="High-dimensional vector for the node"
)
embeddings_schema = CollectionSchema(
    fields=[id_field, embedding_field], 
    description="Collection to store node embeddings"
)

# Create the embeddings collection
embeddings_collection = Collection(name="embeddings_collection", schema=embeddings_schema)
print("Created collection:", embeddings_collection.name)


for file_number in range(0, 32):
    print(f"Processing chunk {file_number}")
    file_df = pd.read_parquet(f"data/parquet_chunks/chunk_{file_number}.parquet")
    file_df = file_df.rename(columns={"node_id": "id"})
    # remove rows where id is empty
    
    # Insert the data into Milvus in batches of 1000
    chunk_size = 10000
    for start in range(0, len(file_df), chunk_size):
        chunk_df = file_df.iloc[start:start + chunk_size]
        # You can now process each chunk as needed
        #remove rows where id is empty
        chunk_df = chunk_df[chunk_df["id"].notna()]
        #filter out rows where id is longer than 100 characters
        chunk_df = chunk_df[chunk_df["id"].str.len() <= 100]
        
        print("max id length", chunk_df["id"].str.len().max())
        

        data_embeddings = [

            chunk_df["id"].str[:].tolist(),          # string IDs
            chunk_df["embedding"].tolist()
        ]
        insert_result = embeddings_collection.insert(data_embeddings)

    # now flush the collection
    embeddings_collection.flush()
    
print("creating index")
index_params = {
    "index_type": "IVF_FLAT",  # Choose the appropriate index type
    "metric_type": "L2",       # Choose the appropriate metric
    "params": {"nlist": 128}   # Adjust 'nlist' as needed
}
embeddings_collection.create_index(field_name="embedding", index_params=index_params)

#flush the collection
embeddings_collection.flush()




MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>

In [51]:
embeddings_collection.flush()

In [52]:
embeddings_collection.load()

In [None]:
print("creating index")
index_params = {
    "index_type": "IVF_FLAT",  # Choose the appropriate index type
    "metric_type": "COSINE",       # Choose the appropriate metric
    "params": {"nlist": 128}   # Adjust 'nlist' as needed
}

embeddings_collection.create_index(field_name="embedding", index_params=index_params)

#flush the collection
embeddings_collection.flush()

In [53]:

# Search for the following embedding in the database:
embeddings_collection.load()
query_embedding = classes_df.loc[0, "embedding"]

search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = embeddings_collection.search(
    data=[query_embedding],
    anns_field="embedding",
    param=search_params,
    limit=5,
    output_fields=["id"]
)

# Results
print("Search Results:", results)

Search Results: data: ['["id: nuclear chromosome [cellular c, distance: 0.17091326415538788, entity: {\'id\': \'nuclear chromosome [cellular c\'}", "id: nuclear chromosomes [subcellul, distance: 0.21435533463954926, entity: {\'id\': \'nuclear chromosomes [subcellul\'}", "id: nuclear chromosomes [organ], distance: 0.22352105379104614, entity: {\'id\': \'nuclear chromosomes [organ]\'}", "id: nuclear chromosomes [genetic m, distance: 0.22971370816230774, entity: {\'id\': \'nuclear chromosomes [genetic m\'}", "id: Nuclear genome [organelle], distance: 0.23080316185951233, entity: {\'id\': \'Nuclear genome [organelle]\'}"]']


# loading the data 1 by 1

In [36]:
from util.TAIR_embedding_dataloader import go_df, unique_slim_terms
# Read the file, skipping the header comments that start with '!'
tair_df = pd.read_csv('data/TAIR/TAIR_GO_slim_categories.txt', 
                 sep='\t',                    # Tab-separated
                 comment='!',                 # Skip lines starting with !
                 skip_blank_lines=True,       # Skip any blank lines
                 encoding='utf-8',            # Specify encoding
                 names=['ONTOLOGY ASPECT', 'GO TERM', 'GO_ID', 'DEFINITION', 'SLIM_NAME'],  # Exact column names
)
# Clean up any whitespace in the data
tair_df = tair_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

set_go_df = set(unique_slim_terms)
set_tair_df = set(tair_df["GO_ID"].unique())
#go_df = go_df[go_df.index.isin(set_tair_df)]
print("TAIR GO IDs:", len(set_tair_df))
print("GO IDs:", len(set_go_df))

#find the difference between the two sets
difference = set_go_df - set_tair_df
print("Difference:", len(difference))


TAIR GO IDs: 97
GO IDs: 141
Difference: 106


In [37]:
go_df

Unnamed: 0,name,namespace,def,is_obsolete,alt_ids,parents,children,slim_go_id
GO:0000001,mitochondrion inheritance,biological_process,"The distribution of mitochondria, including th...",False,[],"[GO:0000001, GO:0048308, GO:0048311]",[GO:0000001],[{}]
GO:0000002,mitochondrial genome maintenance,biological_process,The maintenance of the structure and integrity...,False,[],"[GO:0000002, GO:0007005]",[GO:0000002],[{GO:0007005}]
GO:0000006,high-affinity zinc transmembrane transporter a...,molecular_function,Enables the transfer of zinc ions (Zn2+) from ...,False,[],"[GO:0000006, GO:0005385]",[GO:0000006],[{GO:0005215}]
GO:0000007,low-affinity zinc ion transmembrane transporte...,molecular_function,Enables the transfer of a solute or solutes fr...,False,[],"[GO:0000007, GO:0005385]",[GO:0000007],[{GO:0005215}]
GO:0000009,"alpha-1,6-mannosyltransferase activity",molecular_function,Catalysis of the transfer of a mannose residue...,False,[],"[GO:0000009, GO:0000030]","[GO:0000009, GO:0033164, GO:0052917]","[{GO:0016740, GO:0003824}]"
...,...,...,...,...,...,...,...,...
GO:1904143,positive regulation of carotenoid biosynthetic...,biological_process,Any process that activates or increases the fr...,False,[],"[GO:1904143, GO:0045828, GO:0046889]",[GO:1904143],[{}]
GO:1904144,phosphatidylinositol phosphate phosphatase com...,cellular_component,A protein complex which is capable of phosphat...,False,[],"[GO:1904144, GO:1903293]","[GO:1904144, GO:1990455]",[{}]
GO:1904145,negative regulation of meiotic cell cycle proc...,biological_process,"Any process that stops, prevents or reduces th...",False,[],"[GO:1904145, GO:0010948, GO:0051447, GO:190019...",[GO:1904145],[{}]
GO:1904146,positive regulation of meiotic cell cycle proc...,biological_process,Any process that activates or increases the fr...,False,[],"[GO:1904146, GO:0051446, GO:0090068, GO:1903538]",[GO:1904146],[{}]


In [63]:
query = classes_df.loc[52,"id"]
query_info = go_df.loc[query]
query_info

name                                               cell adhesion
namespace                                     biological_process
def            The attachment of a cell, either to another ce...
is_obsolete                                                False
alt_ids                                             [GO:0098602]
parents                                 [GO:0007155, GO:0009987]
children       [GO:0007155, GO:0003392, GO:0031589, GO:003362...
slim_go_id                                        [{GO:0007155}]
Name: GO:0007155, dtype: object

In [None]:
query_embedding = classes_df.loc[52,"embedding"]
search_params = {"metric_type": "L2", "params": {"nprobe": 20}}
results = embeddings_collection.search(
    data=[query_embedding],
    anns_field="embedding",
    param=search_params,
    limit=5,
    #output_fields=["id"]
)

# Results
print("Search Results:", results)



Search Results: data: ["['id: Cell-cell adhesion [process], distance: 0.1471722573041916, entity: {}', 'id: cellular adhesion [process], distance: 0.16066783666610718, entity: {}', 'id: cell-adhesion [phenotype], distance: 0.16897818446159363, entity: {}', 'id: cell adhesiveness [process], distance: 0.17458701133728027, entity: {}', 'id: Biological adhesion [process], distance: 0.174787700176239, entity: {}']"]


# Searching

# Working with the database

In [None]:
# Get collection statistics
stats = embeddings_collection.num_entities
print(f"Number of entities in the collection: {stats}")


Number of entities in the collection: 3128221


# Closing

In [83]:

# #close milvus and start over
embeddings_collection.drop()
connections.disconnect(alias="default")


# shutting down 

In [79]:
#remember to flush the collection
embeddings_collection.flush()
connections.disconnect(alias="default")



Then run following command from right folder to shut down docker image:

    bash standalone_embed.sh stop  

# Reopen

    bash standalone_embed.sh start 

In [76]:
#reopen milvus
connections.connect(alias="default", host="localhost", port="19530")
#reload embeddings_collection
embeddings_collection = Collection("embeddings_collection")
embeddings_collection.load()

In [78]:
embeddings_collection.flush()

In [73]:
query_embedding = classes_df.loc[52,"embedding"]
search_params = {"metric_type": "L2", "params": {"nprobe": 20}}
results = embeddings_collection.search(
    data=[query_embedding],
    anns_field="embedding",
    param=search_params,
    limit=5,
    #output_fields=["id"]
)

# Results
print("Search Results:", results)


Search Results: data: ["['id: Cell-cell adhesion [process], distance: 0.1471722573041916, entity: {}', 'id: cellular adhesion [process], distance: 0.16066783666610718, entity: {}', 'id: cell-adhesion [phenotype], distance: 0.16897818446159363, entity: {}', 'id: cell adhesiveness [process], distance: 0.17458701133728027, entity: {}', 'id: Biological adhesion [process], distance: 0.174787700176239, entity: {}']"]


In [70]:
connections.list_connections()

[('default', <pymilvus.client.grpc_handler.GrpcHandler at 0x349ec66c0>)]

In [4]:
for file_df in pd.read_parquet("data/KG/all_merged_node_embeddings_100000.parquet", chunksize=chunk_size):
    # Rename the column from 'node_id' to 'id'
    file_df = file_df.rename(columns={"node_id": "id"})
    break

TypeError: read_table() got an unexpected keyword argument 'chunksize'