In [2]:
from chromadb import PersistentClient #TODO: check if can run on Linux and/or windows --> it can
from sentence_transformers import SentenceTransformer

client = PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection("go_terms_sorted")

model = SentenceTransformer("all-MiniLM-L6-v2")

## Load and embed GO terms

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load GO terms
df = pd.read_csv("data/go_terms.csv")  # columns: GO, Term_Description, Genes

# Sort gene lists alphabetically
def sort_genes(gene_str):
    genes = gene_str.split()
    return " ".join(sorted(genes))

df["Sorted_Genes"] = df["Genes"].apply(sort_genes)

# Format text for embedding (now uses sorted genes)
df["text"] = df.apply(
    lambda row: f"{row['GO']}: {row['Term_Description']} | Genes: {row['Sorted_Genes']}",
    axis=1
)

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed texts
df["embedding"] = df["text"].apply(lambda x: model.encode(x).tolist())

df.head(2)


Unnamed: 0.1,Unnamed: 0,GO,Genes,Gene_Count,Term_Description,Sorted_Genes,text,embedding
0,0,GO:0006311,TEX11 PRDM9 RNF212,3,meiotic gene conversion,PRDM9 RNF212 TEX11,GO:0006311: meiotic gene conversion | Genes: P...,"[-0.06553520262241364, 0.007277282886207104, -..."
1,1,GO:0006855,ABCB11 ABCC4 ABCC3 SLC29A2 SLC37A3 SLC19A1 SLC...,16,xenobiotic transmembrane transport,ABCA3 ABCB11 ABCC2 ABCC3 ABCC4 ABCC5 ATP8B1 AT...,GO:0006855: xenobiotic transmembrane transport...,"[-0.06345071643590927, -0.0534319169819355, -0..."


In [4]:
# Upload only if collection is empty
if collection.count() == 0:
    print("Uploading GO terms to Chroma...")
    collection.add(
        documents=df["text"].tolist(),
        embeddings=df["embedding"].tolist(),
        ids=df["GO"].astype(str).tolist()
    )
else:
    print(f"{collection.count()} items already exist in Chroma.")

Uploading GO terms to Chroma...


In [13]:
# Define gene list (you can later make this dynamic)
gene_list = ['ABCB11', 'ABCC4', 'ABCC3']
# TODO: look into effects by ordering genes prior to search
query = "Tell me everything about" + str(', '.join(gene_list))

# Embed query
query_embedding = model.encode(query).tolist()

# Search Chroma for top-K relevant GO entries
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# Print the retrieved GO context
top_context = "\n".join(results['documents'][0])
print("Top GO context:\n", top_context)

# TODO: LookupError: unknown encoding: Tell me everything about ARCN1

LookupError: unknown encoding: Tell me everything aboutABCB11, ABCC4, ABCC3

In [6]:
import sys
sys.path.append("utils") 

from openai_query import openai_chat
from prompt_factory import make_user_prompt_with_score

In [10]:
# Context and prompt setup
context = f"""You are an efficient and insightful assistant to a molecular biologist.
You should give the true answer that is supported by the references. If you do not have a clear answer, you will respond with "Unknown".

Important context for these genes can be found here:
{top_context}
# """
# 
# prompt = f"""The following GO terms describe gene functions:
# 
# {top_context}
# 
# Given the gene list: TP53, BAX, CASP3, what biological process do they most likely share?
# """
gene_list = ['ARCN1'] # "TP53", "BAX", "CASP3"
prompt = make_user_prompt_with_score(genes=gene_list)

# Query params
model = "gpt-3.5-turbo"
temperature = 0 #TODO: test different temperature parameters (0.1, 0.2, 0.3, 0.5, 0.7, 0.9)
max_tokens = 500
rate_per_token = 0.0005
LOG_FILE = "logs/test_openai_log.json"
DOLLAR_LIMIT = 1.00


In [11]:
response_text, fingerprint = openai_chat(
    context=context,
    prompt=prompt,
    model=model,
    temperature=temperature,
    max_tokens=max_tokens,
    rate_per_token=rate_per_token,
    LOG_FILE=LOG_FILE,
    DOLLAR_LIMIT=40.0,
    seed=42
)

print("GPT Response:\n", response_text)

1423
GPT Response:
 Process: Cerebellar development and interleukin-1-mediated signaling (0.85)

1. ARCN1 is involved in cerebellar cortex and Purkinje cell layer maturation, as indicated by its association with genes like CEND1 and RERE. These processes are crucial for the proper development and function of the cerebellum.

2. Additionally, ARCN1 is part of the interleukin-1-mediated signaling pathway, interacting with various genes such as IL1B, IL1R1, and IL6. This pathway plays a significant role in immune response regulation and inflammation.

In summary, the proteins in this system play a dual role in cerebellar development and interleukin-1-mediated signaling, indicating their involvement in both neurodevelopmental processes and immune response modulation. The confidence score reflects the significant contribution of these proteins to the identified processes.


In [None]:
# TODO: incorporate into existing pipeline
# TODO: test results between RAG system and traditional workflow