In [6]:
# !pip install openai langchain faiss-cpu pandas numpy
# !pip install requests

In [7]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests


base_url = 'https://apps.humanatlas.io/api/v1'

response = requests.get(f'{base_url}/biomarker-tree-model')
data = response.json()
# Load JSON
#with open(biomarker_terms, "r") as file:
    #data = json.load(file)


# Parse JSON to Extract Relevant Data
def parse_nodes(json_data):
    """Extracts relevant fields from the 'nodes' dictionary."""
    nodes = json_data.get("nodes", {})
    parsed_data = []
    for node_id, node_details in nodes.items():
        label = node_details.get("label", "")
        parent = node_details.get("parent", "")
        children = ", ".join(node_details.get("children", []))
        synonyms = ", ".join(node_details.get("synonymLabels", []))
        parsed_data.append({
            "id": node_id,
            "label": label,
            "parent": parent,
            "children": children,
            "synonyms": synonyms
        })
    return pd.DataFrame(parsed_data)

  from tqdm.autonotebook import tqdm, trange


In [8]:
# Flatten
df = parse_nodes(data)

# put into text
df['combined_text'] = df.apply(
    lambda row: f"Label: {row['label']}, Parent: {row['parent']}, "
                f"Children: {row['children']}, Synonyms: {row['synonyms']}",
    axis=1
)

# embeddigs
model = SentenceTransformer('all-MiniLM-L6-v2')
df['embeddings'] = df['combined_text'].apply(lambda text: model.encode(text))
embeddings = np.vstack(df['embeddings'].to_numpy())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Create FAISS Index for Retrieval
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save Combined Text for Retrieval
text_data = df['combined_text'].to_list()

# Load Hugging Face LLM (GPT-Neo)
llm_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
llm_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [24]:
def retrieve_context(question, k=3): #getrelevant info from the json
    """Retrieve the most relevant context for a given question."""
    question_embedding = model.encode(question)
    distances, indices = index.search(np.array([question_embedding]), k)
    return [text_data[i] for i in indices[0]]

#make sure to include the json pulle dinof
def generate_response_with_context(question, context):
    """
    Generate a response using GPT-Neo that explicitly involves the context in the answer.
    """
    # prompt = (
    #     f"The following is some context from a document:\n{context}\n\n"
    #     f"Using the context above, answer the following question:\n{question}\n\n"
    #     f"Answer (use the context as part of the answer):"
    # )
    prompt = (
      f"The following is some context from a document:\n{context}. The document is a hierarchical structure of biomarkers.\n\n"
      f"You are a medical expert specializing in biomarker research. Using the context above, answer the following question:\n{question}\n\n"
      f"Answer (use the context as part of the answer):"
    )
    input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
    output = llm_model.generate(
        input_ids,
        max_length=400,
        num_return_sequences=1,
        temperature=0
        # temperature=0.7
    )
    return llm_tokenizer.decode(output[0], skip_special_tokens=True)

def generate_response_with_context_02(question, context):
    """
    Generate a response using GPT-Neo that explicitly involves the context in the answer.
    """
    prompt = (
      f"The following is some context from a document:\n{context}. The document is a hierarchical structure of biomarkers.\n\n"
      f"You are a medical expert specializing in biomarker research.\n\n"
      f"An example question is, 'What is ABCA1?'\n\n"
      f"Example answer: 'ABCA1 is a gene that is part of the biomarker structure with no children, synonymLabels and can be found at http://identifiers.org/hgnc/29.'\n\n"
      f"Using the context above, answer the following question:\n{question}\n\n"
      f"Answer (use the context as part of the answer):\n\n"
    )
    input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
    output = llm_model.generate(
        input_ids,
        max_length=400,
        num_return_sequences=1,
        temperature=0
        # temperature=0.7
    )
    return llm_tokenizer.decode(output[0], skip_special_tokens=True)


#Qa
def answer_question_with_context(question):
    """
    Answer a question by retrieving context and generating a context-based response.
    """
    retrieved_context = " ".join(retrieve_context(question))
    response = generate_response_with_context(question, retrieved_context)
    return response

def answer_question_with_context_02(question):
    """
    Answer a question by retrieving context and generating a context-based response.
    """
    retrieved_context = " ".join(retrieve_context(question))
    response = generate_response_with_context_02(question, retrieved_context)
    return response

In [25]:
# # Test the Pipeline
# question = "What can you tell me about CCL2?"
# print("prompt 1: ", answer_question_with_context(question))

# print("prompt 2: ", answer_question_with_context_02(question))

In [26]:
# # Test the Pipeline
# question = "What can you tell me about ACKR1?"
# print("prompt 1: ", answer_question_with_context(question))

# print("prompt 2: ", answer_question_with_context_02(question))

In [27]:
# question = "What is the C-C motif chemokine receptor?"
# print("prompt 1: ", answer_question_with_context(question))

# print("prompt 2: ", answer_question_with_context_02(question))

In [28]:
# question = "What is the chemokine CCL2?"
# print("prompt 1: ", answer_question_with_context(question))

# print("prompt 2: ", answer_question_with_context_02(question))

In [40]:
question = "What can you tell me about MECOM?"
print("prompt 1: ", answer_question_with_context(question))

print("prompt 2: ", answer_question_with_context_02(question))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


prompt 1:  The following is some context from a document:
Label: MECOM, Parent: gene, Children: , Synonyms:  Label: IGHM, Parent: protein, Children: , Synonyms: IgM Label: MSLN, Parent: protein, Children: , Synonyms: mesothelin. The document is a hierarchical structure of biomarkers.

You are a medical expert specializing in biomarker research. Using the context above, answer the following question:
What can you tell me about MECOM?

Answer (use the context as part of the answer):

MECOM is a protein that is produced by the lymphocytes of the immune system. It is a member of the immunoglobulin gene superfamily. It is a protein that is produced by the lymphocytes of the immune system. It is a member of the immunoglobulin gene superfamily. It is a protein that is produced by the lymphocytes of the immune system. It is a member of the immunoglobulin gene superfamily. It is a protein that is produced by the lymphocytes of the immune system. It is a member of the immunoglobulin gene superfa

In [42]:
question = "Tell me about MEIS1?"
print("prompt 1: ", answer_question_with_context(question))

print("prompt 2: ", answer_question_with_context_02(question))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


prompt 1:  The following is some context from a document:
Label: MEIS1, Parent: gene, Children: , Synonyms:  Label: MEIS2, Parent: gene, Children: , Synonyms:  Label: AZU1, Parent: gene, Children: , Synonyms: . The document is a hierarchical structure of biomarkers.

You are a medical expert specializing in biomarker research. Using the context above, answer the following question:
Tell me about MEIS1?

Answer (use the context as part of the answer):

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1 is a gene that is expressed in the brain.

MEIS1

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


prompt 2:  The following is some context from a document:
Label: MEIS1, Parent: gene, Children: , Synonyms:  Label: MEIS2, Parent: gene, Children: , Synonyms:  Label: AZU1, Parent: gene, Children: , Synonyms: . The document is a hierarchical structure of biomarkers.

You are a medical expert specializing in biomarker research.

An example question is, 'What is ABCA1?'

Example answer: 'ABCA1 is a gene that is part of the biomarker structure with no children, synonymLabels and can be found at http://identifiers.org/hgnc/29.'

Using the context above, answer the following question:
Tell me about MEIS1?

Answer (use the context as part of the answer):

  * MEIS1 is a gene that is part of the biomarker structure with no children, synonymLabels and can be found at http://identifiers.org/hgnc/29.
  * MEIS1 is a gene that is part of the biomarker structure with no children, synonymLabels and can be found at http://identifiers.org/hgnc/29.
  * MEIS1 is a gene that is part of the biomarker stru