In [9]:
## Imports
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama


In [10]:
## Download the GGUF model
# Hugging Face repository that hosts the quantised BioMistral builds.
model_name = "LoneStriker/BioMistral-7B-DARE-GGUF"
# The specific model file used in this example. It is a 4-bit quant, but other
# levels of quantization are available in the model repo if preferred.
model_file = "BioMistral-7B-DARE-Q4_K_M.gguf"
# Directory the weights are written to. Set the BIOMISTRAL_MODEL_DIR environment
# variable to run the notebook on another machine; when it is unset, the
# original location is used.
model_dir = os.environ.get(
    "BIOMISTRAL_MODEL_DIR",
    "/mnt/extproj/projekte/regulatory_networks/llm_addon/model/",
)
# `model_path` is the local file path returned by the download helper; the
# model-loading cell reads it.
model_path = hf_hub_download(repo_id=model_name, filename=model_file, local_dir=model_dir)


BioMistral-7B-DARE-Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

In [11]:
# GLOBAL VARIABLES
# NOTE(review): `my_model_path` is not read anywhere in this cell; the model is
# loaded from `model_path`, which is set by the download step. Kept in case
# another cell still refers to it.
my_model_path = "./model/BioMistral-7B-DARE-Q4_K_M.gguf"
# Context length (in tokens) handed to the loader as `n_ctx`. This constant is
# the single place to change it.
CONTEXT_SIZE = 2048
# Number of CPU threads to use.
N_THREADS = 16
# Number of model layers to offload to GPU (0 keeps everything on the CPU).
N_GPU_LAYERS = 0


# LOAD THE MODEL
llm = Llama(
    model_path=model_path,
    n_ctx=CONTEXT_SIZE,
    n_threads=N_THREADS,
    n_gpu_layers=N_GPU_LAYERS,
)


llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /mnt/extproj/projekte/regulatory_networks/llm_addon/model/BioMistral-7B-DARE-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - k

In [35]:
## Generation kwargs
generation_kwargs = {
    # A non-positive value places no explicit cap on the number of generated
    # tokens; see the llama-cpp-python completion documentation.
    "max_tokens": -1,
    "stop": ["</s>"],
    "echo": False,  # Echo the prompt in the output
    # top_k=1 is essentially greedy decoding, since the model will always
    # return the highest-probability token. Set this value > 1 for sampling.
    "top_k": 1,
    # With a single candidate token left by top_k=1 the temperature cannot
    # change the choice; it only matters once top_k is raised.
    "temperature": 0.9,
}

## Input gene sets, keyed by the tissue they were measured in
GENE_SETS = {
    "blood": [
        "IFI27", "IFITM1", "IFITM3", "IFIT2", "IFIT1", "MT2A", "IFI6",
        "SIGLEC1", "IFIT3", "RSAD2", "ISG15", "LY6E", "IFI44L", "MX1",
    ],
    "kidney": [
        'NOG', 'LHCGR', 'DIO2', 'NPR1', 'SLC8A1', 'NR2F1', 'EGFL7', 'JARID2',
        'HJV', 'PLOD1', 'SHOX2', 'ADRB3', 'TBX3', 'ATP11B', 'COX6C', 'BMPR1A',
        'APOC3', 'CALR', 'ATF2', 'TPO', 'FGF10', 'NCOR2', 'RBM19', 'LMBR1L',
        'TRMT1', 'FOXH1', 'ELSPBP1', 'TBX18', 'ME3', 'RGMB', 'LEMD3', 'ACOT11',
        'PCP2', 'TBX6', 'TBX20', 'RGMA', 'ANK2', 'BMP5', 'GATA5', 'ZFYVE16',
        'SCGB1A1', 'SMAD9', 'SLC47A1', 'NOS3', 'HIPK1', 'SUMO1', 'DRAP1',
        'BMP2', 'BMPR2', 'CHRDL1', 'TBX1', 'LYL1', 'NKX2-5', 'APOA2', 'GJA5',
        'APOA4', 'SLC5A5', 'HMGA2', 'ACTA2', 'HIPK2', 'DHX9', 'CHRD',
        'MOV10L1', 'ACVRL1', 'PKD1', 'HFE', 'TBX5', 'CCDC89', 'SERPINB3',
        'PRKD3', 'SIRT1', 'PLA2G1B', 'ETV2', 'mir-513a', 'SRA1', 'mir-320a',
        'mir-1', 'mir-214', 'mir-223', 'mir-185', 'mir-125b', 'mir-155',
        'mir-137', 'mir-206', 'mir-4271', 'mir-122', 'mir-647', 'mir-335',
        'mir-543', 'mir-19b', 'mir-92a', 'mir-106a', 'mir-96', 'mir-143',
        'mir-145', 'mir-17', 'mir-18a', 'mir-19a', 'mir-20a', 'mir-21',
        'mir-141', 'mir-181a', 'mir-27a', 'mir-200a', 'mir-200b',
    ],
}
# Which gene set to ask about; must be a key of GENE_SETS.
TISSUE = "blood"

## Run inference
# (tissue, genes) pair, same shape as before so existing uses of geneList keep working.
geneList = (TISSUE, GENE_SETS[TISSUE])

genePrompt = "The following genes are dysregulated in {}: {}. How are these genes connected and which molecular functions are altered?".format(geneList[0], ", ".join(geneList[1]))

# Prompt skeleton; the single `{}` placeholder receives the question.
# NOTE(review): the first sentence refers to "the following pieces of
# information", but the template has no slot for any context besides the
# question itself - confirm whether retrieved context was meant to be inserted.
PROMPT_TEMPLATE = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {}

Do not repeat functions of single genes. Highlight genes for which a clinical trial is running or has been completed.

Only return the helpful answer. Answer must be concise, detailed and well explained.
Helpful answer:
"""
prompt = PROMPT_TEMPLATE.format(genePrompt)

print(prompt)

res = llm(prompt, **generation_kwargs)  # res is a dictionary

## Unpack the generated text from the LLM response dictionary and print it
print(res["choices"][0]["text"])

Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: The following genes are dysregulated in blood: IFI27, IFITM1, IFITM3, IFIT2, IFIT1, MT2A, IFI6, SIGLEC1, IFIT3, RSAD2, ISG15, LY6E, IFI44L, MX1. How are these genes connected and which molecular functions are altered?

Do not repeat functions of single genes. Highlight genes for which a clinical trial is running or has been completed.

Only return the helpful answer. Answer must be concise, detailed and well explained.
Helpful answer:



Llama.generate: prefix-match hit

llama_print_timings:        load time =     548.95 ms
llama_print_timings:      sample time =      49.59 ms /   200 runs   (    0.25 ms per token,  4033.32 tokens per second)
llama_print_timings: prompt eval time =    7196.89 ms /    42 tokens (  171.35 ms per token,     5.84 tokens per second)
llama_print_timings:        eval time =   14513.26 ms /   199 runs   (   72.93 ms per token,    13.71 tokens per second)
llama_print_timings:       total time =   21882.43 ms /   241 tokens


The genes you mentioned are all involved in the innate immune response to viral infection. They encode proteins that are part of the interferon (IFN) signaling pathway, which is a key component of the innate immune system's defense against viruses. The IFN signaling pathway involves the production and secretion of type I and III IFNs by infected cells, which then bind to receptors on neighboring cells and activate downstream signaling cascades that lead to the expression of genes involved in antiviral immunity. The genes you listed encode proteins that are either directly involved in this pathway (e.g., IFITM1, IFITM3, IFIT2, IFIT1, IFI6, ISG15, LY6E, IFI44L, MX1) or indirectly related to it (e.g., MT2A, RSAD2).


In [38]:
## Generation kwargs
generation_kwargs = dict(
    max_tokens=-1,
    stop=["</s>"],
    echo=False,  # Echo the prompt in the output
    top_k=1,  # Essentially greedy decoding: the highest-probability token is always returned. Set > 1 for sampling
    temperature=0.9,
)

## Run inference
# NOTE(review): geneList is assigned here, but the question asked in this cell
# is a fixed string and does not reference it.
geneList = (
    "blood",
    [
        "IFI27", "IFITM1", "IFITM3", "IFIT2", "IFIT1", "MT2A", "IFI6",
        "SIGLEC1", "IFIT3", "RSAD2", "ISG15", "LY6E", "IFI44L", "MX1",
    ],
)

# Free-text question inserted into the prompt below.
genePrompt = "In which diseases does Slfnlnc play a role? What is its mechanism?"

# Full prompt: fixed instructions wrapped around the question.
prompt = f"""Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {genePrompt}

Do not repeat functions of single genes. Highlight genes for which a clinical trial is running or has been completed.

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""

print(prompt)

# The completion call returns a dictionary.
res = llm(prompt, **generation_kwargs)

## Pull the generated text out of the response dictionary and print it
answer_text = res["choices"][0]["text"]
print(answer_text)

Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: In which diseases does Slfnlnc play a role? What is its mechanism?

Do not repeat functions of single genes. Highlight genes for which a clinical trial is running or has been completed.

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:



Llama.generate: prefix-match hit

llama_print_timings:        load time =     548.95 ms
llama_print_timings:      sample time =      70.31 ms /   278 runs   (    0.25 ms per token,  3954.03 tokens per second)
llama_print_timings: prompt eval time =    7571.03 ms /    60 tokens (  126.18 ms per token,     7.92 tokens per second)
llama_print_timings:        eval time =   20410.28 ms /   277 runs   (   73.68 ms per token,    13.57 tokens per second)
llama_print_timings:       total time =   28236.75 ms /   337 tokens


Slfnlnc is a long non-coding RNA (lncRNA) that plays a role in several diseases, including cancer, cardiovascular disease, and neurological disorders. In cancer, Slfnlnc has been shown to promote cell proliferation, migration, and invasion in various types of cancers, such as breast cancer, lung cancer, and colorectal cancer. Its mechanism involves the regulation of several genes involved in these processes, including cyclin D1, matrix metalloproteinase 2 (MMP-2), and vascular endothelial growth factor A (VEGF-A). In cardiovascular disease, Slfnlnc has been shown to regulate the expression of genes involved in atherosclerosis, such as interleukin-6 (IL-6) and tumor necrosis factor alpha (TNF-α), leading to the development of plaques. In neurological disorders, Slfnlnc has been shown to regulate the expression of genes involved in Alzheimer's disease, such as amyloid precursor protein (APP) and presenilin 1 (PSEN1). Its mechanism involves the regulation of several signaling pathways, in