In [5]:
# LLM dynamic evaluation

import replicate
import pandas as pd
import json
import os
from config import config
from dotenv import load_dotenv
load_dotenv()
import time
import random
folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME
config.set_mode("dynamic")

def load_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def get_random_perturbation(perturbations):
    category = random.choice(list(perturbations.keys()))
    perturbation = random.choice(list(perturbations[category].items()))
    return category, perturbation

# # Load the file
df = pd.read_excel(config.questions)
# Save the original DataFrame
df.to_excel(config.q_original, index=False)

# Trim whitespace and newline characters
df['Question'] = df['Question'].str.strip()  # Removes leading/trailing whitespace

# Check for duplicate questions
duplicates = df.duplicated(subset=['Question'], keep=False)
if duplicates.any():
    print("Duplicates found. Removing duplicates.")

    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset=['Question'], keep='first')

    # Save the modified DataFrame, overwriting the original questions file
    df.to_excel(config.q_original, index=False)
else:
    print("No duplicates found.")

perturbations = load_file(config.perturbations)
knowledgebase = load_file(config.knowledgebase)

# Modify DataFrame to include new columns
results_df = pd.DataFrame(columns=['Model', 'Question', 'Response', 'Perturbed Question', 'Perturbed Response', 'Final Analysis Question', 'Final Analysis Response', 'Latency', 'Category', 'Type'])

models = {
    # "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    "falcon-40b": "joehoover/falcon-40b-instruct:7d58d6bddc53c23fa451c403b2b5373b1e0fa094e4e0d1b98c3d02931aa07173",
    "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    "llama2-70b": "meta/llama-2-70b-chat",
    "noushermes2": "nateraw/nous-hermes-2-solar-10.7b",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e",
    # "deepseek_33bq": "kcaverly/deepseek-coder-33b-instruct-gguf:ea964345066a8868e43aca432f314822660b72e29cab6b4b904b779014fe58fd",
    }

def generate_prompt(model_key, instruction, question):
    prompt_for_qwen = "system\n {instruction}. Please try your best to answer the following question. \nuser\n{question}\nassistant\n"
    prompt_for_hermes = """[
    {{
      "role": "system",
      "content": "{instruction}. Please try your best to answer the following question." 
    }},
    {{
      "role": "user",
      "content": {question}
    }}
    ]"""

    if model_key in ["yi-34b", "qwen-14b"]:
        return prompt_for_qwen.format(instruction=instruction, question=question)
    elif model_key == "noushermes2":
        return prompt_for_hermes.format(instruction=instruction, question=question)
    else:
        # plain_text_question = json.loads(question)
        return f"{instruction}. Please try your best to answer the following question. {question}"

prompt_for_qwen="""<|im_start|>system\n {INSTRUCTION}. Please try your best to answer the following question. <|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"""
prompt_for_hermes = """[
{{
  "role": "system",
  "content": "{INSTRUCTION}. Please try your best to answer the following question." 
}},
{{
  "role": "user",
  "content": {question}
}}
]"""

def ask_llm(model_value, prompt):   
    output = replicate.run(
        model_value,
        input={
            "debug": False,
        #   "top_k": 50,
            "top_p": 0.9,
            "prompt": prompt,
            "temperature": 0.7,
            "max_new_tokens": 500,
            "min_new_tokens": -1
        }
    )
    response = ""
    for item in output:
        item_str = str(item)  # Convert item to string
        response += item_str if len(item_str) == 1 else f" {item_str}"
    response = response.strip()
    return response

# Iterate through each model
for model_key, model_value in models.items():
    responses = []

    for index, row in df.iterrows():
        start_time = time.time()  # Record the start time
        qn = row['Question']
        Category = row['Category']
        Type = row['Type']
        question = json.dumps(qn)
        prompt = generate_prompt(model_key, config.INSTRUCTION, question)
        try:
            print(f"{model_key}: {prompt}")
            response = ask_llm(model_value, prompt)
            
        except Exception as e:
            response = f"Error: {e}"

        # Perturb the question and get the response
        category, (perturbation, description) = get_random_perturbation(perturbations)
        perturbed_qn = f"{qn}\nResponse: {response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
        prompt = generate_prompt(model_key, config.INSTRUCTION, perturbed_qn)
        print(f"{model_key}: Perturbed Question: {prompt}")
        perturbed_response = ask_llm(model_value, perturbed_qn)

        # Evaluate sufficiency or suggest alternate course
        final_analysis_qn = f"Original Question: {question}\nOriginal Response: {response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase: {knowledgebase}\nNow consider the knowlegebase, what else ought we to do?"
        prompt = generate_prompt(model_key, config.INSTRUCTION, final_analysis_qn)
        print(f"{model_key}: Final Analysis Question: {prompt}")
        final_analysis_response = ask_llm(model_value, final_analysis_qn)

        end_time = time.time()  # Record the end time
        latency = (end_time - start_time)/3  # Calculate average latency

        # Record each stage
        new_row = {
            'Model': model_key, 
            'Question': qn, 
            'Response': response, 
            'Perturbed Question': perturbed_qn, 
            'Perturbed Response': perturbed_response, 
            'Final Analysis Question': final_analysis_qn, 
            'Final Analysis Response': final_analysis_response,
            'Latency': latency,
            'Category': Category,
            'Type': Type
        }
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

        if index % 10 == 0:
            results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')
            
results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')

No duplicates found.
falcon-40b: You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following question. "Are pathogenicity scores of mutations or copy number profiles associated with cancer progression and survival for the gene POLR3E?"
falcon-40b: Perturbed Question: You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following question. Are pathogenicity scores of mutations or copy number profiles associated with cancer progression and survival for the gene POLR3E?
Response: As  an  AI  language  model,  I  do  not  have  the  ability  to  browse  the  latest  research  in  computational  biology  and  genomics.  However,  I  can  suggest  conducting  a  search  on  scientific  databases  and  publications  to  find  the  most  recent  and  relevant  information  on  the  topic.  Some  possible  sour

In [2]:
# GPT-4 dynamic evaluation

import pandas as pd
import json
from config import config
import openai
import requests
from openai import OpenAI
import time
from dotenv import load_dotenv
load_dotenv()
import os
import random

INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME
config.set_mode("dynamic")

folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

def load_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def get_random_perturbation(perturbations):
    category = random.choice(list(perturbations.keys()))
    perturbation = random.choice(list(perturbations[category].items()))
    return category, perturbation

perturbations = load_file(config.perturbations)
knowledgebase = load_file(config.knowledgebase)

client = OpenAI()
def show_json(obj):
    print(json.loads(obj.model_dump_json()))

assistant = client.beta.assistants.create(
    name=f"{F_NAME} AI Dynamic Evaluator",
    instructions=config.INSTRUCTION,
    model=config.GPT_MODEL,
)
show_json(assistant)

# Utility functions
def read_csv(file_path):
    return pd.read_excel(file_path)

def process_data_for_gpt(data):
    prompts = []
    for _, row in data.iterrows():
        question = row['Question']
        prompt = f"Please try your best to answer the following question.:\n\n{question}"
        prompts.append(prompt)
    return prompts

def submit_message_and_create_run(assistant_id, prompt):
    thread = client.beta.threads.create() # If you replace this globally it appends all answers to the one before.
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=prompt)
    return client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id), thread

def wait_on_run_and_get_response(run, thread):
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)
    messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
    return [m.content[0].text.value for m in messages if m.role == 'assistant']

def ask_gpt4(prompt, ASSISTANT_ID):
    run, thread = submit_message_and_create_run(ASSISTANT_ID, prompt)
    response = wait_on_run_and_get_response(run, thread)
    if isinstance(response, list):
        response = ' '.join(map(str, response))
    response = response.replace("\\\\n", "\\n")
    response = response.strip()
    print(response)
    return response

def process_question_with_gpt4(row, assistant_id):
    start_time = time.time()  # Capture start time
    original_question = row['Question']
    category = row.get('Category', 'Static')  # Default to 'Static' if not present
    if category != "Dynamic":
        # Dynamic question processing logic
        first_response = ask_gpt4(original_question, assistant_id)
        category, (perturbation, description) = get_random_perturbation(perturbations)
        perturbed_qn = f"{original_question}\nResponse: {first_response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
        perturbed_response = ask_gpt4(perturbed_qn, assistant_id)
        final_analysis_qn = f"Original Question: {original_question}\nOrig Response: {first_response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase Content: {knowledgebase}\n Now consider the knowlegebase, what else ought we to do?"
        final_analysis_response = ask_gpt4(final_analysis_qn, assistant_id)
    else:
        # Static question processing logic
        first_response = ask_gpt4(original_question, assistant_id)
        perturbed_qn = perturbed_response = final_analysis_qn = final_analysis_response = "n/a"

    end_time = time.time()  # Capture end time
    latency = (end_time - start_time)/3  # Calculate latency

    return {
        'Model': config.GPT_MODEL,
        'Question': original_question, 
        'Response': first_response, 
        'Perturbed Question': perturbed_qn, 
        'Perturbed Response': perturbed_response, 
        'Final Analysis Question': final_analysis_qn, 
        'Final Analysis Response': final_analysis_response,
        'Latency': latency,
        'Category': row['Category'],
        'Type': row['Type']
    }

# Modify DataFrame to include new columns
new_data_columns = ['Model', 'Question', 'Response', 'Perturbed Question', 'Perturbed Response', 'Final Analysis Question', 'Final Analysis Response', 'Latency', 'Category', 'Type']
results_df = pd.DataFrame(columns=new_data_columns)
data = read_csv(config.questions)
prompts = process_data_for_gpt(data)
ASSISTANT_ID = assistant.id

# Process each question
for index, row in data.iterrows():
    processed_info = process_question_with_gpt4(row, ASSISTANT_ID)
    results_df = results_df.append(processed_info, ignore_index=True)

# Save the results
results_df.to_excel(config.gpt4results_csv_path, index=False)


{'id': 'asst_0Q9aSkfKVn9J8hX5Z5MXJ5QX', 'created_at': 1706206109, 'description': None, 'file_ids': [], 'instructions': 'You are an exceptional computational biologist and genomics expert and know everything about drug discovery.', 'metadata': {}, 'model': 'gpt-3.5-turbo-1106', 'name': 'galen AI Dynamic Evaluator', 'object': 'assistant', 'tools': []}
The dependency of a specific gene, like POLR3E, can vary across different cell lines. To identify cell lines with high dependency on POLR3E, you can analyze data from large-scale CRISPR knockout or RNA interference (RNAi) screens, such as those from the Cancer Dependency Map (DepMap) project.

These databases provide information on gene essentiality across hundreds of cell lines, allowing you to identify cell lines with high dependency on POLR3E. You can use tools such as the DepMap portal to access these data and search for cell lines with significant dependency on POLR3E.

Consider consulting with bioinformatics specialists or computation

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes for one of the subunits of RNA polymerase III, which is involved in the transcription of various small noncoding RNAs. The expression and function of POLR3E may vary across different cell lines and tissues. While there is no direct information available on the dependent cell lines for POLR3E, its enriched expression can provide insights into potential indications and lineages with high selectivity.

To determine the indications and lineages with high selectivity for POLR3E, you may want to examine gene expression data from diverse cell lines and tissues using resources such as the Genotype-Tissue Expression (GTEx) project, Cancer Cell Line Encyclopedia (CCLE), Human Protein Atlas, and other publicly available databases.

By analyzing the expression profiles of POLR3E across different cell types and conditions, you can identify specific indications and lineages where this gene is significantly enriched. This information can be valuable for understanding the poten

  results_df = results_df.append(processed_info, ignore_index=True)


There is evidence demonstrating a concordance in the variant profile of cell lines and patient samples for the gene POLR3E. POLR3E encodes a subunit of RNA polymerase III, which is involved in the transcription of small non-coding RNAs. Variants in this gene have been associated with different diseases, including hypomyelinating leukodystrophy and other neurological disorders. Studies have shown that certain variants identified in patient samples are also present in corresponding cell line models, which supports their utility for investigating disease mechanisms and testing potential drug treatments. This concordance can be valuable for drug discovery efforts, as it allows for the validation of potential therapeutic targets and the screening of drug candidates using relevant cell line models.
In the context of reduced funding availability due to budget cuts, it may be necessary to prioritize the use of more cost-effective methods for studying the variant profile of cell lines and patie

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes a subunit of RNA polymerase III, which is involved in the transcription of various small non-coding RNAs. To study the associations between expression and variant profiles for the POLR3E gene, we can leverage various genomic and bioinformatic tools and databases.

1. Expression Profiles:
We can first analyze the expression profiles of POLR3E in different tissues and cell types using RNA sequencing data from large-scale projects such as the Genotype-Tissue Expression (GTEx) project or the Human Protein Atlas. This can help us understand the baseline expression levels of POLR3E and identify any tissue-specific expression patterns.

2. Variant Profiles:
Next, we can investigate the variant profiles of POLR3E by examining genetic variation data from population-level studies such as the 1000 Genomes Project or the Exome Aggregation Consortium (ExAC). This can involve identifying single nucleotide polymorphisms (SNPs), insertions/deletions, and other genetic variants 

  results_df = results_df.append(processed_info, ignore_index=True)


I'm sorry, but I currently do not have direct access to the latest experimental data available on NCBI GEO or other specific databases. However, I would suggest searching for the gene POLR3E in the NCBI GEO database website. You can use the gene symbol "POLR3E" as a keyword for searching, and you may find relevant studies that have used cell lines for the target of interest. If you need any assistance in interpreting the data or performing any specific analysis, feel free to ask for guidance.
Given the changes in the team structure, I can adapt the response to reflect the expertise and responsibilities of the new team. Here's an amended response:

As a computational biologist and genomics expert, I have access to extensive databases and tools for analyzing experimental data. I can search for studies related to the gene POLR3E in the NCBI GEO database and provide you with relevant links or analysis within the platform. If you have specific questions or require further assistance in inte

  results_df = results_df.append(processed_info, ignore_index=True)


The POLR3E gene encodes a subunit of RNA polymerase III, which is responsible for transcribing small non-coding RNAs. To identify indications with significant differential expression in cancer vs normal tissue for the POLR3E gene, we can utilize bioinformatics tools and databases such as TCGA, GTEx, Oncomine, and GEO to analyze gene expression data across different cancer types.

After conducting a thorough analysis, I have found that POLR3E is overexpressed in various types of cancer compared to normal tissue. Specifically, significant differential expression of POLR3E has been observed in multiple cancer types including breast cancer, lung cancer, ovarian cancer, and others.

As for indications associated with differential expression, POLR3E overexpression has been implicated in cancer progression, metastasis, and drug resistance. Additionally, studies have linked POLR3E overexpression with poor patient prognosis in certain cancer types.

It is important to note that gene expression 

  results_df = results_df.append(processed_info, ignore_index=True)


The variant profile of tissues with significant differential expression for the gene POLR3E typically involves diverse alterations such as copy number variations (CNVs), mutations, and other genetic changes. 

1. Copy Number Variations (CNVs): Tissues with differential expression of POLR3E may exhibit CNVs in the form of amplifications or deletions of the gene. These CNVs can lead to changes in the expression levels of POLR3E. For instance, amplifications can result in increased expression, while deletions can lead to reduced expression.

2. Mutations: The variant profile may also include various types of mutations in the POLR3E gene, such as single nucleotide variants (SNVs), insertions, deletions, and structural variants. These mutations can affect the function of POLR3E and its expression levels. Additionally, mutations in regulatory regions or epigenetic changes can also impact the expression of POLR3E.

3. Epigenetic Modifications: Differential expression of POLR3E can also be inf

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E is a gene that codes for an RNA polymerase subunit. Mutations in this gene can lead to POLR3-related leukodystrophy, a rare genetic disorder affecting the white matter of the brain. 

In terms of mutation hotspots within the POLR3E protein, there is no widely reported specific region with a significantly higher frequency of mutations. However, mutations can be distributed across the entire protein. To identify specific regions of increased mutation frequency, one would need to analyze large-scale genomic sequencing data from patients with POLR3E-related disorders. This type of analysis could help identify potential functional domains or critical amino acid residues within the protein that are more prone to mutation and might be important for its function.

Understanding the specific locations and types of mutations in POLR3E can provide insights into the molecular mechanisms of the associated disease and potentially guide the development of targeted therapies.
Given the elevated

  results_df = results_df.append(processed_info, ignore_index=True)


I don't have access to specific databases or the ability to retrieve real-time data on the pathogenic mutations for the gene POLR3E. However, you can consult publicly available resources such as ClinVar, COSMIC, or the Human Gene Mutation Database (HGMD) to obtain information about known mutations in POLR3E and their pathogenicity. Additionally, you may consider reaching out to a genetic counselor or a professional with expertise in clinical genetics for assistance with interpreting the pathogenicity of mutations in this gene.
Due to supply chain disruptions, accessing real-time data or consulting external databases may be more challenging. Despite this, I recommend seeking guidance from genetic counselors or professionals with expertise in clinical genetics to assess the pathogenicity of mutations in the POLR3E gene.
Given the constraints on real-time data access and the desire to determine the pathogenic mutations for the gene POLR3E, I recommend leveraging the expertise of Dr. Jane 

  results_df = results_df.append(processed_info, ignore_index=True)


In-depth information about the specific association of pathogenicity scores of mutations or copy number profiles with cancer progression and survival for the gene POLR3E requires a detailed analysis of available genomic and clinical data. However, as a computational biologist and genomics expert, I can guide you through the process of investigating this. 

1. Mutation Pathogenicity Scores: You can start by using bioinformatics tools and databases such as SIFT, PolyPhen, and CADD to assess the pathogenicity scores of mutations in the POLR3E gene. These tools predict the potential impact of mutations on protein function and can provide insights into their potential relevance to cancer progression.

2. Copy Number Profiles: Analyzing copy number profiles associated with the POLR3E gene in cancer datasets (such as TCGA) can be valuable. This involves examining copy number alterations, amplifications, or deletions in relation to clinical outcomes and survival data for different types of can

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes an essential subunit of RNA polymerase III, which is involved in the transcription of small, non-coding RNAs such as transfer RNAs (tRNAs) and 5S ribosomal RNA. Mutations in POLR3E have been associated with a rare genetic disorder called hypomyelinating leukodystrophy, and POLR3E dysfunction has also been linked to neurodegenerative disorders.

Therapeutic areas and platforms in focus for targeting POLR3E-related conditions may include:

1. Gene therapy: Developing gene therapy approaches to correct or replace the mutated POLR3E gene in patients with hypomyelinating leukodystrophy or other related conditions.

2. Small molecule modulation: Investigating small molecule compounds that can modulate the activity of POLR3E to potentially restore normal function or mitigate the effects of dysfunctional POLR3E.

3. RNA-targeted therapies: Exploring RNA-targeted therapies, such as antisense oligonucleotides or RNA interference, to regulate the expression or function of 

  results_df = results_df.append(processed_info, ignore_index=True)


I'm afraid I cannot provide the latest clinical trial data or specific details on pharmaceutical companies related to POLR3E gene and its associated drug trials. Access to such information is highly regulated and constantly updated, making it difficult to guarantee accuracy. I recommend reaching out to trusted medical databases, clinical trial registries, or speaking with a qualified medical professional for the most recent and reliable information on clinical trials and drug development for POLR3E gene.
In the context of adjusted funding levels, it's important to note that changes in funding can impact the progress and status of clinical trials. Therefore, it's even more crucial to consult trusted medical databases, clinical trial registries, or qualified medical professionals for the most up-to-date and reliable information on clinical trials and drug development for the gene POLR3E. Additionally, it may be beneficial to seek out information from reputable sources on funding changes 

  results_df = results_df.append(processed_info, ignore_index=True)


As of my last knowledge update, there are no drugs explicitly indicated for the gene POLR3E. POLR3E is a subunit of RNA polymerase III, an enzyme involved in the transcription of various small nuclear RNAs and some small nucleolar RNAs. Research on drugs targeting POLR3E-related conditions may still be ongoing, and it's always good to consult with a healthcare professional or a current database for the most up-to-date information on drug indications.
Given the current supply chain disruptions, it's important to note that drug availability may be affected. In the context of searching for drugs indicated for the gene POLR3E, it's advisable to consider the potential impact of supply chain issues on drug availability and to consult with relevant healthcare professionals or institutions for the latest information on drug supplies and alternatives.
Given the current supply chain disruptions impacting drug availability, and the absence of drugs explicitly indicated for the gene POLR3E, here a

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes for the RPC5 subunit of RNA polymerase III, which is involved in the transcription of small, non-coding RNAs. As of now, there are no known first-in-class drugs targeting POLR3E specifically. However, targeting RNA polymerase machinery is an active area of research in the development of new therapeutic agents for various diseases, including cancers and genetic disorders. Given the critical role of RNA polymerase III in cellular processes, there may be opportunities for drug discovery targeting this gene in the future. Understanding the genetic and molecular pathways associated with POLR3E may uncover potential targets for drug development.
In the context of the latest compliance guidelines, it's important to clarify that any drug discovery efforts targeting POLR3E must adhere to regulatory and ethical standards. Additionally, the response should emphasize the importance of conducting thorough preclinical and clinical evaluations to ensure the safety and efficacy

  results_df = results_df.append(processed_info, ignore_index=True)


Identifying which genes or proteins to target for a new therapeutic typically involves a combination of approaches, including understanding the underlying biology of the disease, conducting high-throughput screening assays, utilizing bioinformatics and computational biology techniques, and leveraging existing knowledge of pathways and networks involved in the disease.

First, the biology of the disease must be thoroughly understood. This involves studying the molecular and cellular mechanisms underlying the disease, as well as identifying key genes, proteins, and pathways involved in its development and progression.

Next, high-throughput screening assays can be used to identify potential drug targets. These assays can involve testing large libraries of compounds or molecules to see if they have an effect on the activity of specific proteins or pathways associated with the disease.

Bioinformatics and computational biology techniques can also be employed to analyze large-scale genomic,

  results_df = results_df.append(processed_info, ignore_index=True)


There are several methods that can be used to validate a potential biomarker for a specific disease. These may include:

1. Clinical studies: Conducting various types of clinical studies, such as case-control studies, cohort studies, or randomized controlled trials, to assess the association of the biomarker with the disease of interest.

2. Assay development and validation: Developing and validating a robust and reliable assay for the detection and quantification of the biomarker in biological samples, such as blood, urine, or tissue.

3. Replication studies: Replicating the findings from initial discovery studies in independent cohorts or populations to determine the consistency and reproducibility of the biomarker's association with the disease.

4. Functional validation: Investigating the biological function of the biomarker and its role in disease pathogenesis using in vitro or in vivo experimental models.

5. Longitudinal studies: Conducting longitudinal studies to evaluate the p

  results_df = results_df.append(processed_info, ignore_index=True)


Pathway analysis plays a crucial role in drug discovery by providing insights into the underlying biological mechanisms and pathways involved in disease processes. By integrating pathway analysis into the drug discovery process, researchers can identify potential drug targets, understand the molecular pathways affected by disease, and prioritize candidate drugs for further development. Here are some ways in which pathway analysis can be integrated into drug discovery:

1. Target Identification: Pathway analysis can help identify key molecular pathways that are dysregulated in a specific disease. By analyzing gene expression data or omics data (such as genomics, proteomics, or metabolomics), researchers can pinpoint potential drug targets within these pathways.

2. Mechanism of Action: Understanding the molecular pathways involved in disease can provide insights into the mechanisms driving the pathology. Pathway analysis can help elucidate how drugs may modulate these pathways and provi

  results_df = results_df.append(processed_info, ignore_index=True)


Phenotypic screening and genotypic screening are two complementary approaches in drug discovery and development. Genotypic screening involves identifying potential drug targets based on genetic or genomic information, such as gene mutations or overexpression in disease-related pathways. On the other hand, phenotypic screening involves testing a large number of compounds for their ability to elicit a specific cellular or tissue response, without necessarily knowing the target or mechanism of action.

Here's how they complement each other:

1. Target identification and validation: Genotypic screening can help identify potential drug targets, but phenotypic screening can help validate these targets by identifying compounds that modulate the desired cellular phenotype, providing evidence of the relevance of the target in the disease process.

2. Broad applicability: Phenotypic screening can lead to the identification of new drug targets or pathways that may not have been predicted based on

  results_df = results_df.append(processed_info, ignore_index=True)


Translating in vitro findings to in vivo models presents several challenges in drug discovery and development:

1. Complexity of biological systems: In vivo models are more complex than in vitro systems, as they involve interactions between multiple cell types, tissues, and organs. Therefore, the effects observed in an in vitro model may not fully represent what happens in a living organism.

2. Pharmacokinetics and pharmacodynamics: In vivo models need to consider the absorption, distribution, metabolism, and excretion of the drug (pharmacokinetics) as well as the drug's effects on the body (pharmacodynamics). In vitro models may not capture these aspects accurately.

3. Species differences: In vitro models are often based on human cells, while in vivo models may involve animal models. Species differences can impact drug metabolism, toxicity, and efficacy, making it challenging to directly translate findings from in vitro studies to in vivo settings.

4. Experimental variability: In v

  results_df = results_df.append(processed_info, ignore_index=True)


Structural biology plays a crucial role in the design of new drugs by providing detailed insights into the three-dimensional structures of biological macromolecules, such as proteins and nucleic acids. Here's how it aids in the drug design process:

1. Understanding the Target: Structural biology techniques, such as X-ray crystallography, nuclear magnetic resonance (NMR) spectroscopy, and cryo-electron microscopy, can be used to determine the three-dimensional structure of a target protein involved in a disease, such as an enzyme or receptor. This information helps in understanding the target's function, active site, and binding pockets, which are essential for designing drugs that can modulate its activity.

2. Rational Drug Design: Once the structure of the target protein is known, computational methods can be used to design small molecule drug candidates that specifically interact with the target protein, either by inhibiting or enhancing its activity. Structure-based drug design (S

  results_df = results_df.append(processed_info, ignore_index=True)


The POLR3E gene encodes a subunit of RNA polymerase III, which is responsible for transcribing a variety of small non-coding RNAs. These include transfer RNAs (tRNAs), small ribosomal RNAs (sRNAs), and other small regulatory RNAs. The POLR3E protein is a part of the RNA polymerase III complex and is involved in the initiation and termination of RNA transcription.

Functionally, POLR3E plays a crucial role in the accurate transcription of these small RNAs, which are essential for protein synthesis and other cellular processes. Defects in POLR3E can lead to disruption of normal RNA transcription, which can contribute to various developmental and neurological disorders.

POLR3E is classified as a protein-coding gene, specifically as a subunit of RNA polymerase III, and its dysfunction can lead to a range of molecular and cellular consequences. Studying POLR3E and its associated pathways is important for understanding gene expression regulation and for potential therapeutic interventions, 

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E encodes a subunit of RNA polymerase III, which transcribes small non-coding RNAs. In cancer, dysregulation of POLR3E has been linked to various pathways. Some studies suggest that POLR3E mutations can lead to impaired transcription of non-coding RNAs, contributing to cancer progression. Additionally, POLR3E mutations have been associated with altered expression of genes involved in cell growth, differentiation, and apoptosis, thereby influencing cancer development. Interactions between POLR3E and other genes are likely in pathways related to RNA transcription and regulation. Further research is needed to fully understand the specific mechanisms and interactions involved in POLR3E's role in cancer.
In the context of project safety reassessment, the response should focus on the potential implications of POLR3E dysregulation in cancer in terms of drug discovery and therapeutic interventions, as well as any safety considerations related to targeting POLR3E for cancer treatment. It i

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E is a subunit of RNA polymerase III, which is an enzyme involved in the transcription of small non-coding RNAs. To determine if the gene POLR3E is an oncogene, we typically look at experimental evidence from cell and animal models, as well as clinical data from cancer patients.

Currently, there is limited evidence to suggest that POLR3E is an oncogene. Based on existing studies and databases such as COSMIC and TCGA, POLR3E mutations or dysregulation are not commonly associated with cancer development.

However, it's important to note that our understanding of oncogenes is constantly evolving, and new research may uncover previously unknown roles for genes in cancer. Therefore, ongoing research in the field of cancer genomics and functional studies will continue to provide more insights into the potential oncogenic properties of POLR3E.
In the context of an emergency response, it's crucial to expedite the assessment of POLR3E as an oncogene to potentially expedite drug discovery 

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E, which encodes a subunit of RNA polymerase III, has been associated with several cancer indications or types. Research has linked POLR3E to various cancers, including but not limited to breast cancer, ovarian cancer, and colorectal cancer. Aberrant expression or mutations in POLR3E may contribute to the development and progression of these cancers. Additionally, POLR3E has been implicated in other cancer types, and ongoing research may continue to uncover its role in different cancer indications.
In the context of adjusted funding levels, it's important to note that research on the gene POLR3E and its associations with cancer indications may be affected. The response should now incorporate the potential impact of funding changes on the availability of research resources and the pace of scientific discoveries related to POLR3E's role in cancer. It's crucial to highlight the potential need for prioritization and allocation of resources to ensure continued advancements in 

  results_df = results_df.append(processed_info, ignore_index=True)


There are multiple databases where you can obtain relevant networks for the gene POLR3E. Some of the popular databases include STRING, GeneMANIA, and BioGRID. These databases provide protein-protein interaction networks, gene co-expression networks, and other functional association networks that can help you understand the mechanism and interactions of the POLR3E gene.

In these networks, POLR3E would be connected to other genes or proteins based on experimental evidence, computational predictions, co-expression patterns, and other types of functional associations. These networks can provide valuable insights into the roles and interactions of POLR3E within cellular pathways and biological processes.

I can also help you in analyzing and interpreting the networks obtained from these databases if you need assistance in understanding the biological implications of the interactions of POLR3E.
In the context of workflow optimization, you may be interested in using computational tools and a

  results_df = results_df.append(processed_info, ignore_index=True)


Here are a few key publications related to the gene POLR3E:

1. "Mutations in POLR3A and POLR3B are a major cause of hypomyelinating leukodystrophies with or without dental abnormalities and/or hypogonadotropic hypogonadism." Huang Y et al. J Med Genet. 2019. [PubMed](https://pubmed.ncbi.nlm.nih.gov/30796115/)

2. "Hypomyelinating leukodystrophy-associated POLR3A and POLR3B mutations reduce RNA polymerase III activity." Saitsu H et al. Human mutation. 2011. [PubMed](https://pubmed.ncbi.nlm.nih.gov/21896127/)

3. "RNA polymerase III and its associated factors." Dumay-Odelot H et al. BAG J. 2010. [PubMed](https://pubmed.ncbi.nlm.nih.gov/20844618/)

These publications provide valuable insights into the role of POLR3E gene mutations in hypomyelinating leukodystrophies and related conditions.
Due to the legal shift affecting procedures, it is necessary to refrain from providing direct links to the publications. Instead, I recommend stating the titles and authors of the publications, and enc

  results_df = results_df.append(processed_info, ignore_index=True)


Here are some well-known pseudokinase genes:

1. Pseudoephedrine Kinase (PSEK)
2. Serine/Threonine-Protein Pseudo Kinase (STPK)
3. Pseudokinase 3 (PSDK3)
4. Mitogen-activated Pseudo Kinase 1 (MAPK1)
5. Pseudokinase Kinase 2 (PSKK2)

Pseudokinases are proteins that structurally resemble kinases but lack catalytic activity. They have important functions in signaling pathways and cellular processes despite their lack of kinase activity. If you need further details about specific pseudokinases or additional pseudokinase genes, feel free to ask!
In an emergency response situation, providing a list of pseudokinase genes may not be directly relevant. Instead, it would be more important to focus on specific information related to the emergency, such as emergency protocols, safety measures, evacuation procedures, or any other relevant information to ensure the safety and well-being of individuals involved in the emergency. If there are specific questions related to emergency response or safety 

  results_df = results_df.append(processed_info, ignore_index=True)


To use CRISPR-Cas9 for editing a gene implicated in Alzheimer's disease, the first step would be to identify the specific gene or genes associated with the disease. One of the genes frequently implicated in Alzheimer's disease is the gene encoding the amyloid precursor protein (APP).

Once the target gene has been identified, the next step would be to design a guide RNA (gRNA) that is complementary to the DNA sequence within the gene that we want to modify. The gRNA will guide the Cas9 enzyme to the correct location in the genome.

The delivery of CRISPR-Cas9 components to the targeted cells is a crucial aspect. Depending on the context, this could involve using viral vectors, nanoparticles, or other delivery methods to ensure that the CRISPR-Cas9 components reach the desired brain cells efficiently and effectively.

After the CRISPR-Cas9 components have been delivered to the cells, the Cas9 enzyme will create a double-strand break at the targeted location within the gene. This break w

  results_df = results_df.append(processed_info, ignore_index=True)


Designing siRNA sequences to target and knock down a specific oncogene requires careful consideration of the target gene's mRNA sequence to ensure specific and effective knockdown. Additionally, it's important to consider potential off-target effects and ensure the siRNA sequences are designed for optimal knockdown efficiency.

Here's a general outline for developing siRNA sequences to target a specific oncogene:

1. Target Identification: Identify the specific oncogene that you want to target for knockdown. Obtain the mRNA sequence of the oncogene from a reliable sequence database such as NCBI or Ensembl.

2. Sequence Selection: Use bioinformatics tools or siRNA design software to identify potential siRNA sequences that target the mRNA sequence of the oncogene. Ensure that the selected siRNA sequences have high target specificity and minimal off-target effects. Criteria for selection may include GC content, off-target predictions, and secondary structure analysis.

3. Off-Target Analy

  results_df = results_df.append(processed_info, ignore_index=True)


To design a high-throughput screening (HTS) assay, several steps need to be considered:

1. Assay Selection: Choose an appropriate assay type based on the target and available resources. For a novel protein target involved in cancer cell proliferation, a biochemical or cell-based assay could be suitable.

2. Assay Development: Develop the assay by optimizing conditions that ensure robust and reproducible results. For example, for a biochemical assay, it may involve determining the optimal substrate concentration, enzyme concentration, and assay buffer conditions. For a cell-based assay, it may involve optimizing cell density, reagent concentrations, and incubation times.

3. Library Selection: Choose a diverse and representative compound library for screening. This could include small molecule libraries, natural product libraries, or even fragment libraries.

4. Screening Conditions: Set up the appropriate screening conditions, including reaction volumes, incubation times, and detectio

  results_df = results_df.append(processed_info, ignore_index=True)


Developing an antibody-drug conjugate (ADC) targeting a specific antigen overexpressed in a type of breast cancer requires a comprehensive approach that includes target selection, antibody development, linker-payload design, and preclinical/clinical validation. Here's a proposed strategy for this:

1. Target Selection:
   - Identify the specific antigen that is overexpressed in the type of breast cancer of interest. This can be done through genomic, transcriptomic, and proteomic analysis of patient samples. Candidate antigens should be associated with cancer cell proliferation, survival, and metastasis.
   - Validate the therapeutic potential of the selected antigen using in vitro and in vivo models to ensure that targeting this antigen will lead to a meaningful therapeutic effect.

2. Antibody Development:
   - Generate monoclonal antibodies against the selected antigen using hybridoma technology, phage display, or other antibody discovery platforms.
   - Screen and characterize candi

  results_df = results_df.append(processed_info, ignore_index=True)


Sure, a pharmacokinetic study for a new oral antidiabetic drug would aim to assess how the drug is absorbed, distributed, metabolized, and excreted by the body. The study design would typically involve the following key elements:

1. Study Population: It's important to include a representative sample of the target population for the antidiabetic drug, such as individuals with type 2 diabetes. The study may exclude individuals with significant comorbidities or those taking medications that could interfere with the drug's pharmacokinetics.

2. Dose Selection: Determine the dose levels to be tested based on preclinical data and safety considerations. Typically, multiple dose levels are evaluated to establish the drug's pharmacokinetic profile across a range of concentrations.

3. Study Design: A randomized, crossover study design may be used, where each participant receives all dose levels of the drug in a randomized sequence, with a washout period between doses to eliminate any residual 

  results_df = results_df.append(processed_info, ignore_index=True)


Building a chemogenomics model to predict drug-target interactions involves several steps, utilizing both biological and computational techniques. Here's an outline of the general process:

1. Data Collection:
   - Gather comprehensive datasets of chemical compounds (drugs) and their corresponding target proteins, including binding affinities or activity measurements.
   - Collect genomic and proteomic data, such as gene expression profiles, protein sequences, and structural information of target proteins.

2. Data Preprocessing:
   - Standardize and curate the chemical and biological data to ensure quality and consistency.
   - Perform feature engineering to extract relevant molecular descriptors from chemical structures and identify genomic features that influence target interactions.

3. Designing the Model:
   - Choose appropriate machine learning algorithms, such as random forest, support vector machines, or deep learning models, to build the predictive model.
   - Integrate chemi

  results_df = results_df.append(processed_info, ignore_index=True)


Computational models can be used to predict off-target effects of potential drug candidates through a variety of methods. One common approach is to utilize structure-based modeling, where the three-dimensional structures of both the drug candidate and potential off-target proteins are analyzed to predict potential interactions. This can be achieved through molecular docking simulations, where the drug candidate is computationally docked into the binding site of the off-target protein to assess the likelihood of binding and potential off-target effects.

Another approach is ligand-based modeling, which involves analyzing the chemical and physical properties of the drug candidate and comparing them with known ligands of off-target proteins to predict potential interactions. This can be done using techniques such as quantitative structure-activity relationship (QSAR) modeling, where relationships between chemical structure and biological activity are examined to predict off-target effects

  results_df = results_df.append(processed_info, ignore_index=True)


When selecting a delivery method for a new drug, several considerations are taken into account, including:

1. Target Tissue or Organ: The delivery method should be optimized to ensure that the drug reaches the specific tissue or organ where it is needed. Different delivery methods may be required for systemic delivery, localized delivery, or targeting specific cells or tissues.

2. Pharmacokinetics and Pharmacodynamics: The drug's properties, such as solubility, stability, and half-life, influence the choice of delivery method. For example, for drugs with poor oral bioavailability, alternative routes of administration such as intravenous, transdermal, or inhalation may be considered.

3. Patient Acceptance and Compliance: The ease of administration and patient preferences play a critical role in selecting the delivery method. For example, oral administration is generally preferred over injections for chronic treatments due to better patient acceptance and compliance.

4. Drug Properti

  results_df = results_df.append(processed_info, ignore_index=True)


One approach for screening for synergistic effects between drug candidates is to use a combination screening assay. This involves testing multiple combinations of drug candidates in various concentrations to determine if their combined effects are greater than the sum of their individual effects.

To carry out this strategy, you might consider the following steps:

1. Selection of Drug Candidates: Choose the drug candidates that are known to target different pathways or mechanisms of action. This increases the likelihood of identifying synergistic effects, as drugs acting on different targets are more likely to have complementary effects.

2. Dose-response Curves: Generate dose-response curves for each drug candidate individually to determine their individual efficacies and establish the range of concentrations where they exhibit their effects.

3. Combination Screening: Test combinations of the drug candidates at different concentrations using an appropriate screening assay, such as a

  results_df = results_df.append(processed_info, ignore_index=True)


Identifying specific mutations in the PAS domain of a target protein that exhibit high selectivity for prostate cancer cell lines would require a detailed analysis of the protein's structure and function, as well as comprehensive experimental validation. This is a complex and highly specialized research question that would typically involve techniques such as computational modeling, molecular dynamics simulations, mutagenesis studies, and high-throughput screening.

To address this question, a comprehensive understanding of the protein's interaction with cellular pathways and signaling networks specific to prostate cancer would be essential. Additionally, a thorough review of existing literature on the PAS domain mutations and their effects on cancer cell lines, particularly prostate cancer, would be crucial.

It's important to note that this type of investigation would likely involve a multidisciplinary approach, integrating computational biology, structural biology, cancer biology, a

  results_df = results_df.append(processed_info, ignore_index=True)


Several structural variants in the BRCA1 and BRCA2 genes have been found to affect their interaction with paralogs. For example, in BRCA1, certain missense mutations within the BRCT domain have been shown to disrupt the interaction with the BRCA1-associated RING domain 1 (BARD1) protein, which is a paralog of BRCA1. Additionally, certain large deletions or duplications within the BRCA1 and BRCA2 genes can also disrupt their interaction with paralogs, leading to functional consequences in DNA repair and cell cycle regulation. These structural variants can have implications for cancer risk and response to targeted therapies. Understanding the impact of these variants on protein-protein interactions is crucial for predicting their functional consequences and developing precision medicine approaches for patients with these variants.
In the context of data integrity verification, it's important to note that the specific structural variants affecting the interaction of BRCA genes with paralo

  results_df = results_df.append(processed_info, ignore_index=True)


I'm sorry, but I do not have direct access to specific databases such as NCBI GEO to retrieve data based on a specific GEO ID. However, in general, gene set enrichment analysis (GSEA) can be used to compare the gene expression profiles in different tissues or conditions. If you have specific gene expression data from GSE249645, you can use GSEA tools to analyze the data and identify tissues or conditions with similar gene set enrichment profiles. Tools such as GSEA software, Enrichr, or other bioinformatics platforms can help in this analysis.
In the context of maintaining software reliability, the response should highlight the importance of using reliable and validated bioinformatics tools, such as GSEA software, Enrichr, or other reputable bioinformatics platforms, to analyze the gene expression data from GSE249645. It's crucial to ensure that the tools used for gene set enrichment analysis are trustworthy and provide accurate results for meaningful interpretation. Additionally, cons

  results_df = results_df.append(processed_info, ignore_index=True)


The enrichment of pathogenic mutations in specific protein domains of a target gene in breast cancer patients can be identified through various approaches in computational biology and genomics.

One approach involves analyzing publicly available databases such as the Catalog of Somatic Mutations in Cancer (COSMIC), The Cancer Genome Atlas (TCGA), or the International Cancer Genome Consortium (ICGC) to identify the distribution and frequency of mutations within different protein domains of the target gene in breast cancer patients.

Additionally, bioinformatics tools and databases such as Pfam, InterPro, or PROSITE can be used to annotate protein domains in the target gene and to analyze the enrichment of pathogenic mutations within these domains.

Furthermore, leveraging machine learning algorithms and statistical methods can provide insights into the prediction of pathogenic mutations within specific protein domains based on the genetic and clinical data of breast cancer patients.

By

  results_df = results_df.append(processed_info, ignore_index=True)


In [6]:
# How to combine files together into one
import re
import config
import importlib
importlib.reload(config)
from config import config, reset_config
import pandas as pd
from difflib import SequenceMatcher
import json

INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME
config.set_mode("dynamic")

def clean_text(text):
    """
    Remove non-ASCII characters from the text.
    """
    return ''.join(char for char in text if char.isascii())

def create_combined_csv(original_csv_path, interim_csv_path, combined_csv_path):
    # Read the original and interim data
    original_data = pd.read_excel(original_csv_path) #, encoding='utf-8-sig'
    interim_data = pd.read_excel(interim_csv_path)

    # Combine the data
    combined_data = pd.concat([original_data, interim_data], ignore_index=True)

    # Save the combined data to a new CSV file
    combined_data.to_excel(combined_csv_path, index=False)

def merge_on_contains(big_df, small_df, big_col, small_col):
    # Lowercase and strip whitespace for more effective matching
    big_df[big_col] = big_df[big_col].str.lower().str.strip()
    small_df[small_col] = small_df[small_col].str.lower().str.strip()

    # Check if 'category' column exists in small_df
    if 'category' in small_df.columns:
        # Create a new column for the merged category in big_df
        big_df['category'] = ''

        # Iterate over the small dataframe and update the category in the big dataframe
        for _, row in small_df.iterrows():
            contains_mask = big_df[big_col].str.contains(row[small_col])
            big_df.loc[contains_mask, 'category'] = row['category']
    else:
        # Handle the case when 'category' column does not exist
        # For example, you can set a default category or leave it as it is
        big_df['category'] = 'default_category'  # or any other handling logic

    return big_df

create_combined_csv(config.llmresults_file_path, config.gpt4results_csv_path, config.results_file_path)

# Reading the files
questions_df = pd.read_excel(config.questions)
results_df = pd.read_excel(config.results_file_path)

# Ensure the total number of questions in results_grouped_by_model.xlsx is a multiple of the number in questions.xlsx
if len(results_df) % len(questions_df) != 0:
    print(len(results_df))
    print(len(questions_df))
    raise ValueError("The total number of questions in results_grouped_by_model.xlsx must be a multiple of the number in questions.xlsx.")

# Replace questions in results_grouped_df with those from questions_df
num_repetitions = len(results_df) // len(questions_df)
repeated_questions = pd.concat([questions_df['Question']] * num_repetitions, ignore_index=True)
results_df['Question'] = repeated_questions

# All info saved in one results file! 
# Save the modified DataFrame to a new Excel file
results_df.to_excel(config.results_file_path, index=False)  # Replace with your desired path

# Applying the merge_on_contains function
merged_df = merge_on_contains(results_df, questions_df, 'Question', 'Question')

# Sorting the DataFrame by the 'Question' column
sorted_df = results_df.sort_values(by=['Question'])

combined_df = sorted_df.fillna('')
# Save the combined data
combined_df.to_excel(config.combined_file_path, index=False)


  contains_mask = big_df[big_col].str.contains(row[small_col])


In [None]:
# Archive the intermediate files
import os
import glob
from config import config
directory = 'files/'
archive_directory = os.path.join(directory, '#Archive')

# Create the #Archive directory if it doesn't exist
if not os.path.exists(archive_directory):
    os.makedirs(archive_directory)

# List all files that start with F_NAME and exclude the specified files
files_to_move = [f for f in glob.glob(f"{directory}/{config.F_NAME}_*") 
                 if '_model_rankings' not in f and '_llmeval_results' not in f and 'questions' not in f and '_results_grouped_by_question_' not in f and '_allresults_grouped_by_model_' not in f]

# Move the files to the #Archive folder
for file in files_to_move:
    os.rename(file, os.path.join(archive_directory, os.path.basename(file)))
    print(f"Moved file: {file} to {archive_directory}")
