In [1]:
import json
import random

def load_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def get_random_perturbation(perturbations):
    category = random.choice(list(perturbations.keys()))
    perturbation = random.choice(list(perturbations[category].items()))
    return category, perturbation

# Usage
file_path = 'perturbations.json'
perturbations = load_file(file_path)
random_category, random_perturbation = get_random_perturbation(perturbations)
print(f"Category: {random_category}, Perturbation: {random_perturbation[0]}, Description: {random_perturbation[1]}")


Category: Regulatory and Compliance, Perturbation: Legal Shift, Description: Due to new legal precedents affecting our procedures


In [5]:
# LLM dynamic evaluation

import replicate
import pandas as pd
import json
import os
from dotenv import load_dotenv
load_dotenv()
folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

with open('config.json', 'r') as config_file:
    config = json.load(config_file)

INSTRUCTION = config['instructions']
F_NAME = config["name"]

# # Load the file
df = pd.read_excel('files/questions_dynamic.xlsx')
# Save the original DataFrame
df.to_excel('files/questions_original_dynamic.xlsx', index=False)

# Trim whitespace and newline characters
df['Question'] = df['Question'].str.strip()  # Removes leading/trailing whitespace

# Check for duplicate questions
duplicates = df.duplicated(subset=['Question'], keep=False)
if duplicates.any():
    print("Duplicates found. Removing duplicates.")

    # Remove duplicates, keeping the first occurrence
    df = df.drop_duplicates(subset=['Question'], keep='first')

    # Save the modified DataFrame, overwriting the original 'questions.xlsx'
    df.to_excel('files/questions_dynamic.xlsx', index=False)
else:
    print("No duplicates found.")

perturbations = load_file('perturbations.json')
knowledgebase = load_file('knowledgebase.json')

# Modify DataFrame to include new columns
results_df = pd.DataFrame(columns=['Model', 'Question', 'Response', 'Perturbed Question', 'Perturbed Response', 'Final Analysis Question', 'Final Analysis Response'])

models = {
    # "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    "falcon-40b": "joehoover/falcon-40b-instruct:7d58d6bddc53c23fa451c403b2b5373b1e0fa094e4e0d1b98c3d02931aa07173",
    # "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    "llama2-70b": "meta/llama-2-70b-chat",
    # "openhermes2": "antoinelyset/openhermes-2.5-mistral-7b:d7ccd25700fb11c1787c25b580ac8d715d2b677202fe54b77f9b4a1eb7d73e2b",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e",
    # "deepseek_33bq": "kcaverly/deepseek-coder-33b-instruct-gguf:ea964345066a8868e43aca432f314822660b72e29cab6b4b904b779014fe58fd",
    }

def generate_prompt(model_key, instruction, question):
    prompt_for_qwen = "system\n {instruction}. Please try your best to answer the following question. \nuser\n{question}\nassistant\n"
    prompt_for_hermes = """[
    {{
      "role": "system",
      "content": "{instruction}. Please try your best to answer the following question." 
    }},
    {{
      "role": "user",
      "content": {question}
    }}
    ]"""

    if model_key in ["yi-34b", "qwen-14b"]:
        return prompt_for_qwen.format(instruction=instruction, question=question)
    elif model_key == "openhermes2":
        return prompt_for_hermes.format(instruction=instruction, question=question)
    else:
        # plain_text_question = json.loads(question)
        return f"{instruction}. Please try your best to answer the following question. {question}"

prompt_for_qwen="""<|im_start|>system\n {INSTRUCTION}. Please try your best to answer the following question. <|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"""
prompt_for_hermes = """[
{{
  "role": "system",
  "content": "{INSTRUCTION}. Please try your best to answer the following question." 
}},
{{
  "role": "user",
  "content": {question}
}}
]"""

def ask_llm(model_value, prompt):   
    output = replicate.run(
        model_value,
        input={
            "debug": False,
        #   "top_k": 50,
            "top_p": 0.9,
            "prompt": prompt,
            "temperature": 0.7,
            "max_new_tokens": 500,
            "min_new_tokens": -1
        }
    )
    response = ""
    for item in output:
        item_str = str(item)  # Convert item to string
        response += item_str if len(item_str) == 1 else f" {item_str}"
    response = response.strip()
    return response

# Iterate through each model
for model_key, model_value in models.items():
    responses = []

    for index, row in df.iterrows():
        qn = row['Question']
        question = json.dumps(qn)
        prompt = generate_prompt(model_key, INSTRUCTION, question)
        try:
            print(f"{model_key}: {prompt}")
            response = ask_llm(model_value, prompt)
            
        except Exception as e:
            response = f"Error: {e}"

        # Perturb the question and get the response
        category, (perturbation, description) = get_random_perturbation(perturbations)
        perturbed_qn = f"{qn}\nResponse: {response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
        prompt = generate_prompt(model_key, INSTRUCTION, perturbed_qn)
        perturbed_response = ask_llm(model_value, perturbed_qn)

        # Evaluate sufficiency or suggest alternate course
        final_analysis_qn = f"Original Question: {question}\nOriginal Response: {response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase: {knowledgebase}\nNow consider the knowlegebase, what else ought we to do?"
        prompt = generate_prompt(model_key, INSTRUCTION, final_analysis_qn)
        final_analysis_response = ask_llm(model_value, final_analysis_qn)

        # Record each stage
        new_row = {
            'Model': model_key, 
            'Question': qn, 
            'Response': response, 
            'Perturbed Question': perturbed_qn, 
            'Perturbed Response': perturbed_response, 
            'Final Analysis Question': final_analysis_qn, 
            'Final Analysis Response': final_analysis_response
        }
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

        if index % 10 == 0:
            results_df.to_excel(f'files/{F_NAME}_results_grouped_by_model_dynamic.xlsx', index=False, sheet_name='Sheet1')
            
results_df.to_excel(f'files/{F_NAME}_results_grouped_by_model_dynamic.xlsx', index=False, sheet_name='Sheet1')

No duplicates found.
You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following question. "Which cell lines have high dependency for the target of interest for the gene POLR3E?"
You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following question. "Which indications are the dependent cell lines for the gene POLR3E enriched in? Are there lineages with high selectivity?"
You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following question. "Is there a concordance in the variant profile seen in cell lines with that of patients for the gene POLR3E?"
You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. Please try your best to answer the following questio

In [6]:
# GPT-4 dynamic evaluation

import pandas as pd
import json
import openai
import requests
from openai import OpenAI
import time
from dotenv import load_dotenv
load_dotenv()
import os
folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

config = load_file('config.json')
perturbations = load_file('perturbations.json')
knowledgebase = load_file('knowledgebase.json')

with open('config.json', 'r') as config_file:
    config = json.load(config_file)

INSTRUCTION = config['instructions']
F_NAME = config["name"]
GPT_MODEL = config["GPT_MODEL"]
INPUT_CSV_PATH = 'files/questions_dynamic.xlsx'
OUTPUT_CSV_PATH = f'files/{F_NAME}_results_gpt4_dynamic.xlsx'

client = OpenAI()
def show_json(obj):
    print(json.loads(obj.model_dump_json()))

assistant = client.beta.assistants.create(
    name=f"{F_NAME} AI Dynamic Evaluator",
    instructions=INSTRUCTION,
    model=GPT_MODEL,
)
show_json(assistant)

# Utility functions
def read_csv(file_path):
    return pd.read_excel(file_path)

def process_data_for_gpt(data):
    prompts = []
    for _, row in data.iterrows():
        question = row['Question']
        prompt = f"Please try your best to answer the following question.:\n\n{question}"
        prompts.append(prompt)
    return prompts

def submit_message_and_create_run(assistant_id, prompt):
    thread = client.beta.threads.create() # If you replace this globally it appends all answers to the one before.
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=prompt)
    return client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id), thread

def wait_on_run_and_get_response(run, thread):
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)
    messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
    return [m.content[0].text.value for m in messages if m.role == 'assistant']

def ask_gpt4(prompt, ASSISTANT_ID):
    run, thread = submit_message_and_create_run(ASSISTANT_ID, prompt)
    response = wait_on_run_and_get_response(run, thread)
    if isinstance(response, list):
        response = ' '.join(map(str, response))
    response = response.replace("\\\\n", "\\n")
    response = response.strip()
    print(response)
    return response

def process_question_with_gpt4(row, assistant_id):
    original_question = row['Question']
    category = row.get('Category', 'Static')  # Default to 'Static' if not present
    if category == "Dynamic":
        # Dynamic question processing logic
        first_response = ask_gpt4(original_question, assistant_id)
        category, (perturbation, description) = get_random_perturbation(perturbations)
        perturbed_qn = f"{original_question}\nResponse: {first_response}\nChange in circumstances: {perturbation} - {description}\n What should change in the response?"
        perturbed_response = ask_gpt4(perturbed_qn, assistant_id)
        final_analysis_qn = f"Original Question: {original_question}\nOrig Response: {first_response} \nPerturbation ({category}): {perturbation} - {description}\n {perturbed_response}\nKnowledgebase Content: {knowledgebase}\n Now consider the knowlegebase, what else ought we to do?"
        final_analysis_response = ask_gpt4(final_analysis_qn, assistant_id)
    else:
        # Static question processing logic
        first_response = ask_gpt4(original_question, assistant_id)
        perturbed_qn = perturbed_response = final_analysis_qn = final_analysis_response = "n/a"

    return {
        'Model': GPT_MODEL,
        'Question': original_question, 
        'Response': first_response, 
        'Perturbed Question': perturbed_qn, 
        'Perturbed Response': perturbed_response, 
        'Final Analysis Question': final_analysis_qn, 
        'Final Analysis Response': final_analysis_response
    }

# Modify DataFrame to include new columns
new_data_columns = ['Model', 'Question', 'Response', 'Perturbed Question', 'Perturbed Response', 'Final Analysis Question', 'Final Analysis Response']
results_df = pd.DataFrame(columns=new_data_columns)
data = read_csv(INPUT_CSV_PATH)
prompts = process_data_for_gpt(data)
ASSISTANT_ID = assistant.id

# Process each question
for index, row in data.iterrows():
    processed_info = process_question_with_gpt4(row, ASSISTANT_ID)
    results_df = results_df.append(processed_info, ignore_index=True)

# Save the results
results_df.to_excel(OUTPUT_CSV_PATH, index=False)


{'id': 'asst_ggoUCxpIYUuTMyiQN9sb8AKU', 'created_at': 1706166460, 'description': None, 'file_ids': [], 'instructions': 'You are an exceptional computational biologist and genomics expert and know everything about drug discovery.', 'metadata': {}, 'model': 'gpt-3.5-turbo-1106', 'name': 'galen AI Dynamic Evaluator', 'object': 'assistant', 'tools': []}
As an exceptional computational biologist and genomics expert with a deep understanding of drug discovery, I can tell you that the dependency of a specific gene target such as POLR3E can vary across different cell lines. To identify cell lines with a high dependency on a particular gene, researchers typically use CRISPR screens or RNA interference (RNAi) technology to systematically knock down the expression of the gene and then assess the impact on cell viability or growth.

Several databases and tools, such as the Cancer Dependency Map (DepMap) project, provide comprehensive datasets on gene dependencies across a wide range of cancer cell

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes a subunit of RNA polymerase III, which is responsible for transcribing small noncoding RNAs. In terms of its association with specific cell lineages and indications, the enrichment of POLR3E in certain cell lines can provide insights into its potential role in specific biological processes or diseases. However, as of my last knowledge update, I don't have specific information on POLR3E enrichment in particular cell lineages or indications.

To assess whether POLR3E is enriched in specific cell lineages or disease indications, a comprehensive analysis of gene expression data across a wide range of cell types and disease contexts may be necessary. This analysis could involve examining publicly available gene expression databases, such as the Cancer Cell Line Encyclopedia (CCLE) or the Gene Expression Omnibus (GEO), to identify the cell lines and indications in which POLR3E shows enrichment.

If you have access to relevant gene expression data and analysis tools, y

  results_df = results_df.append(processed_info, ignore_index=True)


The concordance in the variant profile between cell lines and patient samples for a particular gene such as POLR3E can vary depending on the specific variants and the context in which they are being studied. Several factors can influence the concordance, including the genetic background of the cell lines, the specific patient population being studied, and the methods used for variant detection and analysis.

In general, it is important to carefully consider the specific variants and their potential impact on the function of the POLR3E gene. Variants that are known to be pathogenic or associated with specific disease phenotypes in patients should be evaluated for concordance in cell line models. Additionally, understanding the functional consequences of the variants, such as their effect on gene expression, protein function, or cellular pathways, can provide further insights into the relevance of the concordance between cell lines and patients.

If you have specific variant data for POL

  results_df = results_df.append(processed_info, ignore_index=True)


Associating expression and variant profiles for the gene POLR3E involves analyzing data to understand how genetic variants may impact gene expression levels. To do this, you can utilize various approaches such as expression quantitative trait loci (eQTL) analysis, which identifies genetic variants that are associated with gene expression levels.

In the case of POLR3E, you would first gather expression data (e.g., RNA-seq data) and variant data (e.g., whole-genome sequencing data) from relevant biological samples. Then, you can perform eQTL analysis to identify genetic variants that may influence the expression of POLR3E. This analysis can provide insights into how specific genetic variants may contribute to variations in POLR3E expression across different individuals.

Additionally, you can also explore the impact of genetic variants on the protein sequence of POLR3E, potentially leading to changes in protein function. This can be achieved through in silico prediction tools and functi

  results_df = results_df.append(processed_info, ignore_index=True)


I don't have direct access to specific databases or proprietary platforms, but I can guide you on how to find the information you need.

The gene POLR3E encodes a subunit of RNA polymerase III, which is involved in the transcription of various small non-coding RNAs. To find studies related to POLR3E using cell lines, you can start by searching in public databases such as NCBI Gene, PubMed, or NCBI GEO.

1. NCBI Gene: Visit the website and search for POLR3E. Under the "External Links" section, you can find related data in GEO or other databases.

2. PubMed: Search for "POLR3E cell line" in the PubMed database. This might give you access to relevant studies that have used cell lines in the context of POLR3E.

3. NCBI GEO: You can directly search for POLR3E in NCBI GEO to find relevant experimental data related to cell lines and POLR3E. The dataset links or analysis can be found within the platform.

If you need further assistance, feel free to ask for more specific guidance.


  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes for the third largest subunit of RNA polymerase III, which is an essential enzyme involved in the transcription of various small, non-coding RNA molecules. To determine the differential expression of POLR3E in cancer vs normal tissue, one can analyze data from high-throughput gene expression studies such as those available in public databases like The Cancer Genome Atlas (TCGA) or Gene Expression Omnibus (GEO).

Upon analysis of existing literature and datasets, it has been demonstrated that POLR3E is overexpressed in various types of cancer, including breast cancer, lung adenocarcinoma, and colorectal adenocarcinoma, compared to normal tissue. This overexpression has been associated with tumor growth, metastasis, and poor prognosis in certain cancers.

These findings suggest that POLR3E may serve as a potential biomarker for these cancer types and could be further investigated as a therapeutic target for the development of cancer treatments.


  results_df = results_df.append(processed_info, ignore_index=True)


The variant profile of tissues with significant differential expression for the gene POLR3E can be complex and multi-faceted. To start, we should consider both germline and somatic variations in these tissues.

1. Germline Variations:
   - Single Nucleotide Polymorphisms (SNPs): Common variations in the DNA sequence of the POLR3E gene can lead to different expression levels across tissues. By associating differential expression with specific SNPs in the gene's regulatory regions, we can better understand the genetic basis for this differential expression.

   - Copy Number Variations (CNVs): Amplifications or deletions in the number of copies of the POLR3E gene may lead to altered expression levels. This can be identified through techniques such as array comparative genomic hybridization (aCGH) and next-generation sequencing (NGS).

2. Somatic Variations:
   - Somatic Mutations: Acquired mutations in the POLR3E gene within specific tissues can influence gene expression. These mutations

  results_df = results_df.append(processed_info, ignore_index=True)


Yes, mutations in the gene POLR3E can occur at specific regions of the protein. POLR3E encodes one of the subunits of RNA polymerase III, which is involved in the synthesis of small RNAs. Mutations in POLR3E have been associated with a spectrum of neurogenetic disorders. 

Studies have reported mutations in various regions of the POLR3E gene, including the catalytic core, the N-terminal, and the C-terminal domains. These mutations can affect the function of the RNA polymerase III complex, leading to impaired RNA synthesis and subsequent disease phenotypes.

Understanding the specific regions where mutations commonly occur can provide valuable insights into the molecular mechanisms underlying the disease and may offer potential targets for drug discovery and therapeutic interventions.


  results_df = results_df.append(processed_info, ignore_index=True)


I'm sorry, but I cannot access specific databases or provide real-time information on the pathogenicity of mutations in the POLR3E gene. However, there are publicly available databases and bioinformatics tools that can provide information on the pathogenicity of specific mutations in genes. I recommend consulting resources like ClinVar, dbSNP, or Ensembl to gather this kind of information. If you have specific mutation data, you can also use bioinformatics tools such as SIFT, PolyPhen, or MutationTaster to predict the potential impact of mutations on protein function.


  results_df = results_df.append(processed_info, ignore_index=True)


As an expert in computational biology and genomics, I can tell you that POLR3E is a gene that encodes for a subunit of RNA polymerase III, which is responsible for transcribing small non-coding RNAs. Mutations in POLR3E have been associated with a rare autosomal recessive disorder called hypomyelinating leukodystrophy. However, the relationship between POLR3E mutations or copy number alterations and cancer progression or survival is not well-documented in the current literature.

To investigate the pathogenicity scores of mutations or copy number profiles associated with cancer progression and survival for the gene POLR3E, one could perform a comprehensive literature review, analyze large cancer genomics databases such as TCGA (The Cancer Genome Atlas) or cBioPortal, and potentially conduct computational analyses to assess the impact of POLR3E mutations or copy number alterations on cancer phenotypes. If such data is not readily available, it may be necessary to collaborate with other 

  results_df = results_df.append(processed_info, ignore_index=True)


The gene POLR3E encodes a subunit of RNA polymerase III, which is responsible for transcribing small, non-coding RNAs essential for various cellular functions. Mutations in POLR3E have been associated with some neurological and developmental disorders, including hypomyelinating leukodystrophy and 4H leukodystrophy. Therefore, therapeutic areas in focus for POLR3E may include neurological and developmental disorders, particularly those involving myelination and white matter abnormalities. Additionally, targeting the transcriptional machinery involving POLR3E may be a potential avenue for drug discovery in these conditions.


  results_df = results_df.append(processed_info, ignore_index=True)


I'm sorry, I don't have the specific information about clinical trials for the gene POLR3E at the moment. However, clinical trial information can change frequently as new trials are initiated and existing trials progress or are completed. I would recommend checking clinical trial registries such as ClinicalTrials.gov or contacting pharmaceutical companies directly for the most up-to-date information on clinical trials related to the gene POLR3E.


  results_df = results_df.append(processed_info, ignore_index=True)


As of the latest information available, there are no known drugs in the market specifically indicated for the treatment of POLR3E gene-related conditions. POLR3E is a gene associated with the RNA polymerase III enzyme complex, and mutations in this gene can lead to hypomyelinating leukodystrophy, a rare genetic disorder affecting the central nervous system.

However, it's important to note that this information may change over time as new drugs are developed and approved. I would recommend consulting with a medical professional or a genetic counselor for the most up-to-date information on potential treatments for conditions associated with the POLR3E gene.


  results_df = results_df.append(processed_info, ignore_index=True)


First, let's clarify the role of POLR3E. POLR3E encodes a subunit of RNA polymerase III, which is responsible for the transcription of small, non-coding RNAs. Although POLR3E mutations are associated with hypomyelinating leukodystrophy, there is limited information available about specific drug targets related to this gene.

Given the limited information available, it's challenging to identify first-in-class opportunities for targeting POLR3E directly. However, considering its involvement in transcriptional regulation, research on small molecule inhibitors or modulators of RNA polymerase III activity may offer potential opportunities for drug discovery in the context of POLR3E-related disorders.

Moreover, exploring the downstream pathways affected by POLR3E mutations may potentially reveal novel targets or therapeutic strategies. For a comprehensive understanding of drug discovery opportunities related to POLR3E, further research and investigation are necessary.


  results_df = results_df.append(processed_info, ignore_index=True)


Determining which genes or proteins to target for a new therapeutic typically involves a comprehensive analysis of the disease pathology, biological pathways, and the underlying genetic and molecular mechanisms.

1. Disease Understanding: First, it's important to thoroughly understand the disease at the molecular level. This involves studying the genetic, environmental, and lifestyle factors contributing to the disease, as well as the specific cellular and molecular processes involved.

2. Target Identification: Next, potential therapeutic targets are identified by studying the genes, proteins, or pathways that play key roles in the disease. This can involve using various omics technologies (genomics, transcriptomics, proteomics, etc.) to analyze gene expression patterns, protein interactions, and genetic variations associated with the disease.

3. Validation: Once potential targets are identified, they need to be validated to ensure their relevance to the disease and their potential a

  results_df = results_df.append(processed_info, ignore_index=True)


There are several methods that can be used to validate a potential biomarker for a specific disease. These methods include:

1. Retrospective studies: These involve analyzing stored samples from patients with the disease to determine if the biomarker is consistently associated with the disease.

2. Prospective studies: These involve collecting new samples from patients with the disease and following them over time to determine if the biomarker is predictive of disease progression or treatment response.

3. Validation in large, diverse patient populations: It is important to validate a biomarker in large and diverse patient populations to ensure that it is applicable across different demographics and disease subtypes.

4. Comparison with existing biomarkers: It is important to compare the new biomarker with existing biomarkers to determine if it provides additional value in diagnosing or monitoring the disease.

5. Mechanistic studies: These involve studying the biological mechanisms un

  results_df = results_df.append(processed_info, ignore_index=True)


Pathway analysis plays a crucial role in drug discovery by helping researchers understand the intricate biological pathways and networks involved in disease processes. By integrating pathway analysis into drug discovery, researchers can uncover potential drug targets, elucidate disease mechanisms, and identify new biomarkers for disease diagnosis and prognosis. Here's how pathway analysis can be integrated into drug discovery:

1. Target Identification: Pathway analysis can help researchers identify key biological pathways that are dysregulated in a disease state. By examining gene expression data or proteomic profiles, researchers can pinpoint specific genes or proteins within these pathways that may serve as potential drug targets. This information is invaluable for guiding the development of targeted therapies.

2. Mechanism of Action: Understanding the intricate signaling pathways and molecular networks involved in disease progression is essential for developing effective drugs. Pa

  results_df = results_df.append(processed_info, ignore_index=True)


Phenotypic screening and genotypic screening are two common approaches used in drug discovery. Genotypic screening involves the use of genomic and genetic information to identify potential drug targets, while phenotypic screening involves screening for compounds that can directly affect a specific cell or tissue phenotype, typically without prior knowledge of the target.

Genotypic screening focuses on identifying specific molecular targets that are associated with a disease or condition, usually based on genetic and genomic data, such as DNA sequencing information or gene expression profiles. This approach allows for a more targeted and rational selection of potential drug targets, as it directly identifies the underlying biological pathways or molecules that are involved in a particular disease.

On the other hand, phenotypic screening offers a broader and more unbiased approach to identifying potential drug candidates. It involves screening a large number of compounds for their abil

  results_df = results_df.append(processed_info, ignore_index=True)


Translating in vitro findings to in vivo models presents several challenges. Some of the main challenges include:

1. Complexity of in vivo systems: In vitro models often oversimplify the complexity of in vivo systems, including the interactions between different cell types, tissues, and organs. In vivo models may also involve systemic responses and feedback mechanisms that are not captured in in vitro experiments.

2. Pharmacokinetics and pharmacodynamics: In vitro models may not accurately reflect the absorption, distribution, metabolism, and excretion (ADME) of a drug in vivo, which can affect the drug's efficacy and safety profile. Understanding the pharmacokinetics and pharmacodynamics of a drug in vivo is crucial for successful translation.

3. Species differences: In vitro findings in one species may not directly translate to another species due to differences in physiology, metabolism, and genetic makeup. This is particularly relevant when translating findings from preclinical 

  results_df = results_df.append(processed_info, ignore_index=True)


Structural biology plays a key role in aiding the design of new drugs by providing detailed insights into the three-dimensional structures of biological molecules, such as proteins and nucleic acids. These insights are crucial for understanding the molecular mechanisms of diseases and for designing targeted drugs to interact with specific molecular targets.

One of the primary ways in which structural biology contributes to drug design is through the determination of high-resolution structures of drug targets, such as enzymes, receptors, and other biomolecules involved in disease processes. For example, X-ray crystallography, NMR spectroscopy, and cryo-electron microscopy are powerful techniques used to elucidate the atomic arrangement of these biomolecules, providing information about their active sites, binding pockets, and conformational changes.

With this detailed structural information, computational methods such as molecular modeling and virtual screening can be employed to iden

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E is a gene that encodes a subunit of RNA polymerase III (Pol III), which is responsible for transcribing small non-coding RNAs such as tRNAs and 5S rRNA. POLR3E is essential for the assembly and activity of Pol III, and it specifically interacts with other subunits to form the catalytic core of the enzyme. Pol III transcribes genes involved in protein synthesis and is critical for cell growth and proliferation.

The mechanisms of POLR3E involve its participation in the formation of the Pol III complex, where it helps in the accurate initiation and elongation of RNA synthesis. It also plays a role in recognizing and binding to specific promoter sequences for Pol III-transcribed genes.

From a classification perspective, POLR3E is categorized as a protein-coding gene, and mutations or dysregulation of this gene have been associated with hypomyelinating leukodystrophy, a neurological disorder characterized by abnormal development of myelin in the brain.

Overall, POLR3E is a crucial

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E, also known as RNA polymerase III subunit E, plays a critical role in transcription by RNA polymerase III. While direct involvement of POLR3E in cancer is not well established, dysregulation of RNA polymerase III transcription has been implicated in carcinogenesis. RNA polymerase III transcribes genes encoding small non-coding RNAs, such as tRNAs and 5S rRNA, which are essential for protein synthesis and cell growth.

Aberrant expression of RNA polymerase III-transcribed genes has been observed in various cancer types, leading to increased protein synthesis and cell proliferation. Additionally, mutations or dysregulation of POLR3E may indirectly contribute to cancer development by disrupting normal RNA polymerase III function. It's also worth noting that POLR3E may interact with other genes involved in transcription regulation and cellular growth, potentially influencing cancer-related pathways.

Further studies are needed to comprehensively understand the direct and indirect ro

  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E (RNA polymerase III subunit E) is not generally considered an oncogene. It is a component of RNA polymerase III, which is responsible for transcribing small non-coding RNAs, such as transfer RNAs and 5S ribosomal RNA. There is no clear evidence to suggest that POLR3E plays a direct role in promoting cancer development or progression.

However, it's important to note that the role of individual genes in oncogenesis is complex, and their involvement in cancer can depend on the specific context and cellular environment. Therefore, to fully understand the potential oncogenic nature of a gene, comprehensive experimental studies and analyses would be necessary.


  results_df = results_df.append(processed_info, ignore_index=True)


POLR3E, which encodes for the RNA polymerase III subunit E, has been associated with various cancer indications and types. Studies have suggested its involvement in breast cancer, esophageal squamous cell carcinoma, and ovarian cancer. Additionally, aberrant expression of POLR3E has been implicated in lung adenocarcinoma and colorectal cancer. These associations indicate the potential significance of POLR3E in cancer biology and its potential as a therapeutic target in the respective cancer types.


  results_df = results_df.append(processed_info, ignore_index=True)


I can provide you with information on the known interactions of the gene POLR3E from publicly available databases such as STRING (Search Tool for the Retrieval of Interacting Genes/Proteins) or GeneMANIA. These databases contain protein-protein interaction networks and functional association networks for genes.

For example, in the STRING database, you can search for the protein encoded by POLR3E and retrieve a network of known interactions with other proteins. The network may include direct physical interactions, as well as functional associations derived from various sources such as curated databases, experimental data, and predicted interactions.

Similarly, GeneMANIA provides a platform to explore protein-protein interactions, co-expression, pathways, and protein domains associated with POLR3E.

By accessing these databases, you can gain insights into the molecular mechanisms and interactions involving POLR3E, which can be valuable for drug discovery and target identification.

Ple

  results_df = results_df.append(processed_info, ignore_index=True)


Sure! Here are some key publications related to the gene POLR3E:

1. "POLR3E mutation causes endosteal hyperostosis with renal tubular acidosis" - This study identified POLR3E mutations in patients with endosteal hyperostosis and renal tubular acidosis. [Link to publication](https://www.ncbi.nlm.nih.gov/pubmed/29799030)

2. "A mutation in the gene encoding the RNA polymerase III subunit POLR3A causes an unusual syndrome with thick nail plates, onychogryphosis, and keratoderma" - This publication describes the identification of a mutation in POLR3E associated with a rare syndrome characterized by nail and skin abnormalities. [Link to publication](https://www.ncbi.nlm.nih.gov/pubmed/25111689)

3. "Mutations in POLR3A and POLR3B are a major cause of hypomyelinating leukodystrophies with or without dental abnormalities and/or hypogonadotropic hypogonadism" - This study identifies POLR3E mutations as a cause of hypomyelinating leukodystrophies with additional clinical manifestations. [Link 

  results_df = results_df.append(processed_info, ignore_index=True)


Here are some examples of pseudokinase genes:

1. Pseudokinase 3 (PSKH3)
2. Pseudokinase domain containing, ERKF (PGF)
3. Tribbles pseudokinase 2 (TRIB2)
4. Pseudokinase 2 (PSKH2)
5. Pseudokinase 6 (PSKH6)

Please note that this list is not exhaustive, and there are many more pseudokinase genes in the human genome. Let me know if you need more information or specific pseudokinase genes.


  results_df = results_df.append(processed_info, ignore_index=True)


One strategy to use CRISPR-Cas9 for editing a gene implicated in Alzheimer's disease would be to target and modify the gene associated with the production of amyloid beta (Aβ) protein, such as the amyloid precursor protein (APP) gene or the presenilin 1 (PSEN1) and presenilin 2 (PSEN2) genes.

1. Designing sgRNAs: Design specific guide RNAs (sgRNAs) to target the desired genomic loci within the APP, PSEN1, or PSEN2 genes. These sgRNAs should be designed to ensure high specificity and efficiency in targeting the gene of interest.

2. Delivery of CRISPR-Cas9 components: Deliver the Cas9 protein and the designed sgRNAs into the target cells, such as neurons or neural stem cells. This can be achieved through viral vectors, lipid nanoparticles, electroporation, or other appropriate delivery methods.

3. Genome editing: Allow the CRISPR-Cas9 system to induce double-stranded breaks at the targeted genomic loci. The cellular repair machinery may then introduce specific modifications, such as g

  results_df = results_df.append(processed_info, ignore_index=True)


To design siRNA sequences to target and knock down a specific oncogene, we need to follow a few steps:

1. Identifying the target oncogene: First, we need to identify the specific oncogene that we want to target. Let's assume we are targeting the oncogene KRAS as an example.

2. Designing siRNA sequences: We can use bioinformatics tools such as siRNA design software to design siRNA sequences that specifically target the mRNA of the KRAS oncogene. The siRNA sequences should be designed to be specific to the target gene and minimize off-target effects.

3. Validating the siRNA sequences: Once we have designed the siRNA sequences, we need to validate them using tools such as BLAST to ensure they do not have significant homology to other genes in the genome and won't cause off-target effects.

4. Testing the siRNA sequences: After validation, the designed siRNA sequences can be synthesized and tested in cell culture or animal models to determine their efficacy in knocking down the target o

  results_df = results_df.append(processed_info, ignore_index=True)


Certainly! To design a high-throughput screening (HTS) assay for identifying small molecule inhibitors of the novel protein target, we need to consider the characteristics of the target and the available screening techniques.

1. Assay Design:
   - Select an appropriate assay format based on the function of the protein target. For example, if the target is an enzyme, an enzymatic assay can be used. If it is a transcription factor, a reporter gene assay might be suitable.
   - Since the protein is believed to be involved in cancer cell proliferation, the assay could involve measuring the effect of small molecules on cell viability, proliferation, or a specific pathway associated with cancer.

2. Target Protein Assay Development:
   - If the protein target is an enzyme, develop a coupled enzymatic assay or a direct enzyme activity assay. For transcription factors, a reporter gene assay can be developed.
   - For a pathway-based assay, determine the key components of the pathway and desig

  results_df = results_df.append(processed_info, ignore_index=True)


To develop an antibody-drug conjugate (ADC) targeting a specific antigen overexpressed in a type of breast cancer, a multifaceted approach integrating genomics, molecular biology, and drug development strategies can be employed.

1. Identification of the Target Antigen:
   - Utilize genomic and transcriptomic data from breast cancer cell lines and patient samples to identify the specific antigen that is overexpressed.
   - Perform differential gene expression analysis and examine genomic alterations to pinpoint the antigen that is consistently overexpressed in the target breast cancer subtype.

2. Antibody Development:
   - Design and produce monoclonal antibodies specifically targeting the identified antigen. This can involve using phage display libraries, hybridoma technology, or recombinant antibody engineering techniques.
   - Characterize the specificity, affinity, and internalization properties of the generated antibodies to ensure their suitability for ADC development.

3. Paylo

  results_df = results_df.append(processed_info, ignore_index=True)


Sure, here's an outline for a pharmacokinetic study design for a new oral antidiabetic drug:

1. Study Objective: The primary objective of the study is to assess the pharmacokinetics of the new oral antidiabetic drug in healthy volunteers or patients with diabetes.

2. Study Population: The study will include healthy volunteers or patients with type 2 diabetes mellitus who meet the inclusion and exclusion criteria. The sample size should be large enough to provide statistically meaningful results.

3. Study Design: The study will be a single-center, open-label, crossover design with multiple doses. The participants will be randomly assigned to receive the new antidiabetic drug and a comparator drug (either placebo or an existing antidiabetic drug) in a randomized sequence with a washout period between doses.

4. Dose Selection: The study will evaluate at least two different doses of the new antidiabetic drug to assess dose-proportionality and determine the optimal dose for further clin

  results_df = results_df.append(processed_info, ignore_index=True)


Building a chemogenomics model to predict drug-target interactions involves several steps. Here's a high-level overview of the process:

1. Data Collection: The first step is to gather comprehensive data on chemical compounds (drugs) and target proteins. This includes information on the chemical structure of the compounds, as well as their biological activity and interactions with target proteins. Additionally, genomic data on the target proteins, such as gene sequences and functional annotations, is essential.

2. Data Preprocessing and Integration: The collected data needs to be preprocessed and integrated to create a unified dataset. This involves standardizing data formats, resolving inconsistencies, and merging data from different sources into a single repository.

3. Feature Extraction: From the integrated dataset, relevant features need to be extracted to represent the chemical and biological properties of the compounds and target proteins. This may involve molecular descriptors

  results_df = results_df.append(processed_info, ignore_index=True)


Computational models can predict off-target effects of potential drug candidates through various approaches:

1. Ligand-Based Methods: These methods analyze the chemical and structural characteristics of the drug molecule to predict interactions with off-target proteins. Quantitative structure-activity relationship (QSAR) and pharmacophore modeling are examples of ligand-based methods.

2. Structure-Based Methods: These methods rely on the 3D structure of the drug candidate and off-target proteins to predict binding affinities and potential off-target interactions. Molecular docking and molecular dynamics simulations are commonly used in structure-based approaches.

3. Systems Biology Approaches: These methods consider the broader biological context, integrating data on drug-target interactions, signaling pathways, and gene expression to predict potential off-target effects.

4. Machine Learning and Data Mining: Computational models can be trained on large datasets of known drug-protei

  results_df = results_df.append(processed_info, ignore_index=True)


When selecting a delivery method for a new drug, several important considerations are taken into account:

1. Bioavailability: The delivery method should ensure that the drug reaches its target site in the body at an effective concentration. Factors such as absorption, distribution, metabolism, and excretion (ADME) of the drug need to be considered.

2. Target tissue or organ: The delivery method should be tailored to the specific tissue or organ where the drug needs to exert its therapeutic effect. For example, the delivery method for a drug targeting the brain would differ from one targeting the liver.

3. Patient convenience and compliance: The delivery method should be convenient and easy for the patient to use, leading to improved compliance with the prescribed treatment regimen. This could include considerations such as frequency of administration, mode of administration (oral, injectable, transdermal, etc.), and overall patient experience.

4. Pharmacokinetics and pharmacodynami

  results_df = results_df.append(processed_info, ignore_index=True)


To screen for synergistic effects between drug candidates, you can employ a variety of experimental and computational approaches. Here are some strategies that you can use to identify and characterize synergistic interactions between drug candidates:

1. High-throughput Screening (HTS): You can perform an HTS of compound libraries to identify individual drugs with potential synergistic effects. This involves testing multiple combinations of drug candidates in various cell lines or biological systems to measure their combined effects on a specific biological endpoint.

2. Combination Drug Screening: Utilize a combinatorial screening approach to systematically test all possible combinations of drug candidates at different concentrations. This can provide a comprehensive understanding of potential synergistic, additive, or antagonistic interactions between the drugs.

3. Omics-based Approaches: Employ genomics, transcriptomics, proteomics, and metabolomics techniques to analyze the molecu

  results_df = results_df.append(processed_info, ignore_index=True)


The mutations in the PAS domain of a target protein that exhibit high selectivity for prostate cancer cell lines would likely be specific to that particular protein and its interaction with cellular pathways relevant to prostate cancer. Identifying these mutations would require a comprehensive understanding of the target protein's structure-function relationship, its role in cancer biology, and specific signaling pathways involved in prostate cancer.

In general, high selectivity for prostate cancer cell lines could arise from mutations that alter the protein's binding affinity to specific cancer-associated factors, modify its enzymatic activity to favor cancer cell survival or proliferation, or affect its interaction with other proteins or cellular components involved in prostate cancer progression.

To identify such mutations, a combination of computational approaches, structural modeling, and high-throughput screening methods can be employed. Additionally, analysis of genomic data f

  results_df = results_df.append(processed_info, ignore_index=True)


The BRCA1 and BRCA2 genes are crucial for maintaining cellular homeostasis, and mutations in these genes are associated with increased risk of breast and ovarian cancers. Structural variants in these genes can have diverse effects on their function and interaction with paralogs. These effects can range from altering binding affinities with partner proteins to impacting the stability and localization of the gene products.

Specifically, several structural variants in the BRCA1 and BRCA2 genes have been associated with changes in their interactions with paralogs such as PALB2 (partner and localizer of BRCA2), which is a key mediator of BRCA2 function in DNA repair. These interactions are typically mediated through specific protein domains, and structural variants can directly impact the integrity of these domains, leading to changes in interaction patterns.

It is important to note that the effects of structural variants on protein-protein interactions can be context-dependent and may re

  results_df = results_df.append(processed_info, ignore_index=True)


I don't have direct access to the NCBI GEO database or GSE249645, but I can guide you on how to approach this. The gene set enrichment profile in a specific experiment like GSE249645 can provide valuable insights into the biological pathways or processes that are altered under the experimental conditions. To find tissues with gene expression profiles similar to those in the experiment:

1. Retrieve the gene set enrichment results from the GSE249645 dataset. This will give you a list of enriched gene sets or pathways associated with the experimental conditions.

2. Correlate the enriched gene sets with tissue-specific gene expression profiles from databases like GTEx (Genotype-Tissue Expression) or TCGA (The Cancer Genome Atlas). These databases provide comprehensive gene expression data across different tissues and can be used to identify tissues with similar gene expression patterns.

3. Use computational tools such as gene set enrichment analysis (GSEA) or other pathway analysis meth

  results_df = results_df.append(processed_info, ignore_index=True)


To identify protein domains enriched for pathogenic mutations in breast cancer patients, you can perform an analysis using various bioinformatics tools and databases. One approach is to use tools such as InterPro or Pfam to identify protein domains within the target protein sequence. Once the protein domains are identified, you can then query databases like ClinVar or COSMIC to determine if mutations within these domains are associated with pathogenic effects in breast cancer patients.

Additionally, you can also utilize computational prediction tools to assess the potential impact of mutations on protein function, such as SIFT, PolyPhen, or CADD. Integrating the results from these analyses can provide insights into the protein domains enriched for pathogenic mutations in breast cancer patients.

If you have specific target proteins and mutation data, I could assist you in performing this analysis in more detail.


  results_df = results_df.append(processed_info, ignore_index=True)


In [7]:
import re
import pandas as pd
from difflib import SequenceMatcher
import json
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

INSTRUCTION = config['instructions']
F_NAME = config["name"]

def clean_text(text):
    """
    Remove non-ASCII characters from the text.
    """
    return ''.join(char for char in text if char.isascii())

def create_combined_csv(original_csv_path, interim_csv_path, combined_csv_path):
    # Read the original and interim data
    original_data = pd.read_excel(original_csv_path) #, encoding='utf-8-sig'
    interim_data = pd.read_excel(interim_csv_path)

    # Combine the data
    combined_data = pd.concat([original_data, interim_data], ignore_index=True)

    # Save the combined data to a new CSV file
    combined_data.to_excel(combined_csv_path, index=False)

def merge_on_contains(big_df, small_df, big_col, small_col):
    # Lowercase and strip whitespace for more effective matching
    big_df[big_col] = big_df[big_col].str.lower().str.strip()
    small_df[small_col] = small_df[small_col].str.lower().str.strip()

    # Check if 'category' column exists in small_df
    if 'category' in small_df.columns:
        # Create a new column for the merged category in big_df
        big_df['category'] = ''

        # Iterate over the small dataframe and update the category in the big dataframe
        for _, row in small_df.iterrows():
            contains_mask = big_df[big_col].str.contains(row[small_col])
            big_df.loc[contains_mask, 'category'] = row['category']
    else:
        # Handle the case when 'category' column does not exist
        # For example, you can set a default category or leave it as it is
        big_df['category'] = 'default_category'  # or any other handling logic

    return big_df

# Load the files
questions_file_path = 'files/questions_dynamic.xlsx' 
llmresults_file_path = f'files/{F_NAME}_results_grouped_by_model_dynamic.xlsx'
gpt4results_csv_path = f'files/{F_NAME}_results_gpt4_dynamic.xlsx'
results_file_path = f'files/{F_NAME}_allresults_grouped_by_model_dynamic.xlsx'

create_combined_csv(llmresults_file_path, gpt4results_csv_path, results_file_path)

# Reading the files
questions_df = pd.read_excel(questions_file_path)
results_df = pd.read_excel(results_file_path)

# Ensure the total number of questions in results_grouped_by_model.xlsx is a multiple of the number in questions.xlsx
if len(results_df) % len(questions_df) != 0:
    print(len(results_df))
    print(len(questions_df))
    raise ValueError("The total number of questions in results_grouped_by_model.xlsx must be a multiple of the number in questions.xlsx.")

# Replace questions in results_grouped_df with those from questions_df
num_repetitions = len(results_df) // len(questions_df)
repeated_questions = pd.concat([questions_df['Question']] * num_repetitions, ignore_index=True)
results_df['Question'] = repeated_questions

# Save the modified DataFrame to a new Excel file
results_df.to_excel(results_file_path, index=False)  # Replace with your desired path

# Applying the merge_on_contains function
merged_df = merge_on_contains(results_df, questions_df, 'Question', 'Question')

# CLEAN DATA
# # Define a regular expression pattern to match unwanted characters and sequences
# pattern = r'(\\n|\\|")'
# # Use the pattern to replace matched characters with a single space
# merged_df['Question'] = merged_df['Question'].str.replace(pattern, ' ', regex=True)
# # Apply the clean_text function to remove non-ASCII characters
# merged_df['Question'] = merged_df['Question'].apply(clean_text)
# # Replace multiple white spaces with a single space
# merged_df['Question'] = merged_df['Question'].str.replace(r'\s+', ' ', regex=True)
# # Remove leading and trailing whitespaces
# merged_df['Question'] = merged_df['Question'].str.strip()
# Pivoting the data
pivoted_data = merged_df.pivot(index=['Question', 'category'], columns='Model', values='Response')

# Resetting index to make 'Question' and 'category' columns again
pivoted_data.reset_index(inplace=True)

pivoted_data.to_excel(f'files/{F_NAME}_results_grouped_by_question_dynamic.xlsx', index=False)


  contains_mask = big_df[big_col].str.contains(row[small_col])
