In [1]:
import replicate
import pandas as pd
import json
from dotenv import load_dotenv
load_dotenv()
# Load the run configuration. Its 'instructions' entry is the instruction text
# that is placed in front of (or in the system turn of) every prompt.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

INSTRUCTION = config['instructions']

# Load the CSV file; the evaluation loop reads its 'Question' column.
df = pd.read_csv('questions.csv')

# DataFrame to store the results
results_df = pd.DataFrame(columns=['Model', 'Question', 'Response'])

# Short model key -> model reference handed to replicate.run().
# Commented-out entries are disabled models; the prompt selection in the
# evaluation loop still knows their keys ("qwen-14b", "yi-34b").
models = {
    # "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    # "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    "llama2-70b" : "meta/llama-2-70b-chat",
    "openhermes2": "antoinelyset/openhermes-2.5-mistral-7b:d7ccd25700fb11c1787c25b580ac8d715d2b677202fe54b77f9b4a1eb7d73e2b",
    # "mixtral-32kseqlen": "nateraw/mixtral-8x7b-32kseqlen:db11a6311e4ce1fc6b3904d944f6052b17dcd1236ea976690e4943c4e807cd9b",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e"
    }


def _escape_format_braces(text):
    """Return *text* with '{' and '}' doubled so str.format keeps them literal."""
    return text.replace("{", "{{").replace("}", "}}")


# Prompt templates. The instruction is substituted here, once, so that the only
# field left for str.format() is {question}. Previously the templates carried an
# {INSTRUCTION} field that was never supplied to .format(), which raised
# KeyError as soon as a template was used.
#
# Template using the <|im_start|>/<|im_end|> chat markers (system / user /
# assistant turns).
prompt_for_qwen = (
    "<|im_start|>system\n"
    + _escape_format_braces(str(INSTRUCTION))
    + "<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
)

# JSON array of role/content messages. The instruction is JSON-encoded so quotes
# or newlines inside it cannot break the document, and {question} must likewise
# be filled with a JSON string literal (e.g. the result of json.dumps).
prompt_for_hermes = (
    '[\n'
    '{{\n'
    '  "role": "system",\n'
    '  "content": ' + _escape_format_braces(json.dumps(str(INSTRUCTION))) + '\n'
    '}},\n'
    '{{\n'
    '  "role": "user",\n'
    '  "content": {question}\n'
    '}}\n'
    ']'
)

# Ask every configured model every question and store the answers in
# results_grouped_by_model.csv (one row per model/question pair).

# Number of answered questions between intermediate saves. 1 means the CSV is
# rewritten after every answer, so an interrupted run loses at most one call.
SAVE_EVERY = 1
RESULTS_PATH = 'results_grouped_by_model.csv'
RESULT_COLUMNS = ['Model', 'Question', 'Response']


def _build_prompt(model_key, question):
    """Return the prompt text for *model_key*.

    *question* is the JSON-encoded question (encoded exactly once). The
    "yi-34b" and "qwen-14b" keys use ``prompt_for_qwen``, "openhermes2" uses
    ``prompt_for_hermes``, and every other model gets the plain
    "<instruction>. <question>" form.
    """
    # INSTRUCTION is passed alongside the question so that a template which
    # contains an {INSTRUCTION} field gets it filled in; str.format ignores
    # keyword arguments the template does not use.
    if model_key in ("yi-34b", "qwen-14b"):
        return prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, question=question)
    if model_key == "openhermes2":
        return prompt_for_hermes.format(INSTRUCTION=INSTRUCTION, question=question)
    return f"{INSTRUCTION}. {question}"


def _collect_response(output):
    """Concatenate the chunks yielded by the model into one stripped string.

    Chunks are joined without a separator: inserting spaces between them
    would split words and double up existing whitespace.
    """
    return "".join(str(chunk) for chunk in output).strip()


result_rows = []

# Iterate through each model
for model_key, model_value in models.items():
    for qn in df['Question']:
        question = json.dumps(qn)
        prompt = _build_prompt(model_key, question)
        print(prompt)

        try:
            output = replicate.run(
                model_value,
                input={
                  "debug": False,
                #   "top_k": 50,
                  "top_p": 0.9,
                  "prompt": prompt,
                  "temperature": 0.5,
                  "max_new_tokens": 500,
                  "min_new_tokens": -1
                }
            )
            response = _collect_response(output)
        except Exception as e:
            # Best effort: record the failure for this question and keep going
            # with the remaining questions and models.
            response = f"Error: {e}"

        result_rows.append({'Model': model_key, 'Question': qn, 'Response': response})

        if len(result_rows) % SAVE_EVERY == 0:
            results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
            results_df.to_csv(RESULTS_PATH, index=False)

# Final save; also leaves the complete table in results_df.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
results_df.to_csv(RESULTS_PATH, index=False)

"Which cell lines have high dependency for the target of interest for the gene POLR3E?"
"Which indications are the dependent cell lines for the gene POLR3E enriched in? Are there lineages with high selectivity?"
"Is there a concordance in the variant profile seen in cell lines with that of patients for the gene POLR3E?"
"Describe associations between expression and variant profiles for the gene POLR3E."
"Have cell lines been used in other studies for the target of interest of the gene POLR3E? If experimental data is available on NCBI GEO, provide those links or analysis within the platform."
"For the gene POLR3E, identify indications with significant differential expression in cancer vs normal tissue. Is it overexpressed or underexpressed in tumor tissue?"
"Describe the variant profile of tissues where there is significant differential expression for the gene POLR3E. Include details on copy number variations, mutations, etc."
"Are there specific regions of the protein of the gene POLR3