In [3]:
# Example from Llama Index: RAG with Papers
from llama_index import VectorStoreIndex, SimpleDirectoryReader
import pandas as pd

documents = SimpleDirectoryReader("files/Papers_FullText/").load_data()
database = SimpleDirectoryReader("files/db/").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

file_path = 'files/questions_rag.xlsx'
questions_df = pd.read_excel(file_path, sheet_name='Sheet1')

responses = []
for index, row in questions_df.iterrows():
    question = row['Questions']
    print(f"Question {index + 1}: {row['Questions']}")
    response = query_engine.query(question)
    responses.append(response)
    print(f"Response is: {response}")

Question 1: Define synthetic lethality
Response is: Synthetic lethality refers to a genetic phenomenon where the simultaneous mutation of two genes leads to cell death, while mutation of either gene alone is compatible with cell viability. In other words, the combination of mutations in both genes is lethal, but each individual mutation is not. This concept has been observed in various organisms, including humans, and has implications in cancer research and therapy. It suggests that targeting the products of genes that are synthetic lethal to cancer-causing mutations could selectively kill cancer cells while sparing normal cells.
Question 2: Synthetic lethality was discovered in which model organism?
Response is: Synthetic lethality was discovered in the fruit fly, Drosophila melanogaster.
Question 3: PARP gene expression shows synthetic lethal relationship with mutations in which genes?
Response is: The context information does not provide specific information about which genes PARP g

In [6]:
# Create an index via OpenAI embeddings
import os
import re
import numpy as np
from PyPDF2 import PdfReader
import faiss
import nltk
from openai import OpenAI
from config import config

# Download NLTK punkt tokenizer models
nltk.download('punkt')
client = OpenAI()
CHUNK_SIZE = 128

def clean_text(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text).strip()

def get_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

def get_embedding(sentence):
    response = client.embeddings.create(
        input=sentence,
        model="text-embedding-3-small"
    )
    return np.array(response.data[0].embedding, dtype='float32')

def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def chunk_text(text, chunk_size=CHUNK_SIZE):
    """
    Splits the text into smaller chunks, each with a maximum size of chunk_size tokens.
    """
    tokens = text.split()
    for i in range(0, len(tokens), chunk_size):
        yield ' '.join(tokens[i:i + chunk_size])

def process_text(text, faiss_index, id_map):
    for chunk in chunk_text(clean_text(text)):
        sentences = get_sentences(chunk)
        for sentence in sentences:
            embedding = get_embedding(sentence)
            idx = faiss_index.ntotal
            faiss_index.add(np.array([embedding]))
            id_map[idx] = sentence

def process_folder(folder_path, faiss_index, id_map):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith('.pdf'):
            text = read_pdf(file_path)
        elif file_name.endswith('.txt'):
            text = read_text_file(file_path)
        else:
            continue  # Skip other file formats
        process_text(text, faiss_index, id_map)

def save_faiss_index(faiss_index, file_name):
    faiss.write_index(faiss_index, file_name)

def load_faiss_index(file_name):
    return faiss.read_index(file_name)

def search_index(query_embedding, faiss_index, id_map, k):
    distances, indices = faiss_index.search(np.array([query_embedding]), k)
    return [(id_map[idx], distances[0][i]) for i, idx in enumerate(indices[0])]

# Initialize FAISS index and ID map
dimension = 1536  # Adjust based on your model's output
faiss_index = faiss.IndexFlatL2(dimension)
id_map = {}

# Process files and index embeddings
folder_path = 'files/Papers_FullText'
process_folder(folder_path, faiss_index, id_map)

# Save the index for later use
save_faiss_index(faiss_index, 'utils/faiss_index.idx')

[nltk_data] Downloading package punkt to /Users/rohit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Example usage: Load the index and search
faiss_index = load_faiss_index('utils/faiss_index.idx')
file_path = 'files/questions_rag.xlsx'
questions_df = pd.read_excel(file_path, sheet_name='Sheet1')

results = []
for index, row in questions_df.iterrows():
    question = row['Question']
    print(f"Question {index + 1}: {row['Question']}")
    query_embedding = get_embedding(question)
    result = search_index(query_embedding, faiss_index, id_map,3)
    print(result)
    results.append(result)
    print(f"Response is: {result}")


Question 1: Define synthetic lethality
[('to identify synthetic lethal inter actions For these reasons most largescale synthetic lethal genetic interaction screens have been carried out in budding yeast or fission yeast as technologies that facilitate the highthroughput generation and analysis of double mutants under defined laboratory conditions are readily available Advances in RNA interference RNAi and more recently CRISPR technology have now made it possible to carry out largescale unbiased synthetic lethality screening directly in human cell cultureMichael Smith Laboratories University of British Columbia 2185 East Mall Vancouver British Columbia V6T 1Z4 Canada Correspondence to P H hietermslubcca doi101038nrg201747 Published online 26 Jun 2017Synthetic lethality A synthetic lethal interaction occurs between two genes when a perturbation a mutation RNA interference knockdown or inhibition that affects either gene alone is viable but the perturbation of both genes simultaneously is

In [16]:
# LLM checks to write a SQL query

import replicate
import pandas as pd
import json
import os
import config
import time
import importlib
importlib.reload(config)
from config import config, reset_config
from dotenv import load_dotenv
load_dotenv()
from config import config
config.set_mode("rag")

folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME

def load_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

df = pd.read_excel(config.questions)
df.to_excel(config.q_original, index=False)

df['Question'] = df['Question'].str.strip()  # Removes leading/trailing whitespace

# Check for duplicate questions
duplicates = df.duplicated(subset=['Question'], keep=False)
if duplicates.any():
    print("Duplicates found. Removing duplicates.")
    df = df.drop_duplicates(subset=['Question'], keep='first')
    df.to_excel(config.questions, index=False)
else:
    print("No duplicates found.")

# DataFrame to store the results
results_df = pd.DataFrame(columns=['Model', 'Question', 'Rewritten Question', 'Response', 'Latency'])

models = {
    # "qwen-14b": "nomagick/qwen-14b-chat:f9e1ed25e2073f72ff9a3f46545d909b1078e674da543e791dec79218072ae70",
    # "falcon-40b": "joehoover/falcon-40b-instruct:7d58d6bddc53c23fa451c403b2b5373b1e0fa094e4e0d1b98c3d02931aa07173",
    "yi-34b": "01-ai/yi-34b-chat:914692bbe8a8e2b91a4e44203e70d170c9c5ccc1359b283c84b0ec8d47819a46",
    "mistral-7b": "mistralai/mistral-7b-instruct-v0.2:f5701ad84de5715051cb99d550539719f8a7fbcf65e0e62a3d1eb3f94720764e",
    "llama2-70b": "meta/llama-2-70b-chat",
    # "openhermes2": "antoinelyset/openhermes-2.5-mistral-7b:d7ccd25700fb11c1787c25b580ac8d715d2b677202fe54b77f9b4a1eb7d73e2b",
    "mixtral-instruct": "mistralai/mixtral-8x7b-instruct-v0.1:2b56576fcfbe32fa0526897d8385dd3fb3d36ba6fd0dbe033c72886b81ade93e",
    # "deepseek_33bq": "kcaverly/deepseek-coder-33b-instruct-gguf:ea964345066a8868e43aca432f314822660b72e29cab6b4b904b779014fe58fd",
    }

REWRITE_QUESTION = "You are a question creator, to perform semantic search over several academic papers to get the best answer to the given query. As the first step, please write one very clear QUESTION to answer the following question. It should be succinct."
SUMMARISE_ANSWER = "Summarise the given information into a succinct response in an easy to read format, eg bullet points."

prompt_for_qwen="""<|im_start|>system\n {INSTRUCTION}. {REWRITE_QUESTION} <|im_end|>\n<|im_start|>user\n {question}<|im_end|>\n<|im_start|>assistant\n"""
prompt_for_hermes = """[
{{
  "role": "system",
  "content": "{INSTRUCTION}. {REWRITE_QUESTION}" 
}},
{{
  "role": "user",
  "content": {question}
}}
]"""

# Iterate through each model
for model_key, model_value in models.items():
    responses = []

    for index, row in df.iterrows():
        qn = row['Question']
        question = json.dumps(qn)

        if model_key == "yi-34b":  # Yi model
            prompt = prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=REWRITE_QUESTION, question=question)
        if model_key == "qwen-14b":  # Qwen model
            prompt = prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=REWRITE_QUESTION, question=question)
        elif model_key == "openhermes2":  # Hermes model
            prompt = prompt_for_hermes.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=REWRITE_QUESTION, question=question)
        else:
            prompt = f"{INSTRUCTION}. {REWRITE_QUESTION}. {question}."

        start_time = time.time()  # Record the start time

        # USING THE QUESTION IN DOC, COME UP WITH A GOOD QUERY
        try:
            print(prompt)
            output = replicate.run(
                model_value,
                input={
                  "debug": False,
                #   "top_k": 50,
                  "top_p": 0.9,
                  "prompt": prompt,
                  "temperature": 0.7,
                  "max_new_tokens": 500,
                  "min_new_tokens": -1
                }
            )
            rewritten_qn = ""
            for item in output:
                item_str = str(item)#.strip()  # Convert item to string
                rewritten_qn += item_str # if len(item_str) == 1 else f" {item_str}"

            # response = response.strip()

        except Exception as e:
            rewritten_qn = f"Error: {e}"

        query_embedding = get_embedding(rewritten_qn)
        result = search_index(query_embedding, faiss_index, id_map,3)

        # USING THE QUESTION CREATED, NOW EMBED IT AND SUMMARISE THE ANSWER
        if model_key == "yi-34b":  # Yi model
            prompt = prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=SUMMARISE_ANSWER, question=result)
        if model_key == "qwen-14b":  # Qwen model
            prompt = prompt_for_qwen.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=SUMMARISE_ANSWER, question=result)
        elif model_key == "openhermes2":  # Hermes model
            prompt = prompt_for_hermes.format(INSTRUCTION=INSTRUCTION, REWRITE_QUESTION=SUMMARISE_ANSWER, question=result)
        else:
            prompt = f"{INSTRUCTION}. {SUMMARISE_ANSWER}. Answer as asked:  {result}."

        try:
            print(f"Rewritten question is: {rewritten_qn}")
            output = replicate.run(
                model_value,
                input={
                  "debug": False,
                #   "top_k": 50,
                  "top_p": 0.9,
                  "prompt": prompt,
                  "temperature": 0.1,
                  "max_new_tokens": 500,
                  "min_new_tokens": -1
                }
            )
            response = ""
            for item in output:
                item_str = str(item)#.strip()  # Convert item to string
                response += item_str # if len(item_str) == 1 else f" {item_str}"

            # response = response.strip()
            print(f"Response is: {response}")

        except Exception as e:
            response = f"Error: {e}"

        end_time = time.time()  # Record the end time
        latency = end_time - start_time  # Calculate latency

        new_row = pd.DataFrame({'Model': [model_key], 'Question': [qn],'Rewritten Question': [rewritten_qn], 'Response': [response], 'Latency': [latency]})
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        if index % 20 == 0:  # Save every 10 questions, adjust as needed
            results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')

results_df.to_excel(config.llmresults_file_path, index=False, sheet_name='Sheet1')

No duplicates found.
You are an exceptional computational biologist and genomics expert and know everything about drug discovery.. You are a question creator, to perform semantic search over several academic papers to get the best answer to the given query. As the first step, please write one very clear QUESTION to answer the following question. It should be succinct.. "Define synthetic lethality".
Rewritten question is: Question: How does synthetic lethality arise from genetic interactions between two or more genes?
Response is: - Synthetic lethality refers to a situation where simultaneous perturbations (such as mutations or inhibition) of two genes leads to cell death, while individual perturbation of each gene does not.
- This phenomenon can be exploited in cancer treatment by targeting tumor-specific mutations or altered gene expressions.
- Next-generation sequencing technology advancements enable the identification of numerous such mutations and alterations.
- Computational pipel

In [19]:
# GPT-4 writes a SQL query

import pandas as pd
import json
import openai
import requests
from openai import OpenAI
import time
from dotenv import load_dotenv
load_dotenv()
import os
from config import config
config.set_mode("rag")

# with open('configfile.json', 'r') as config_file:
#     config = json.load(config_file)

folder_path = 'files'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

INSTRUCTION = config.INSTRUCTION
F_NAME = config.F_NAME
GPT_MODEL = config.GPT_MODEL
INPUT_CSV_PATH = config.questions
OUTPUT_CSV_PATH = config.gpt4results_csv_path
REWRITE_QUESTION = "You are a question creator, to perform semantic search over several academic papers to get the best answer to the given query. As the first step, please write one very clear QUESTION to answer the following question. It should be succinct."
SUMMARISE_ANSWER = "Summarise the given information into a succinct response in an easy to read format, eg bullet points."

client = OpenAI()
def show_json(obj):
    print(json.loads(obj.model_dump_json()))

assistant = client.beta.assistants.create(
    name=f"{F_NAME} AI Evaluator from RAG over documents",
    instructions=INSTRUCTION,
    model=GPT_MODEL,
)
show_json(assistant)

# Utility functions
def read_csv(file_path):
    return pd.read_excel(file_path)

def process_data_for_gpt(data):
    prompts = []
    for _, row in data.iterrows():
        question = row['Question']
        prompt = f"{REWRITE_QUESTION}: {question}."
        prompts.append(prompt)
    return prompts

def submit_message_and_create_questions(assistant_id, prompt):
    thread = client.beta.threads.create() # If you replace this globally it appends all answers to the one before.
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=f"{REWRITE_QUESTION}: {prompt}")
    return client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id), thread

def submit_message_and_create_run(assistant_id, prompt):
    thread = client.beta.threads.create() # If you replace this globally it appends all answers to the one before.
    query_embedding = get_embedding(prompt)
    result = search_index(query_embedding, faiss_index, id_map,3)
    client.beta.threads.messages.create(thread_id=thread.id, role="user", content=f"{SUMMARISE_ANSWER}: {prompt}")
    return client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant_id), thread

def wait_on_run_and_get_response(run, thread):
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)
    messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
    return [m.content[0].text.value for m in messages if m.role == 'assistant']

def create_output_csv(data, responses, latencies, model_name, interim_csv_path):
    new_rows = []
    for question, response, latency in zip(data['Question'], responses, latencies):
        new_rows.append({'Model': model_name, 'Question': question, 'Rewritten Question': [rewritten_qn], 'Response': response, 'Latency': latency})
    new_data = pd.DataFrame(new_rows)
    new_data.to_excel(interim_csv_path, index=False)

data = read_csv(INPUT_CSV_PATH)
prompts = process_data_for_gpt(data)
ASSISTANT_ID = assistant.id

responses = []
latencies = []  # Initialize a list to store latencies

for prompt in prompts:
    start_time = time.time()  # Capture start time
    run1, thread1 = submit_message_and_create_questions(ASSISTANT_ID, prompt)
    response1 = wait_on_run_and_get_response(run1, thread1)
    run, thread = submit_message_and_create_run(ASSISTANT_ID, response1)
    response = wait_on_run_and_get_response(run, thread)
    if isinstance(response, list):
        response = ' '.join(map(str, response))
    response = response.replace("\\\\n", "\\n")
    response = response.strip()
    print(response)
    responses.append(response)

    end_time = time.time()  # Capture end time
    latency = end_time - start_time  # Calculate latency
    latencies.append(latency)  # Store latency

create_output_csv(data, responses, latencies, GPT_MODEL, OUTPUT_CSV_PATH)

{'id': 'asst_8ka84OMDNpnMxEGJoC726GSl', 'created_at': 1707774177, 'description': None, 'file_ids': [], 'instructions': 'You are an exceptional computational biologist and genomics expert and know everything about drug discovery.', 'metadata': {}, 'model': 'gpt-3.5-turbo-1106', 'name': 'galen AI Evaluator from RAG over documents', 'object': 'assistant', 'tools': []}
- Synthetic lethality is a concept in molecular biology and genetics where the combination of two non-lethal mutations leads to cell death.
- It is a potential target for drug discovery, where targeting a gene that is synthetically lethal with a cancer-specific mutation can selectively kill cancer cells.
- Understanding synthetic lethality can help in developing targeted cancer therapies.
- Synthetic lethality was first discovered in the model organism Saccharomyces cerevisiae, commonly known as baker's yeast.
- Genes that exhibit a synthetic lethal relationship with PARP gene expression when mutated are important for PARP i

In [26]:
import re
import pandas as pd
from difflib import SequenceMatcher
from config import config

config.set_mode('rag')

def clean_text(text):
    """
    Remove non-ASCII characters from the text.
    """
    return ''.join(char for char in text if char.isascii())

def create_combined_csv(original_csv_path, interim_csv_path, combined_csv_path):
    # Read the original and interim data
    original_data = pd.read_excel(original_csv_path) #, encoding='utf-8-sig'
    interim_data = pd.read_excel(interim_csv_path)

    # Combine the data
    combined_data = pd.concat([original_data, interim_data], ignore_index=True)

    # Save the combined data to a new CSV file
    combined_data.to_excel(combined_csv_path, index=False)

def merge_on_contains(big_df, small_df, big_col, small_col):
    # Lowercase and strip whitespace for more effective matching
    big_df[big_col] = big_df[big_col].str.lower().str.strip()
    small_df[small_col] = small_df[small_col].str.lower().str.strip()

    # Check if 'category' column exists in small_df
    if 'category' in small_df.columns:
        # Create a new column for the merged category in big_df
        big_df['category'] = ''

        # Iterate over the small dataframe and update the category in the big dataframe
        for _, row in small_df.iterrows():
            contains_mask = big_df[big_col].str.contains(row[small_col])
            big_df.loc[contains_mask, 'category'] = row['category']
    else:
        # Handle the case when 'category' column does not exist
        # For example, you can set a default category or leave it as it is
        big_df['category'] = 'default_category'  # or any other handling logic

    return big_df

# Load the files
questions_file_path = config.questions
llmresults_file_path = config.llmresults_file_path
gpt4results_csv_path = config.gpt4results_csv_path
results_file_path = config.results_file_path

create_combined_csv(llmresults_file_path, gpt4results_csv_path, results_file_path)

# Reading the files
questions_df = pd.read_excel(questions_file_path)
results_df = pd.read_excel(results_file_path)

# Ensure the total number of questions in results_grouped_by_model.xlsx is a multiple of the number in questions.xlsx
if len(results_df) % len(questions_df) != 0:
    print(len(results_df))
    print(len(questions_df))
    raise ValueError("The total number of questions in results_grouped_by_model.xlsx must be a multiple of the number in questions.xlsx.")

# Replace questions in results_grouped_df with those from questions_df
num_repetitions = len(results_df) // len(questions_df)
repeated_questions = pd.concat([questions_df['Question']] * num_repetitions, ignore_index=True)
results_df['Question'] = repeated_questions

# Save the modified DataFrame to a new Excel file
results_df.to_excel(results_file_path, index=False)  # Replace with your desired path

# Applying the merge_on_contains function
merged_df = merge_on_contains(results_df, questions_df, 'Question', 'Question')

pivoted_responses = merged_df.pivot_table(index=['Question'], columns='Model', values='Response', aggfunc=lambda x: ' | '.join(x))
pivoted_latencies = merged_df.pivot_table(index=['Question'], columns='Model', values='Latency', aggfunc='mean')

# Rename columns for clarity
pivoted_responses.columns = [f'{col} Response' for col in pivoted_responses.columns]
pivoted_latencies.columns = [f'{col} Latency' for col in pivoted_latencies.columns]

# Combine the pivoted DataFrames
pivoted_combined = pd.concat([pivoted_responses, pivoted_latencies], axis=1)

# Ensure the combined DataFrame maintains the original order of questions
pivoted_combined.reset_index(inplace=True)

# Save the combined DataFrame to an Excel file
pivoted_combined.to_excel(config.combined_file_path, index=False)


In [27]:
# Archive the intermediate files
import os
import glob
from config import config
directory = 'files/'
archive_directory = os.path.join(directory, '#Archive')

# Create the #Archive directory if it doesn't exist
if not os.path.exists(archive_directory):
    os.makedirs(archive_directory)

# List all files that start with F_NAME and exclude the specified files
files_to_move = [f for f in glob.glob(f"{directory}/{config.F_NAME}_*") 
                 if '_model_rankings' not in f and '_llmeval_results' not in f and 'questions' not in f and '_results_grouped_by_question_' not in f and '_allresults_grouped_by_model_' not in f]

# Move the files to the #Archive folder
for file in files_to_move:
    os.rename(file, os.path.join(archive_directory, os.path.basename(file)))
    print(f"Moved file: {file} to {archive_directory}")


Moved file: files/galen_results_grouped_by_model_rag.xlsx to files/#Archive
Moved file: files/galen_results_gpt4_rag.xlsx to files/#Archive
Moved file: files/galen_results_grouped_by_model_db.xlsx to files/#Archive
