In [3]:
# RAG with our files, simple example from Llama Index
from llama_index import VectorStoreIndex, SimpleDirectoryReader
import pandas as pd

documents = SimpleDirectoryReader("files/Papers_FullText/").load_data()
database = SimpleDirectoryReader("files/db/").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

file_path = 'files/questions_rag.xlsx'
questions_df = pd.read_excel(file_path, sheet_name='Sheet1')

responses = []
for index, row in questions_df.iterrows():
    question = row['Questions']
    print(f"Question {index + 1}: {row['Questions']}")
    response = query_engine.query(question)
    responses.append(response)
    print(f"Response is: {response}")

Question 1: Define synthetic lethality
Response is: Synthetic lethality refers to a genetic phenomenon where the simultaneous mutation of two genes leads to cell death, while mutation of either gene alone is compatible with cell viability. In other words, the combination of mutations in both genes is lethal, but each individual mutation is not. This concept has been observed in various organisms, including humans, and has implications in cancer research and therapy. It suggests that targeting the products of genes that are synthetic lethal to cancer-causing mutations could selectively kill cancer cells while sparing normal cells.
Question 2: Synthetic lethality was discovered in which model organism?
Response is: Synthetic lethality was discovered in the fruit fly, Drosophila melanogaster.
Question 3: PARP gene expression shows synthetic lethal relationship with mutations in which genes?
Response is: The context information does not provide specific information about which genes PARP g

In [2]:
# Create an index via OpenAI embeddings
import os
import re
import numpy as np
from PyPDF2 import PdfReader
import faiss
import nltk
from openai import OpenAI
from config import config

# Download NLTK punkt tokenizer models
nltk.download('punkt')
client = OpenAI()

def clean_text(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text).strip()

def get_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

def get_embedding(sentence):
    response = client.embeddings.create(
        input=sentence,
        model="text-embedding-3-small"
    )
    return np.array(response.data[0].embedding, dtype='float32')

def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def chunk_text(text, chunk_size=256):
    """
    Splits the text into smaller chunks, each with a maximum size of chunk_size tokens.
    """
    tokens = text.split()
    for i in range(0, len(tokens), chunk_size):
        yield ' '.join(tokens[i:i + chunk_size])

def process_text(text, faiss_index, id_map):
    for chunk in chunk_text(clean_text(text)):
        sentences = get_sentences(chunk)
        for sentence in sentences:
            embedding = get_embedding(sentence)
            idx = faiss_index.ntotal
            faiss_index.add(np.array([embedding]))
            id_map[idx] = sentence

def process_folder(folder_path, faiss_index, id_map):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith('.pdf'):
            text = read_pdf(file_path)
        elif file_name.endswith('.txt'):
            text = read_text_file(file_path)
        else:
            continue  # Skip other file formats
        process_text(text, faiss_index, id_map)

def save_faiss_index(faiss_index, file_name):
    faiss.write_index(faiss_index, file_name)

def load_faiss_index(file_name):
    return faiss.read_index(file_name)

def search_index(query_embedding, faiss_index, id_map, k):
    distances, indices = faiss_index.search(np.array([query_embedding]), k)
    return [(id_map[idx], distances[0][i]) for i, idx in enumerate(indices[0])]

# Initialize FAISS index and ID map
dimension = 1536  # Adjust based on your model's output
faiss_index = faiss.IndexFlatL2(dimension)
id_map = {}

# Process files and index embeddings
folder_path = 'files/Papers_FullText'
process_folder(folder_path, faiss_index, id_map)

# Save the index for later use
save_faiss_index(faiss_index, 'utils/faiss_index.idx')

[nltk_data] Downloading package punkt to /Users/rohit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Example usage: Load the index and search
faiss_index = load_faiss_index('utils/faiss_index.idx')
file_path = 'files/questions_rag.xlsx'
questions_df = pd.read_excel(file_path, sheet_name='Sheet1')

results = []
for index, row in questions_df.iterrows():
    question = row['Questions']
    print(f"Question {index + 1}: {row['Questions']}")
    query_embedding = get_embedding(question)
    result = search_index(query_embedding, faiss_index, id_map,3)
    print(result)
    results.append(result)
    print(f"Response is: {result}")


Question 1: Define synthetic lethality
[('to identify synthetic lethal inter actions For these reasons most largescale synthetic lethal genetic interaction screens have been carried out in budding yeast or fission yeast as technologies that facilitate the highthroughput generation and analysis of double mutants under defined laboratory conditions are readily available Advances in RNA interference RNAi and more recently CRISPR technology have now made it possible to carry out largescale unbiased synthetic lethality screening directly in human cell cultureMichael Smith Laboratories University of British Columbia 2185 East Mall Vancouver British Columbia V6T 1Z4 Canada Correspondence to P H hietermslubcca doi101038nrg201747 Published online 26 Jun 2017Synthetic lethality A synthetic lethal interaction occurs between two genes when a perturbation a mutation RNA interference knockdown or inhibition that affects either gene alone is viable but the perturbation of both genes simultaneously is