In [10]:
# RAG with our files, simple example from Llama Index
from llama_index import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("files/Papers_FullText/").load_data()
database = SimpleDirectoryReader("files/db/").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("How does synethetic lethality result from pseudokinases and impact on oncogenes?")
print(response)

Synthetic lethality can result from the interaction between pseudokinases and oncogenes. Pseudokinases are a type of protein that resemble kinases but lack the ability to catalyze phosphorylation reactions. In the context of cancer, pseudokinases can interact with oncogenes to create a synthetic lethal relationship. This means that the presence of mutations in both the pseudokinase and the oncogene is lethal to the cancer cells, while mutations in either one alone are compatible with cell viability. This interaction can be exploited for therapeutic purposes, as targeting the pseudokinase or the oncogene alone may not be effective, but targeting both can selectively kill cancer cells.


In [3]:
# Create an index via OpenAI embeddings
import os
import re
import numpy as np
from PyPDF2 import PdfReader
import faiss
import nltk
from openai import OpenAI

# Download NLTK punkt tokenizer models
nltk.download('punkt')
client = OpenAI()

def clean_text(text):
    return re.sub(r'[^A-Za-z0-9\s]', '', text).strip()

def get_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

def get_embedding(sentence):
    response = client.embeddings.create(
        input=sentence,
        model="text-embedding-3-small"
    )
    return np.array(response.data[0].embedding, dtype='float32')

def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def chunk_text(text, chunk_size=256):
    """
    Splits the text into smaller chunks, each with a maximum size of chunk_size tokens.
    """
    tokens = text.split()
    for i in range(0, len(tokens), chunk_size):
        yield ' '.join(tokens[i:i + chunk_size])

def process_text(text, faiss_index, id_map):
    for chunk in chunk_text(clean_text(text)):
        sentences = get_sentences(chunk)
        for sentence in sentences:
            embedding = get_embedding(sentence)
            idx = faiss_index.ntotal
            faiss_index.add(np.array([embedding]))
            id_map[idx] = sentence

def process_folder(folder_path, faiss_index, id_map):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith('.pdf'):
            text = read_pdf(file_path)
        elif file_name.endswith('.txt'):
            text = read_text_file(file_path)
        else:
            continue  # Skip other file formats
        process_text(text, faiss_index, id_map)

def save_faiss_index(faiss_index, file_name):
    faiss.write_index(faiss_index, file_name)

def load_faiss_index(file_name):
    return faiss.read_index(file_name)

def search_index(query_embedding, faiss_index, id_map, k):
    distances, indices = faiss_index.search(np.array([query_embedding]), k)
    return [(id_map[idx], distances[0][i]) for i, idx in enumerate(indices[0])]

# Initialize FAISS index and ID map
dimension = 1536  # Adjust based on your model's output
faiss_index = faiss.IndexFlatL2(dimension)
id_map = {}

# Process files and index embeddings
folder_path = 'files/Papers_FullText'
process_folder(folder_path, faiss_index, id_map)

# Save the index for later use
save_faiss_index(faiss_index, 'faiss_index.idx')

[nltk_data] Downloading package punkt to /Users/rohit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Example usage: Load the index and search
faiss_index = load_faiss_index('faiss_index.idx')

query_sentence = "How does synethetic lethality result from pseudokinases and impact on oncogenes?"
query_embedding = get_embedding(query_sentence)
results = search_index(query_embedding, faiss_index, id_map,3)
print(results)


[('and siRNA sensitivity requires careful interpretation 43 as many inhibitors36Genetic and cellular mechanisms of oncogenesis Current Opinion in Genetics Development 2011 213441 wwwsciencedirectcomSearching for synthetic lethality in cancer Brough et al 37 Figure 2 Synthetic lethality by functional compensation aThe EGFR pathway is often active in cancer One of the downstream consequences of EGFR stimulation is the activation of AKT which drives cellular proliferation bEGFR inhibitor gefitinib sensitive cancers are unable to activate AKT and as a result die when treated with the inhibitor cIn basallike breast cancer the notch pathway is hyperactivated Notch activation compensates or buffers for EGFR inhibition by maintaining the levels of activated AKT dDual inhibition of EGFR and Notch by inhibiting the essential Notch component gsecretase results in synthetic lethality since the levels of activated AKT are depleted eA potential route for predicting further synthetic lethal interacti