In [3]:
import os
import requests

from openai import OpenAI
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load settings from a local .env file into the process environment.
# NOTE(review): presumably this provides the OpenAI credentials needed by
# OpenAIEmbeddings() and OpenAI() further down — confirm.
load_dotenv()

True

## Load the data from the text file

In [4]:
# Relative path of the plain-text study that is handed to TextLoader.
data_file = "data/study_1.txt"

In [5]:

# Step 1: Load the raw text document using TextLoader.
# The encoding is set explicitly so that decoding does not depend on the
# platform's default encoding. The study text appears to contain non-ASCII
# characters (e.g. the "−" in genotype names).
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
loader = TextLoader(data_file, encoding="utf-8")
loaded_doc = loader.load()

In [6]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 800

# Build the recursive splitter from the parameters above.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Break the loaded document into overlapping chunks.
splits = text_splitter.split_documents(loaded_doc)

In [7]:
# Build a FAISS vector store from the chunks, embedding them with
# OpenAIEmbeddings; `db` is what the similarity search is run against.
# NOTE(review): presumably this calls the OpenAI embeddings API and therefore
# needs valid credentials — confirm.
db = FAISS.from_documents(splits, OpenAIEmbeddings())

## Set up GPT-4o

In [8]:
# Module-level OpenAI client used by get_completion. It is created without
# explicit arguments.
# NOTE(review): presumably it picks up its API key from the environment — confirm.
client = OpenAI()

def get_completion(message, model="gpt-4o", temperature=0):
    """Send one user message to the chat model and print the reply.

    Args:
        message: Text sent as the content of a single "user" chat message.
        model: Name of the chat model to query. Defaults to "gpt-4o".
        temperature: Sampling temperature forwarded to the API. Defaults to 0.

    Returns:
        None. The content of the first returned choice is printed to stdout.
    """
    # `client` is the module-level OpenAI client.
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": message}],
        model=model,
        temperature=temperature,
    )

    # Only the first choice is shown.
    print(chat_completion.choices[0].message.content)

## Search the document

In [22]:
# Maximum number of chunks to retrieve for the prompt context.
n_chunks = 6

query = """
Extract all the modifications / treatments vs. no modification / no treatment and point out the change in median and maximum lifespan for each modification / treatment.
"""
docs = db.similarity_search(query, k=n_chunks)

# Length of the top-ranked chunk only (not of the whole context).
print(len(docs[0].page_content))

# Join whatever was actually retrieved, separated by blank lines. Iterating
# over `docs` instead of indexing up to n_chunks avoids an IndexError if
# fewer than n_chunks documents come back.
prompt_context = "\n\n".join(doc.page_content for doc in docs[:n_chunks])

1392


In [46]:
# Instruction part of the prompt. The retrieved chunks are inserted between
# the ``` fences through the {prompt_context} placeholder.
prompt_template = r"""In the following, separated by ```, you will find parts of a study on longevity. 
Extract all the modifications / treatments vs. no modification / no treatment / control group and point out the change in median and maximum lifespan for 
each modification / treatment. 
If you don't know the answer, then say that you don't know.

```
{prompt_context}
```
""".format(prompt_context=prompt_context)


# Output schema requested from the model. This string is appended as-is and
# is never passed through str.format, so its literal braces are safe.
# The median_* keys are spelled consistently with the maximum_* keys.
formatting = """

Return the result in JSON format without any further comments. Structure the JSON in the following way: 
{
    treatment_1: { 
        median_lifespan_no_treatment: x_median, 
        median_lifespan_treatment: y_median, 
        maximum_lifespan_no_treatment: x_max, 
        maximum_lifespan_treatment: y_max 
    }, 
    treatment_2: { 
        median_lifespan_no_treatment: x_median, 
        median_lifespan_treatment: y_median, 
        maximum_lifespan_no_treatment: x_max, 
        maximum_lifespan_treatment: y_max 
    } 
}
Replace treatment_1, treatment_2, etc. by the real treatment or intervention."""


# Final prompt: instructions and context followed by the formatting rules.
prompt_template = prompt_template + formatting

In [47]:
# Send the assembled prompt to the model; get_completion prints the reply.
get_completion(prompt_template)

```json
{
    "ENU-treated Apcdel/+": {
        "median_lifespan_no_treatment": 255,
        "median_lifespan_treatment": 117,
        "maximum_lifespan_no_treatment": "unknown",
        "maximum_lifespan_treatment": "unknown"
    },
    "ENU-treated Egr1+/−, Apc del/+": {
        "median_lifespan_no_treatment": 179,
        "median_lifespan_treatment": 114,
        "maximum_lifespan_no_treatment": "unknown",
        "maximum_lifespan_treatment": "unknown"
    },
    "ENU-treated Tp53+/−, Apc del/+": {
        "median_lifespan_no_treatment": 144,
        "median_lifespan_treatment": 101,
        "maximum_lifespan_no_treatment": "unknown",
        "maximum_lifespan_treatment": "unknown"
    },
    "ENU-treated triple heterozygous (Egr1+/−, Tp53+/−, Apc del/+)": {
        "median_lifespan_no_treatment": 178,
        "median_lifespan_treatment": 97,
        "maximum_lifespan_no_treatment": "unknown",
        "maximum_lifespan_treatment": "unknown"
    },
    "ENU-treated Apcdel/+ recipien