In [2]:
import kor
import spacy
import pandas as pd
import en_core_sci_md
from langchain.llms import OpenAI
import numpy as np
import os
import tiktoken
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from kor import create_extraction_chain
# from langchain.document_loaders.csv_loader import CSVLoader

from negspacy.negation import Negex
from negspacy.termsets import termset

#Define the OpenAI API key
aiKey = "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"
os.environ["OPENAI_API_KEY"] = aiKey


#Use langchain openai model
llm = OpenAI(openai_api_key=aiKey, model="text-davinci-003", temperature=0.1) # type: ignore
nlp = spacy.load("en_core_sci_md" )
ts = termset("en_clinical")

nlp.add_pipe("entity_linker")
nlp.add_pipe(
    "negex",
    config={
        "neg_termset":ts.get_patterns()
    }
)

csvPath = r"C:\Users\jreno\Documents\Projects\Mycoach Health\NOTEEVENTS\NOTEEVENTS.csv"
# loader = CSVLoader(csvPath)

'''data = loader.load()
index_creator = VectorstoreIndexCreator()
docsearch = index_creator.from_loaders([loader])
'''


'data = loader.load()\nindex_creator = VectorstoreIndexCreator()\ndocsearch = index_creator.from_loaders([loader])\n'

In [3]:
import pydantic
from pydantic import Field, BaseModel
from typing import Optional
from kor import Object, Text, Number
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain_pydantic
from langchain.prompts import ChatPromptTemplate

#Format below given clinicalAbbr as key value pairs between abbr and description
abbr = { "CSF": "cerebrospinal fluid", 
                "CSU": "catheter stream urine sample",
                "CT scan": "computerised tomography scan",
                "CVP": "central venous pressure", 
                "CXR": "chest X-ray",
                "DNACPR": "do not attempt cardiopulmonary resuscitation",
                "DNAR": "do not attempt resuscitation",
                "DNR": "do not resuscitate", "Dr": "doctor",
                "DVT": "deep vein thrombosis",
                "Dx": "diagnosis",
                "ECG": "electrocardiogram", 
                "ED": "emergency department"
            }

class patient(pydantic.BaseModel):
    patientName: str = Field(
        description="The name of the patient, First Name and Last Name concatenated",
        examples = "[**First Name (NamePattern)**]"
    )
    diagnosisData: str = Field(
        description="The summary of the diagnosis of the patient or the final diagnosis of the patient",
    )
    time : Optional[str] = Field(
        description="The time of the diagnosis of the patient",
    )
    clinicalAbbr : Optional[str] = Field(
        description="The clinical abbreviations present in the diagnosis of the patient",
        #Write an example for the same including a clinical abbreviation for the model to detect
        examples = abbr
    )
    medsData : Optional[str] = Field(
        description="The medications of the patient prescribed or administered to the patient, might have abbreviations",
        examples = ["Paracetamol 75mcg p.o." , "Aspirin 81mg p.i. q.d."]
    )
    dischargeData : Optional[str] = Field(
        description="The summary of the discharge of the patient, including the medical status of the patient",
        example= "The patient was able to oxygenate on room air at 93% at the time of discharge."
    )
    medicalScans : Optional[list] = Field(
        description="The summary of the medical scans of the patient, including but not limited to CT Scans, MRIs and X-Ray Scans",
        example = [""]
    )
    additionalInfo : Optional[str] = Field(
        description="Any additional information about the patient, including but not limited to the patient's medical history, allergies, etc.",
        example = "The patient has had a history of asthma."
    )
    

#Printing the patient's name and diagnosis


In [5]:
#Define the prompting template for the extraction and summarisation of patient data\
from langchain.prompts import ChatPromptTemplate, Prompt, PromptTemplate, MessagesPlaceholder
from langchain.tools import PubmedQueryRun


#Import the libraries whcih ahvent been imported but are being invoked
from langchain.tools.base import ToolException

from langchain.agents import AgentType, initialize_agent, AgentExecutor
from langchain.memory import ConversationBufferMemory, ChatMessageHistory
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool

from langchain.chat_models import ChatOpenAI
from typing import Dict, Tuple

from langchain.chains import RetrievalQA
import openai

modelName = "text-embedding-ada-002"
embed = OpenAIEmbeddings(
            client=openai,
            openai_api_key=aiKey
        )

class LLMConfig():
    model = "text-davinci-003"
    llm = llm

def setup_memory() -> Tuple[Dict, ConversationBufferMemory]:
    """
    Sets up memory for the open ai functions agent.
    :return a tuple with the agent keyword pairs and the conversation memory.
    """
    agent_kwargs = {
        "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
    }
    memory = ConversationBufferMemory(memory_key="memory", return_messages=True)

    return agent_kwargs, memory

def setupAgent() -> AgentExecutor :
    pubmed = medTool
    cfg = LLMConfig()
    tools = [
        Tool.from_function(
            func=pubmed.run, 
            name='PubMed',
            description='Useful tool for querying medical publications'
        )
    ]
    agent_kwargs, memory = setup_memory()
    
    return initialize_agent(
        tools, 
        llm, 
        agent=AgentType.OPENAI_FUNCTIONS, 
        verbose=False, 
        agent_kwargs=agent_kwargs,
        memory=memory
    )

#Tokenizer len function
'''def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)'''




#Define the prompt template for the extraction and summarisation of patient data
prompt_template = '''
                    Imagine you are a medical professional with years of experience in various diseases, their clinical terminologies, how they worsen or improve over time and an expert in pharmaceuticals and medication.
                    You have access to tools such as PubMedQuery which will help you connect to Pubmed and retrieve relevant clinical references according to the patient diagnosis. Also, you have access to the context and the structured data of the patient's clinical
                    records. You have been asked to summarise the patient's condition, diagnosis, medication, relevant procedures and try to augment the data at hand to get useful insights and a better understanding.
                    Both the data variables are JSON objects which are fed to you (might be embedded or not), make sure you interpret them correctly.
                    
                    Context: This is the given context for the aprticular patient data being taught to you. {contextData}
                    Patient Data: This is the given patient data for you to handle and infer from based on the context, step-by-step semantic analysis. {patientData}
                    
                    Break down the information you have step-by-step and summarise it.
                '''
   
medTool = PubmedQueryRun()
prompt = PromptTemplate(template=prompt_template, input_variables=["contextData", "patientData"])

In [10]:
#Parse nlp pipe through the csv doc loader
from langchain.text_splitter import SpacyTextSplitter
from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.mapreduce import MapReduceChain

#Define the text splitter
tokenizer = tiktoken.get_encoding('p50k_base')
textSplitter = SpacyTextSplitter(pipeline="en_core_sci_md")


df = pd.read_csv(csvPath, nrows=1000)
df = df.TEXT.to_list()

chain = create_extraction_chain_pydantic(patient, llm)
embedder = SpacyEmbeddings(nlp=en_core_sci_md.load())
summaryChain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_results=True, map_prompt=prompt, combine_prompt=prompt)


count, lim = 0, 10
overallData = [] #List of Objects

for item in df:
    #Pass the items through nlp pipeline 
    if count == lim: break
    
    count += 1
    splitText = textSplitter.split_text(item)
    embeddings = embedder.embed_documents(splitText)
    
    # contextData = nlp.pipe(str(item), n_process=5) pipe or regular nlp parse ? Less time consuming but iterator object handling
    contextData = nlp(str(item))
    patientData = chain.run(embeddings)
    
    # embeddings = embedder.embed_documents(splitText)
    contextData = contextData.to_json()
    singlePatient = {**patientData, **contextData} # type: ignore
    overallData = []    
    summaryPatient = summaryChain.run(contextData, patientData)
    
    #Can embed queries and retrieve from an in-memory store but need storage after this for long term persistence

ValidationError: 1 validation error for StuffDocumentsChain
__root__
  document_variable_name text was not found in llm_chain input_variables: ['contextData', 'patientData'] (type=value_error)