In [1]:
# Extractors
import regex as re

# Fuzzy matching allows for 1 typo


def extract_medical_record(text):
    """Extracts the medical record number from a given text using regex."""
    pattern = r"(?:Medical\s*Record|Medcal\s*Record|Medicl\s*Record|MR){e<=1}\s*#\s*:\s*(\S+)"
    match = re.search(pattern, text, re.IGNORECASE)

    if match:
        # Check which group matched (1 or 2)
        if match.group(1):
            return match.group(1).strip()
        else:
            return match.group(2).strip()
    else:
        return None  # Or return "NA" if you prefer
   
def extract_name(text):
    """Extracts patient name from a given text using regex."""
    pattern = r"(?:Patient){e<=1}:\s*([^,]+)"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return None  # Or return "Unknown" or "NA"

def extract_age(text):
    """Extracts age from a given text using regex patterns."""
    age_patterns = [
        r"(\d+)-(?:year|yeer|yer|yar){e<=1}-old",   # Allow 1 typo in "year"
        r"(?:age|ag|aeg|gae){e<=1}\s*(\d+)",        # Allow 1 typo in "age"
        r"(\d+)\s*(?:years?|yeers?|yrs?|yars?){e<=1}\s*old",  # Allow 1 typo in "years", "yrs"
        r"(\d+)\s*(?:years?|yeers?|yrs?|yars?){e<=1}"         # Allow 1 typo in "years", "yrs"
    ]

    age = None

    for pattern in age_patterns:
        match = re.search(pattern, text, re.IGNORECASE) #added re.IGNORECASE to handle variations in case
        if match:
            try:
                age = int(match.group(1))
                break  # Stop after finding the first match
            except ValueError:
                # Handle cases where the extracted value is not a valid number
                pass
    return age

def extract_gender(text):
    """Extracts gender from a given text using regex."""
    pattern = r"\b(male|female){e<=1}\b"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).capitalize()  # Capitalize and return
    else:
        return "Unknown"  # Or return None if you prefer



In [5]:
#Extract

import pandas as pd
import numpy as np
from IPython.display import display

# Load the data
df = pd.read_excel("notes.xlsx") 

# Extract
df['MR']       = df['note'].apply(extract_medical_record)
df["Name"]     = df["note"].apply(extract_name)
df["Gender"]   = df["note"].apply(extract_gender)
df['Age']      = df['note'].apply(extract_age)

# Cleanup the note on the assumption \n are in the right place (this shortens the prompt)
df['note'] = df['note'].str.split("\n", expand=True, n=2).fillna("")[2]

# Note, remove new lines
df['note'] = df['note'].str.replace("\n", " ")

# Sort the coulmns
df = df[["MR","Name", "Age", "Gender", "note"]]

display(df.head(3))

df.to_csv("checkpoint1.csv", index=True)

Unnamed: 0,MR,Name,Age,Gender,note
0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...
1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...
2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...


In [25]:
# Define the schema for the medical record JSON

json_schema = [
    ["disease", "The disease the patient is suffering from"],
    ["symptoms", "The symptoms the patient is experiencing"],
    ["lab_results", "The results of the lab tests"],
    ["current_medication", "The medication the patient is currently taking"],
    ["current_dosage", "The dosage of the current medication"],
    ["current_frequency", "The frequency of  current medication"],
    ["prescribed_medication", "The medication prescribed"],
    ["prescribed_dosage", "The dosage of the prescribed medication"],
    ["prescribed_frequency", "The frequency of the prescribed medication"],
]

fields= []
decritpion = []
for x in json_schema:
    fields.append(x[0])
    decritpion.append(x[1])



In [3]:
# Check if you can run the model locally
import torch
import os
import dotenv

dotenv.load_dotenv()

%env TRY_LOCAL=True

if os.getenv("TRY_LOCAL") != "True":
    print("If you want to try to run your LLM locally, set TRY_LOCAL to True.")
    print("Let's use ChatGPT")
    %env MODEL=ChatGPT
elif torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    gpu_id = num_gpus - 1 # Arbitrarity run the model on the last GPU, presumably video is on card 0
    %env GPU_ID=$gpu_id
    total_memory = torch.cuda.get_device_properties(gpu_id).total_memory 
    print(f"Total Memory: {total_memory /(1024**3):.1f} GB")
    if total_memory/(1024**3) > 10:
        print("We can run the model locally on gpu ", gpu_id)
        %env MODEL=Local
    else:
        print("Not enough memory to run the model")
        %env MODEL=ChatGPT
else:
    print("No GPU available")
    print("Let's use ChatGPT")
    %env MODEL=ChatGPT


env: TRY_LOCAL=True
env: GPU_ID=1
Total Memory: 23.7 GB
We can run the model locally on gpu  1
env: MODEL=Local


In [6]:
# Load LLM on local GPU

from huggingface_hub import HfApi

from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig,StoppingCriteria, StoppingCriteriaList, pipeline
import torch

import os
import dotenv

dotenv.load_dotenv()

if os.environ.get("MODEL", "") == 'Local':
        
    model_id = "Qwen/Qwen2.5-14B-Instruct"

    device_s = "cuda:" + os.environ.get("GPU_ID", "")
    device = device_s if torch.cuda.is_available() else "cpu"

    print("Loading " + model_id + " on " + device)  

    directory = "/mnt/hd1"
    if os.path.isdir(directory):
        print("Using custom cache directory " + directory)
        custom_cache_dir = "/mnt/hd1"
    else:
        custom_cache_dir = "/tmp"

    #Hugging face API KEY, check it's there and it is valid
    HF_API_KEY = os.getenv("HF_API_KEY")
    token = HF_API_KEY

    if token is None:
        print("No Huggingface API key found")
    else:
        try:
            api = HfApi(token=token)
            user_info = api.whoami()
            print(f"Huggingface authenticated as: {user_info['name']} ")
        except Exception as e:
            print(f"Invalid API key: {e}")

        # Configure bitsandbytes for 4-bit quantization
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",            # "nf4" is a popular choice for quantization
            bnb_4bit_use_double_quant=True,         # enables double quantization for improved accuracy
            bnb_4bit_compute_dtype=torch.bfloat16   # set compute dtype to bfloat16
        )

        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

        class StopOnPeriod(StoppingCriteria):
            def __call__(self, input_ids, scores, **kwargs):
                # Stop if a period followed by a space appears
                return tokenizer.decode(input_ids[0]).endswith(". ")

        stopping_criteria = StoppingCriteriaList([StopOnPeriod()])

        # Load the model with 4-bit quantization
        my_local_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        cache_dir=custom_cache_dir,
        quantization_config=quant_config,      
        device_map=device,              
        torch_dtype=torch.bfloat16            
        )
else:
    print("This section is disabled, as we are using ChatGPT")

Loading Qwen/Qwen2.5-14B-Instruct on cuda:1
Using custom cache directory /mnt/hd1
Huggingface authenticated as: lookino 


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
from langchain.llms import HuggingFacePipeline  # For local Hugging Face models
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

df_local = pd.read_csv("checkpoint1.csv")

pipe = pipeline("text-generation", model=my_local_model, stopping_criteria=stopping_criteria, tokenizer=tokenizer,generation_kwargs=
                {
                    "temperature": 0.3,  
                    "max_new_tokens": 512,  
                },)

# Initialize LangChain LLM with HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe,pipeline_kwargs={"max_new_tokens": 512},)

# Chat Prompt Template
system_message_template = "you are a medical assistant classifying medical notes. List all the medications in the note in the prompt, only output them in JSON."
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message_template)

human_message_template = "Medical Note: {note}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# Create LLMChain
medication_chain = LLMChain(llm=llm, prompt=chat_prompt)

def extract_notes(note):
    medications = medication_chain.run(note=note).strip()
    print("Raw LLM Output:", medications)  # Print raw output
    return medications
        

# Apply the extraction function to the DataFrame
df['medications'] = df['note_clean'].apply(extract_notes)

display(df.head())



NameError: name 'my_local_model' is not defined

In [None]:
#Chat GPT

from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI

import openai
import os
import dotenv
import json

df_gpt = pd.read_csv("checkpoint1.csv")

dotenv.load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORG_ID = os.getenv("OPENAI_ORG_ID")   

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.3)

# Load schema for the LLM output
response_schemas = []

for i in json_schema:
    response_schemas.append(ResponseSchema(name=i[0], description=i[1]))

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Prompt Template
prompt_template = """
Extract the following information from the text and return it as a JSON object:

Text: {text}

{format_instructions}

JSON:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

# Create Runnable Sequence
note_chain = prompt | llm | output_parser

def extract_notes(note):
    try:
        json_notes = note_chain.invoke({"text": note})
        # print("Extracted Info:", json_notes)
        return json_notes
    except Exception as e:
        print(f"Error extracting information: {e}")
        return None

# Apply the extraction function to the DataFrame
df_gpt['json_notes'] = json.dumps(df['note'].apply(extract_notes))

df_gpt.to_csv("checkpoint_chatGPT.csv", index=True)
display(df_gpt.head())

Unnamed: 0.1,Unnamed: 0,MR,Name,Age,Gender,note,json_notes
0,0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...,"{'disease': 'Type 2 Diabetes (T2D), hypertensi..."
1,1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...,"{'disease': 'Type 2 Diabetes', 'symptoms': 'Pe..."
2,2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...,{'disease': 'Hypertension and Type 2 Diabetes'...
3,3,3344,Sarah Johnson,50,female,"Complaint: Frequent urination, increased thirs...","{'disease': 'Type 2 Diabetes', 'symptoms': 'Fr..."
4,4,2211,Carlos Ramirez,55,male,Presenting with ongoing fatigue and left knee ...,"{'disease': 'Type 2 Diabetes, Hypertension', '..."


In [30]:
import json

df_gpt = pd.read_csv("checkpoint_chatGPT.csv")

try:
    df_gpt.drop(columns=fields, inplace=True)
except KeyError:
    pass

display(df_gpt.head())

print(fields)
print(df_gpt["json_notes"][0])  

# Function to parse JSON and extract values
def extract_json(json_string):
    empties = {field: None for field in fields}
    try:
        pds = pd.Series("{'disease': 'Type 2 Diabetes (T2D), hypertension', 'symptoms': 'ongoing fatigue, occasional blurred vision, mild numbness in feet', 'lab_results': 'elevated A1C of 8.7%', 'current_medication': 'Metformin', 'current_dosage': '500 mg', 'current_frequency': 'BID', 'prescribed_medication': 'insulin', 'prescribed_dosage': 'adjust dosage if glucose remains high', 'prescribed_frequency': 'occasionally'}")  # Convert JSON string to dict
        return pds
        #return pd.Series(json_string)  # Convert dict to Series
    except (json.JSONDecodeError, TypeError):
        return pd.Series(empties, index=fields) #handle errors

df_gpt[fields] = df_gpt['json_notes'].apply(extract_json)

df_gpt.to_csv("checkpoint_chatGPT2.csv", index=True)
display(df_gpt.head())

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,MR,Name,Age,Gender,note,json_notes
0,0,0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...,"{'disease': 'Type 2 Diabetes (T2D), hypertensi..."
1,1,1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...,"{'disease': 'Type 2 Diabetes', 'symptoms': 'Pe..."
2,2,2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...,{'disease': 'Hypertension and Type 2 Diabetes'...
3,3,3,3344,Sarah Johnson,50,female,"Complaint: Frequent urination, increased thirs...","{'disease': 'Type 2 Diabetes', 'symptoms': 'Fr..."
4,4,4,2211,Carlos Ramirez,55,male,Presenting with ongoing fatigue and left knee ...,"{'disease': 'Type 2 Diabetes, Hypertension', '..."


['disease', 'symptoms', 'lab_results', 'current_medication', 'current_dosage', 'current_frequency', 'prescribed_medication', 'prescribed_dosage', 'prescribed_frequency']
{'disease': 'Type 2 Diabetes (T2D), hypertension', 'symptoms': 'ongoing fatigue, occasional blurred vision, mild numbness in feet', 'lab_results': 'elevated A1C of 8.7%', 'current_medication': 'Metformin', 'current_dosage': '500 mg', 'current_frequency': 'BID', 'prescribed_medication': 'insulin', 'prescribed_dosage': 'adjust dosage if glucose remains high', 'prescribed_frequency': 'occasionally'}


ValueError: Columns must be same length as key

In [None]:
# OLD

# Request schema
# class TextRequest(BaseModel):
#     prompt: str
#     max_new_tokens: int = 200
#     temperature: float = 0.3
#     top_p: float = 0.9


def extract_medications_native(prompt):
        #prompt = request.prompt 
        messages = [
            {"role": "system",  "content": "you are a medical assistant. List all the medications in the note, only output JSON"},
            {"role": "user", "content": prompt}
        ]

        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 
 
        # Tokenize the prompt (wrapped in a list)
        model_inputs = tokenizer(text, return_tensors="pt").to(my_local_model.device)

        generated = my_local_model.generate(
            **model_inputs,
            max_new_tokens=512,
            num_return_sequences=1,
            temperature=0.3,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            num_beams=3,
            stopping_criteria=stopping_criteria
        )
        outputs = tokenizer.decode(generated[0], skip_special_tokens=True) 
        
        lines = outputs.split("\n")
        
        # find the line that reads "assistant"
        for i, line in enumerate(lines):
             if "assistant" == line:
                 break
        #return the next line
        
        #concatenate all lines after i+1
        a = ""
        for j in range(i+1, len(lines)):
            a = a + lines[j]

        # print ("relevant row i = ", i+1)
        print(a)
        return a



df['medications2'] = df['note'].apply(extract_medications_native)

display(df.head())

{  "medications": [    {      "name": "Metformin",      "dose": "500 mg",      "frequency": "BID"    },    {      "name": "Insulin",      "dose": "occasional",      "frequency": "as needed"    },    {      "name": "Lisinopril",      "dose": "20 mg",      "frequency": "daily"    }  ]}
{  "medications": [    "Metformin 500 mg BID",    "Atorvastatin 10 mg daily"  ]}
{  "medications": [    "Metformin 1000 mg daily",    "Lisinopril 20 mg daily",    "Multivitamin (brand unknown)"  ]}
{  "medications": [    "Glipizide 5 mg daily",    "Simvastatin 20 mg daily"  ]}
{  "medications": [    "Metformin 500 mg BID",    "Jardiance 10 mg daily",    "Losartan 50 mg daily"  ]}
{  "medications": [    "Metformin 500 mg daily"  ]}
{  "medications": [    "Metformin 500 mg BID",    "Insulin glargine (10 units nightly)",    "Amlodipine 5 mg daily"  ]}
{  "medications": [    {      "name": "Metformin",      "dose": "500 mg",      "frequency": "once daily"    }  ]}
{  "medications": [    "Metformin 1000 mg BID"

Unnamed: 0,MR,Name,Age,Gender,note,medications2
0,1234,John Smith,58,Male,Arrived for a follow-up appointment complainin...,"{ ""medications"": [ { ""name"": ""Metform..."
1,5678,Linda Green,45,Female,Chief Complaint: Persistent headaches and ting...,"{ ""medications"": [ ""Metformin 500 mg BID"",..."
2,9102,Michael Brown,62,Male,Follow-up for hypertension and Type 2 Diabetes...,"{ ""medications"": [ ""Metformin 1000 mg dail..."
3,3344,Sarah Johnson,50,Female,"Complaint: Frequent urination, increased thirs...","{ ""medications"": [ ""Glipizide 5 mg daily"",..."
4,2211,Carlos Ramirez,55,Male,Presenting with ongoing fatigue and left knee ...,"{ ""medications"": [ ""Metformin 500 mg BID"",..."


In [14]:
print(df['medications2'][15])

{  "medications": [    "Metformin 1000 mg BID",    "Rosuvastatin 10 mg daily"  ]}
