In [None]:
# Extractors
import regex as re

# Fuzzy matching allows for 1 typo

def extract_medical_record(text):
    """Extract the medical record number from a note.

    The pattern looks for a "Medical Record" / "MR" style label (a few common
    misspellings are listed explicitly and `{e<=1}` additionally tolerates one
    typo), followed by "#", ":" and then captures the run of non-whitespace
    characters that comes next. Matching is case-insensitive.

    Args:
        text: The note text. Non-string values (e.g. None or NaN coming from
            an empty spreadsheet cell) are treated as "no record number".

    Returns:
        The captured record number with surrounding whitespace stripped, or
        None when the label is not found or `text` is not a string.
    """
    # re.search raises TypeError on None/NaN, which is what an empty cell
    # looks like when this function is applied over a DataFrame column.
    if not isinstance(text, str):
        return None

    pattern = r"(?:Medical\s*Record|Medcal\s*Record|Medicl\s*Record|MR){e<=1}\s*#\s*:\s*(\S+)"
    match = re.search(pattern, text, re.IGNORECASE)

    # The pattern defines a single capturing group, so group(1) is the only
    # group there is to return.
    return match.group(1).strip() if match else None
   
def extract_name(text):
    """Extract the patient name from a note.

    Finds a "Patient" label (one typo tolerated via `{e<=1}`, case-insensitive)
    followed by ":" and captures everything up to, but not including, the next
    comma.

    Args:
        text: The note text. Non-string values (e.g. None or NaN coming from
            an empty spreadsheet cell) are treated as "no name".

    Returns:
        The captured name with surrounding whitespace stripped, or None when
        the label is not found or `text` is not a string.
    """
    # Guard against None/NaN cells, on which re.search raises TypeError.
    if not isinstance(text, str):
        return None

    pattern = r"(?:Patient){e<=1}:\s*([^,]+)"
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None
    return match.group(1).strip()

def extract_age(text):
    """Extract the patient's age from a note.

    Several patterns are tried in order and the first one that matches wins.
    All of them are case-insensitive and capture a run of digits.

    Args:
        text: The note text. Non-string values (e.g. None or NaN coming from
            an empty spreadsheet cell) are treated as "no age".

    Returns:
        The age as an int, or None when no pattern matches or `text` is not a
        string.
    """
    # Guard against None/NaN cells, on which re.search raises TypeError.
    if not isinstance(text, str):
        return None

    age_patterns = [
        r"(\d+)-(?:year|yeer|yer|yar){e<=1}-old",   # Allow 1 typo in "year"
        r"(?:age|ag|aeg|gae){e<=1}\s*(\d+)",        # Allow 1 typo in "age"
        r"(\d+)\s*(?:years?|yeers?|yrs?|yars?){e<=1}\s*old",  # Allow 1 typo in "years", "yrs"
        r"(\d+)\s*(?:years?|yeers?|yrs?|yars?){e<=1}"         # Allow 1 typo in "years", "yrs"
    ]

    for pattern in age_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        try:
            # First successful conversion is the answer.
            return int(match.group(1))
        except ValueError:
            # Captured text was not a valid number; fall through to the
            # next pattern.
            continue
    return None

def extract_gender(text):
    """Extract the patient's gender from a note.

    Searches for the standalone word "male" or "female" (case-insensitive,
    one typo tolerated via `{e<=1}`) and returns the matched text capitalized.

    NOTE(review): because the match is fuzzy, the returned value is whatever
    text actually matched, which may be a misspelling or a nearby word rather
    than exactly "Male"/"Female" -- confirm whether downstream code needs a
    normalized value.

    Args:
        text: The note text. Non-string values (e.g. None or NaN coming from
            an empty spreadsheet cell) are treated as having no gender.

    Returns:
        The capitalized matched word, or "Unknown" when nothing matches or
        `text` is not a string.
    """
    # Guard against None/NaN cells, on which re.search raises TypeError.
    if not isinstance(text, str):
        return "Unknown"

    pattern = r"\b(male|female){e<=1}\b"
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return "Unknown"
    return match.group(1).capitalize()


In [None]:
#Extract

import pandas as pd
import numpy as np
from IPython.display import display

# Load the raw notes; the sheet is expected to provide a "note" column
df = pd.read_excel("notes.xlsx")

# Run the regex extractors on the untouched note text, before it is trimmed below
df["MR"] = df["note"].apply(extract_medical_record)
df["Name"] = df["note"].apply(extract_name)
df["Gender"] = df["note"].apply(extract_gender)
df["Age"] = df["note"].apply(extract_age)

# Cleanup the note on the assumption \n are in the right place (this shortens the prompt):
# keep only the text after the second newline.
# `.str[2]` gives NaN (turned into "" here) for notes with fewer than two
# newlines; selecting column 2 of an expanded split would instead raise
# KeyError whenever no note in the sheet contains two newlines.
df["note"] = df["note"].str.split("\n", n=2).str[2].fillna("")

# Flatten any remaining newlines into spaces
df["note"] = df["note"].str.replace("\n", " ", regex=False)

# Sort the columns
df = df[["MR", "Name", "Age", "Gender", "note"]]

display(df.head(3))

df.to_csv("checkpoint1.csv", index=False)

Unnamed: 0,MR,Name,Age,Gender,note
0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...
1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...
2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...


In [3]:
# Schema of the JSON object the LLM is asked to produce:
# one [field_name, field_description] pair per output key.

json_schema = [
    ["disease", "The disease the patient is suffering from"],
    ["symptoms", "The symptoms the patient is experiencing"],
    ["lab_results", "The results of the lab tests"],
    ["current_medication", "The medication the patient is currently taking"],
    ["current_dosage", "The dosage of the current medication, just the dosage, not the name"],
    ["current_frequency", "The frequency of  current medication, just the frequency, not the name"],
    ["prescribed_medication", "The medication prescribed"],
    ["prescribed_dosage", "The dosage of the prescribed medication"],
    ["prescribed_frequency", "The frequency of the prescribed medication"],
]

# Parallel lists: field names and their descriptions, in schema order.
fields = [entry[0] for entry in json_schema]
decritpion = [entry[1] for entry in json_schema]



In [None]:
# Check if you can run the model locally
import torch
import os
import dotenv

# Load variables from a local .env file into the process environment
dotenv.load_dotenv()

# Notebook-level switch: change to True to attempt running the LLM on a local GPU.
# This runs after load_dotenv(), so the value set here is the one the check below reads.
%env TRY_LOCAL=False

# Decide which backend the following cells use and publish the choice through
# the MODEL environment variable ("Local" or "ChatGPT").
if os.getenv("TRY_LOCAL") != "True":
    print("If you want to try to run your LLM locally, set TRY_LOCAL to True.")
    print("Let's use ChatGPT")
    %env MODEL=ChatGPT
elif torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    gpu_id = num_gpus - 1 # Arbitrarily run the model on the last GPU, presumably video is on card 0
    # Publish the chosen GPU index through the GPU_ID environment variable
    %env GPU_ID=$gpu_id
    total_memory = torch.cuda.get_device_properties(gpu_id).total_memory 
    print(f"Total Memory: {total_memory /(1024**3):.1f} GB")
    # total_memory is divided by 1024**3 before being compared with the
    # 20 threshold quoted in the messages below
    if total_memory/(1024**3) > 20:
        print("We can run the model locally on gpu ", gpu_id)
        print(f"You need 20Gb of VRAM, you have {total_memory / (1024**3):.1f} Gb")
        %env MODEL=Local
    else:
        print("Not enough memory to run the model")
        print(f"You need 20Gb of VRAM, you have {total_memory / (1024**3):.1f} Gb")
        %env MODEL=ChatGPT
else:
    # No CUDA device is available: fall back to the hosted model
    print("No GPU available")
    print("Let's use ChatGPT")
    %env MODEL=ChatGPT


env: TRY_LOCAL=True
env: GPU_ID=1
Total Memory: 23.7 GB
We can run the model locally on gpu  1
You need 20Gb of VRAM, you have 23.7 Gb
env: MODEL=Local


In [5]:
# Load LLM on local GPU

from huggingface_hub import HfApi
import gptqmodel

from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig,StoppingCriteria, StoppingCriteriaList, pipeline
import torch
import gc

import os
import dotenv

dotenv.load_dotenv()

# Only load the local model when MODEL was set to "Local"
if os.environ.get("MODEL", "") == 'Local':

    torch.cuda.empty_cache()  # Clear cached memory
    gc.collect()
    print("GPU memory cleared.")

    # model_id = "Qwen/Qwen2.5-14B-Instruct"
    model_id = "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4"

    # Use the GPU index published in GPU_ID. Fall back to GPU 0 when it is
    # unset or empty, so the device string is never the invalid "cuda:".
    device_s = "cuda:" + (os.environ.get("GPU_ID") or "0")
    device = device_s if torch.cuda.is_available() else "cpu"

    print("Loading " + model_id + " on " + device)

    # Prefer the dedicated disk for downloaded weights when it is mounted
    directory = "/mnt/hd1"
    if os.path.isdir(directory):
        print("Using custom cache directory " + directory)
        custom_cache_dir = directory
    else:
        custom_cache_dir = "/tmp"

    # Hugging Face API key: check it's there and that it is valid
    HF_API_KEY = os.getenv("HF_API_KEY")
    token = HF_API_KEY

    if token is None:
        print("No Huggingface API key found")
    else:
        try:
            api = HfApi(token=token)
            user_info = api.whoami()
            print(f"Huggingface key is valid. Logged as: {user_info['name']} ")
        except Exception as e:
            print(f"Invalid API key: {e}")
            # Do not hand a token that failed validation to the download calls
            token = None

    # Loading happens whether or not a key was found: the later cells need
    # `tokenizer` and `my_local_model` to exist. A gated or private repo
    # would still require a valid key.

    # Load the tokenizer (same cache directory as the model weights)
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        token=token,
        cache_dir=custom_cache_dir,
    )

    # Load the model
    my_local_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        cache_dir=custom_cache_dir,
        device_map=device,
        torch_dtype=torch.bfloat16,
    )
else:
    print("This section is disabled, as we are using ChatGPT")


[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          
GPU memory cleared.
Loading Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4 on cuda:1
Using custom cache directory /mnt/hd1
Huggingface key is valid. Logged as: lookino 


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


[32mINFO[0m   Kernel: Auto-selection: adding candidate `MarlinQuantLinear`            


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_huggingface import HuggingFacePipeline
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.output_parsers import PydanticOutputParser

import os
import dotenv
import json
import pandas as pd

# Load variables from a local .env file into the process environment
dotenv.load_dotenv()

# Read back the checkpoint file "checkpoint1.csv" as the input for the local-model run
df_local = pd.read_csv("checkpoint1.csv")

class StopAtEndOfJson(StoppingCriteria):
    """Stopping criterion that fires once the fenced JSON answer is closed.

    Relies on the module-level `tokenizer` to decode the tokens seen so far.
    """

    def __call__(self, input_ids, scores, **kwargs):
        # Stop when the decoded text ends with a closing quote, the final "}"
        # and the closing three-backtick fence of the JSON block (the fence
        # that closes a ```json block is three backticks, not four).
        return tokenizer.decode(input_ids[0]).endswith("\"\n}\n```")

# Stopping criteria list built from StopAtEndOfJson.
# NOTE(review): it is not passed to the pipeline() call below -- confirm
# whether it was meant to be.
stopping_criteria = StoppingCriteriaList([StopAtEndOfJson()])

# Generation settings for the local text-generation pipeline
generation_settings = dict(
    max_length=1024,
    temperature=0.3,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id,
)

pipe = pipeline(
    "text-generation",
    model=my_local_model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Initialize LangChain LLM with HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)


# Load schema for the LLM output: one ResponseSchema per (name, description) pair
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in json_schema
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Prompt Template
prompt_template = """
Extract the following information from the text and return it as a JSON object:

Text: {text}

{format_instructions}

JSON:
"""

# The format instructions are fixed by the parser, so they are bound once here;
# only {text} is supplied per note.
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
)

def preprocess_llm_output(llm_output: str) -> str:
    """Extract the JSON object from the raw LLM output.

    Takes the text that follows the first "JSON:" line and its opening
    ```json fence, cuts it at the first "}" that is directly followed by a
    closing fence, and puts that closing brace back.

    Args:
        llm_output: Full text returned by the LLM.

    Returns:
        The JSON text with surrounding whitespace removed.

    Raises:
        ValueError: If the "JSON:" + ```json opening marker is not present.
    """
    start_marker = "JSON:\n```json"
    end_marker = "}\n```"

    _, found, after_start = llm_output.partition(start_marker)
    if not found:
        # A plain [1] lookup on the split result would fail with an
        # uninformative "list index out of range".
        raise ValueError("LLM output does not contain a 'JSON:' section with a ```json block")

    # The end marker consumes the closing brace, so add it back.
    json_text = after_start.split(end_marker, 1)[0] + "}"

    # str.strip() returns a new string; its result must be the one returned.
    return json_text.strip()


# Create Runnable Sequence: each stage's output is piped into the next
# (prompt -> llm -> preprocess_llm_output -> output_parser)
note_chain = prompt | llm | preprocess_llm_output | output_parser


def extract_notes(note):
    """Run one note through `note_chain` and return the result as JSON text.

    Args:
        note: The note text, passed to the chain as the "text" input.

    Returns:
        The chain output serialized with a 4-space indent, or None if the
        chain or the serialization raised (the error is printed).
    """
    try:
        return json.dumps(note_chain.invoke({"text": note}), indent=4)
    except Exception as e:
        # Best effort: report the failure and leave this row empty.
        print(f"Error extracting information: {e}")
        return None

# Apply the extraction function to the DataFrame
df_local['json_note'] = df_local['note'].apply(extract_notes)

# Write without the index, like the other checkpoints, so that reading the
# file back does not add an extra "Unnamed: 0" column
df_local.to_csv("checkpoint_Qwen.csv", index=False)
display(df_local.head())



Device set to use cuda:1
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0.1,Unnamed: 0,MR,Name,Age,Gender,note,json_note
0,0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...,"{\n ""disease"": ""Type 2 Diabetes"",\n ""sym..."
1,1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...,"{\n ""disease"": ""Type 2 Diabetes"",\n ""sym..."
2,2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...,"{\n ""disease"": ""hypertension and Type 2 Dia..."
3,3,3344,Sarah Johnson,50,female,"Complaint: Frequent urination, increased thirs...","{\n ""disease"": ""Type 2 Diabetes"",\n ""sym..."
4,4,2211,Carlos Ramirez,55,male,Presenting with ongoing fatigue and left knee ...,"{\n ""disease"": ""Type 2 Diabetes, Hypertensi..."


In [None]:
#Chat GPT

from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_openai import ChatOpenAI

import os
import dotenv
import json

# Input for the ChatGPT run: the checkpoint file "checkpoint1.csv"
df_gpt = pd.read_csv("checkpoint1.csv")

# Load variables from a local .env file, then read the OpenAI settings from the environment
dotenv.load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORG_ID = os.getenv("OPENAI_ORG_ID")

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.3)

# Load schema for the LLM output: one ResponseSchema per (name, description) pair
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in json_schema
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Prompt Template
prompt_template = """
Extract the following information from the text and return it as a JSON object:

Text: {text}

{format_instructions}

JSON:
"""

# The format instructions are fixed by the parser, so they are bound once here;
# only {text} is supplied per note.
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
)

# Create Runnable Sequence: prompt -> llm -> output_parser
note_chain = prompt | llm | output_parser

def extract_notes(note):
    """Run one note through `note_chain` and return the result as JSON text.

    Args:
        note: The note text, passed to the chain as the "text" input.

    Returns:
        The chain output serialized as compact JSON, or None if the chain or
        the serialization raised (the error is printed).
    """
    try:
        structured = note_chain.invoke({"text": note})
        return json.dumps(structured)
    except Exception as e:
        # Best effort: report the failure and leave this row empty.
        print(f"Error extracting information: {e}")
        return None

# Apply the extraction function to the DataFrame.
# Read the notes from df_gpt itself (loaded from checkpoint1.csv in this cell)
# rather than from `df`, which only exists if the earlier extraction cell ran
# in the same kernel session.
df_gpt['json_note'] = df_gpt['note'].apply(extract_notes)

df_gpt.to_csv("checkpoint_chatGPT.csv", index=False)
display(df_gpt.head())

Unnamed: 0.1,Unnamed: 0,MR,Name,Age,Gender,note,json_note
0,0,1234,John Smith,58,male,Arrived for a follow-up appointment complainin...,"{""disease"": ""Type 2 Diabetes (T2D), hypertensi..."
1,1,5678,Linda Green,45,female,Chief Complaint: Persistent headaches and ting...,"{""disease"": ""Type 2 Diabetes"", ""symptoms"": ""Pe..."
2,2,9102,Michael Brown,62,male,Follow-up for hypertension and Type 2 Diabetes...,"{""disease"": ""Hypertension and Type 2 Diabetes""..."
3,3,3344,Sarah Johnson,50,female,"Complaint: Frequent urination, increased thirs...","{""disease"": ""Type 2 Diabetes"", ""symptoms"": ""Fr..."
4,4,2211,Carlos Ramirez,55,male,Presenting with ongoing fatigue and left knee ...,"{""disease"": ""Type 2 Diabetes"", ""symptoms"": ""on..."


In [7]:
# Extract the JSON fields from the JSON notes

def extract_json(filename):
    """Expand the `json_note` column of a checkpoint CSV into one column per field.

    Reads `filename`, parses each `json_note` cell as JSON, writes the value
    of every name in the module-level `fields` list into a column of that
    name, drops `json_note`, and saves the result next to the input with
    ".csv" replaced by "_2.csv".

    Args:
        filename: Path of a CSV file that contains a `json_note` column.

    Returns:
        None. The result is written to disk.
    """
    notes_df = pd.read_csv(filename)

    # Remove field columns left over from a previous run. errors="ignore"
    # drops whichever of them exist instead of failing (and dropping none)
    # as soon as one is missing.
    notes_df = notes_df.drop(columns=fields, errors="ignore")

    def _parse_note(json_string):
        """Return the parsed JSON object, or {} for missing/invalid/non-object cells."""
        try:
            parsed = json.loads(json_string)
        except (json.JSONDecodeError, TypeError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    records = [_parse_note(cell) for cell in notes_df["json_note"]]

    # Building the frame with columns=fields selects values by key name.
    # Keys missing from a note become NaN, keys not in `fields` are ignored,
    # and the key order inside each JSON object does not matter.
    parsed_df = pd.DataFrame(records, index=notes_df.index, columns=fields)
    notes_df[fields] = parsed_df

    notes_df = notes_df.drop(columns=["json_note"])

    new_filename = filename.replace(".csv", "_2.csv")

    notes_df.to_csv(new_filename, index=False)


# Run it on both checkpoints. A checkpoint file is only written by its own
# model cell, so skip (and report) any that is missing instead of failing
# with FileNotFoundError.
import os

for checkpoint_file in ("checkpoint_chatGPT.csv", "checkpoint_Qwen.csv"):
    if os.path.exists(checkpoint_file):
        extract_json(checkpoint_file)
    else:
        print(f"Skipping {checkpoint_file}: file not found")