# Versa-spine 
A fully automated pipeline for information extraction from lumbar spine MRI radiology reports using GPT-4

In [None]:
# Import of packages
import openai
import os
import dotenv
import pandas as pd
import re
from dotenv import load_dotenv, find_dotenv
from openai import AzureOpenAI
# Load the variables defined in the local .env file into the process
# environment, then read the Azure OpenAI settings from it. Each constant is
# None when the corresponding variable is not set.
load_dotenv('.env')

API_KEY = os.getenv('API_KEY')
API_VERSION = os.getenv('API_VERSION')
RESOURCE_ENDPOINT = os.getenv('RESOURCE_ENDPOINT')


In [None]:
# Authentication switch. Only API-key authentication is implemented in this
# notebook; there is no Azure Active Directory code path.
use_azure_active_directory = False

# API version used when the API_VERSION environment variable is not set.
DEFAULT_API_VERSION = "2023-09-01-preview"

if not use_azure_active_directory:
    # Index os.environ directly so a missing setting fails here with a
    # KeyError naming the variable, rather than later inside the client.
    endpoint = os.environ["RESOURCE_ENDPOINT"]
    api_key = os.environ["API_KEY"]

    client = openai.AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=api_key,
        # Honour API_VERSION from the environment instead of silently
        # ignoring it; fall back to the previously hard-coded version.
        api_version=os.environ.get("API_VERSION") or DEFAULT_API_VERSION,
    )
else:
    # Without this branch `client` would stay undefined and the query cells
    # further down would fail with an unrelated NameError.
    raise NotImplementedError(
        "Azure Active Directory authentication is not implemented; "
        "set use_azure_active_directory = False to use API-key authentication."
    )

## Data selection and preprocessing

In [None]:
# --- Load note text for information extraction ---
# The CSV must provide a 'note_text' column holding the full report text;
# the preprocessing below reads that column.
file_name = ""  # TODO: path to csv file with note text
if not file_name:
    # A placeholder that is valid Python keeps the notebook importable and
    # produces a readable error instead of a SyntaxError.
    raise ValueError("Set `file_name` to the path of the CSV file with the note text.")

df = pd.read_csv(file_name)
if "note_text" not in df.columns:
    raise KeyError(f"'{file_name}' has no 'note_text' column; found: {list(df.columns)}")

def extract_impressions(report_text):
    """Return the part of a report from 'FINDINGS' up to the sign-off line.

    The match is case-insensitive, spans multiple lines, starts at the first
    occurrence of "FINDINGS" and stops right before the first following
    "Report dictated by".

    Args:
        report_text: Full text of one radiology report. Non-string values
            (e.g. NaN produced by pandas for empty CSV cells) are accepted.

    Returns:
        The extracted section as a string, or the sentinel
        "Impressions section not found." when the input is not a string or
        the section markers are missing.
    """
    not_found = "Impressions section not found."

    # re.search raises TypeError on NaN/None, which would abort the whole
    # DataFrame.apply for a single empty note.
    if not isinstance(report_text, str):
        return not_found

    pattern = r"(FINDINGS.*?)(?=Report dictated by)"
    match = re.search(pattern, report_text, flags=re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(0)
    return not_found

# Build the 'impressions' column: take the findings section of every note and
# delete the standalone word "significant" (any casing) from it.
# NOTE(review): the later prompting cells scan this column for guidance
# patterns, so after this removal the "significant" guidance entry can only
# trigger on derived words such as "significantly" or "insignificant" - confirm
# this is intended.
df['impressions'] = (
    df['note_text']
    .apply(extract_impressions)
    .str.replace(r'\bsignificant\b', '', case=False, regex=True)
)

df.head()

In [None]:

# Central configuration for the prompting pipeline.
#   Pathologies*           - pathologies grouped by how they are localized
#                            ("level", "level_side" or "patient"); "name" is
#                            used in prompts and for matching model answers,
#                            "abbr" is used to build output column names.
#   OutputFormats          - output instruction appended to the prompt for
#                            each localization type.
#   InterpretationGuidance - extra instructions that are appended to a prompt
#                            when one of the listed patterns occurs in the
#                            report text.
# NOTE(review): the "syn" synonym lists are not referenced by any of the
# prompt-building cells visible in this notebook - confirm whether they are
# meant to be included in the prompts.
data_store = {
    "PathologiesLevel": {
        1: {"name": "Endplate Changes",
            "abbr": "endplate",
            "loc": "level",
            "syn": "Modic changes, Modic endplate changes, fibrovascular degenerative changes, fibrovascular changes, fibrofatty degenerative changes, fibrofatty changes, endplate sclerosis, endplate degeneration, endplate irregularity, endplate irregularities, endplate defect, endplate defects, Schmorl's node, schmorls node"
            },
        2: {"name": "Disc Pathology",
            "abbr": "disc",
            "loc": "level",
            "syn": "disc bulge, disc bulging, disc protrusion, disc extrusion, annular fissure, disc tear, annular tear, disc herniation"
            },
        3: {"name": "Spinal Canal Stenosis",
            "abbr": "scs",
            "loc": "level",
            "syn": "spinal canal stenosis, spinal canal narrowing, central canal stenosis, central canal narrowing, canal stenosis, canal narrowing"
            },
        4: {"name": "Facet Joint Arthropathy",
            "abbr": "fj",
            "loc": "level",
            "syn": "facet joint degeneration, facet joint arthropathy, facet degeneration, facet arthropathy, facet hypertrophy"
            },
    },
    "PathologiesLevelSide": {
        1: {"name": "Lateral Recess Stenosis",
            "abbr": "lrs",
            "loc": "level_side",
            "syn": "lateral recess stenosis, subarticular recess stenosis, recess stenosis, lateral recess narrowing, subarticular recess narrowing, recess narrowing, narrowing of lateral recess, narrowing of subarticular recess, stenosis of lateral recess, stenosis of subarticular recess, effacement of lateral recess, effacement of subarticular recess"
            },
        2: {"name": "Foraminal Stenosis",
            "abbr": "fs",
            "loc": "level_side",
            "syn": "neural foraminal stenosis, neural foraminal narrowing, neural foraminal effacement, neural foraminal nerve root affection, foraminal stenosis, foraminal narrowing, foraminal effacement, foraminal nerve root affection, neuroforaminal stenosis, neuroforaminal narrowing, neuroforaminal effacement, neuroforaminal nerve root affection"
            },
    },
    "PathologiesPatient": {
        1: {"name": "Sacroiliac Joint",
            "abbr": "sij",
            "loc": "patient",
            "syn": "sacroiliac joint degeneration, degeneration of sacroiliac joints, sacro-iliac joint degeneration, degeneration of sacro-iliac joints, SIJ degeneration, degeneration of SIJ, degenerative changes of the sacroiliac joints, degenerative changes of the sacro-iliac joints"
            },
        2: {"name": "Olisthesis",
            "abbr": "olisth",
            "loc": "patient",
            "syn": "anterolisthesis, retrolisthesis, spondylolysis, pseudo-anterolisthesis, pseudo-retrolisthesis, vertebral displacement"
            },
        3: {"name": "Curvature",
            "abbr": "curv",
            "loc": "patient",
            "syn": "scoliosis, levoconvex curvature, dextroconvex curvature, leftward convex curvature, rightward convex curvature, levocurvature, dextrocurvature, levoscoliosis, dextroscoliosis, S-shaped curvature"
            },
        4: {"name": "Fracture",
            "abbr": "frac",
            "loc": "patient",
            "syn": "fracture, osteoporotic fracture, osteoporotic deformation, wedge deformity"
            },
    },
    "OutputFormats": {
        "level": {
            "loc": "level",
            # Every example entry carries a vertebral level, because the
            # answer parser only accepts "<name> <level>: <value>" lines.
            "output": "As a result, give me a list with exactly 20 entries, grouped by pathology. It must contain five entries for each pathology, one for each of the five vertebral levels (L1-2 to L5-S1). For endplate changes, disc pathology and facet joint arthropathy, give only entries of 0 (for pathology absent) or 1 (for pathology present). For spinal canal stenosis the entry must be 0 if there is no spinal canal stenosis, 1 if it is described as mild, 2 if it is described as moderate or no further qualification of stenosis extent is given, and 3 if it is described as severe. Entries in the list must always adhere to this format. Here are three example entries: Endplate Changes L1-L2: 0, Disc Pathology L5-S1: 1, Spinal Canal Stenosis L3-L4: 3. Ignore levels named ALPHANUMERICID. End the list with 'END OF LIST'."
        },
        "level_side": {
            "loc": "level_side",
            "output": "As a result, give me a list with exactly 20 entries, grouped by pathology; each entry must be on a new line, do not use commas to separate entries. It must contain ten entries for each pathology, two for each of the five vertebral levels (L1-2 to L5-S1), one for the right and one for the left side at each level. The entry must be 0 if there is no mention of a pathology at this level, 1 if the pathology is described as mild, 2 if it is described as moderate or there is no further qualification of the extent of the pathology, and 3 if it is described as severe. Entries in the list must always adhere to this format. Here are two example entries: Foraminal Stenosis L1-L2 right: 2, Lateral Recess Stenosis L5-S1 left: 0. Ignore levels named ALPHANUMERICID. End the list with 'END OF LIST'."
        },
        "patient": {
            "loc": "patient",
            "output": "As a result, give me a list with exactly 4 entries. It must contain one entry for each pathology with corresponding entries of either 1 or 0. Entries in the list must always adhere to this format. Here are two example entries: Sacroiliac joint: 0, Fracture: 1. Ignore levels named ALPHANUMERICID. End the list with 'END OF LIST'."
        },
    },
    "InterpretationGuidance": {
        "bilateral_changes": {
            "patterns": ["left greater than right", "right greater than left", "bilateral"],
            "guidance": "Consider phrases like 'bilateral','left greater than right' or 'right greater than left' as presence of changes on both sides. Please apply this rule strictly in your interpretation."
        },
        "segment_localization": {
            "patterns": ["superior endplate", "inferior endplate"],
            "guidance": "If a change is described as localized at the superior endplate, attribute it to the level above this vertebral body (e.g. superior endplate L2 belongs to the level L1-L2); conversely the inferior endplate belongs to the segment below its vertebral body (e.g. inferior endplate L3 belongs to the Level L3-L4)."
        },
        "multilevel": {
            "patterns": ["multilevel"],
            "guidance": "If a pathology is described as 'multilevel' assume it is present in all vertebral levels."
        },
        "desiccation": {
            "patterns": ["desiccation"],
            "guidance": "Do not consider desiccation or darkening of discs a pathology."
        },
        "heightloss": {
            "patterns": ["height"],
            "guidance": "Do not consider height loss of a disc a pathology."
        },
        "straight": {
            "patterns": ["straightening"],
            "guidance": "Do not consider straightening or loss of lumbar lordosis a pathology."
        },
        "significant": {
            "patterns": ["significant"],
            "guidance": "Consider pathologies described as 'not significant' as not present."
        },
    },
}



## Prompting functions

Initialize output columns

In [None]:
def normalize_and_convert_to_column_name(level, abbr, side=None):
    """Build the output column name for a pathology at a vertebral level.

    The level is lower-cased, cut at the first '-', and the text between the
    first and second 'l' of that prefix is used as the level identifier
    (e.g. "L5-S1" -> "5", "l3" -> "3").

    Args:
        level: Level label such as "L1-L2", "L5-S1" or "l3".
        abbr: Pathology abbreviation used in the column name.
        side: Optional side suffix; a falsy value produces no suffix.

    Returns:
        "gpt_<abbr>_lvl_l<level>" or "gpt_<abbr>_lvl_l<level>_<side>".
    """
    upper_segment = level.lower().split('-', 1)[0]
    level_id = upper_segment.split('l')[1]
    base_name = f"gpt_{abbr}_lvl_l{level_id}"
    return f"{base_name}_{side}" if side else base_name

def initialize_dataframe_columns(data_store, df):
    """Add one empty (pd.NA) result column per expected model answer.

    Columns are created in place on ``df`` for every pathology configured
    under "PathologiesLevel" (one per level l1-l5), "PathologiesLevelSide"
    (one per level and side) and "PathologiesPatient" (one per pathology).
    Columns that already exist are left untouched. Returns None.
    """
    levels = ("l1", "l2", "l3", "l4", "l5")
    sides = ("left", "right")

    def _add_if_missing(column_name):
        # Never overwrite a column that is already present.
        if column_name not in df.columns:
            df[column_name] = pd.NA

    # One column per pathology and vertebral level.
    if "PathologiesLevel" in data_store:
        for info in data_store["PathologiesLevel"].values():
            for lvl in levels:
                _add_if_missing(normalize_and_convert_to_column_name(lvl, info["abbr"]))

    # One column per pathology, vertebral level and side.
    if "PathologiesLevelSide" in data_store:
        for info in data_store["PathologiesLevelSide"].values():
            for lvl in levels:
                for side_name in sides:
                    _add_if_missing(
                        normalize_and_convert_to_column_name(lvl, info["abbr"], side_name)
                    )

    # One column per patient-level pathology.
    if "PathologiesPatient" in data_store:
        for info in data_store["PathologiesPatient"].values():
            _add_if_missing(f"gpt_{info['abbr']}_patient")


initialize_dataframe_columns(data_store, df)

Dynamic prompting with interpretation guidance depending on specific wordings

In [None]:
def append_guidance_if_needed(report_text, base_prompt, data_store=None):
    """Append interpretation guidance triggered by wording in the report.

    For every entry in ``data_store["InterpretationGuidance"]`` the guidance
    text is appended to the prompt (as "\\n\\n[Guidance: ...]") when at least
    one of its patterns occurs in the report, compared case-insensitively.
    Identical guidance texts are appended at most once.

    Args:
        report_text: Report text that is scanned for the patterns.
        base_prompt: Prompt the guidance blocks are appended to.
        data_store: Configuration dict with an "InterpretationGuidance"
            section. Optional: when omitted, the notebook-level ``data_store``
            is used, so calls that pass only two arguments keep working.

    Returns:
        The prompt with all applicable guidance blocks appended.

    Raises:
        NameError: if ``data_store`` is omitted and no notebook-level
            ``data_store`` exists.
    """
    if data_store is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        try:
            data_store = globals()["data_store"]
        except KeyError:
            raise NameError(
                "data_store was not passed and no notebook-level `data_store` is defined"
            ) from None

    # Lower-case the report once instead of once per pattern.
    report_lower = report_text.lower()
    appended_guidances = set()  # guidance texts already added to the prompt

    for guidance in data_store["InterpretationGuidance"].values():
        guidance_text = guidance["guidance"]
        if guidance_text in appended_guidances:
            continue
        if any(pattern.lower() in report_lower for pattern in guidance["patterns"]):
            base_prompt += f"\n\n[Guidance: {guidance_text}]"
            appended_guidances.add(guidance_text)

    return base_prompt


In [None]:
def parse_level_output(line, pathology_abbr):
    """Parse one "<pathology> <level>: <value>" line of a level-wise answer.

    Accepted level spellings include "L1-L2", "L1-2", "L5-S1" and a bare
    "L3". The column name is built from the first (upper) level number only,
    so "L5-S1" maps to "..._lvl_l5".

    Args:
        line: One line of model output; surrounding whitespace and trailing
            commas are ignored.
        pathology_abbr: Abbreviation used in the column name.

    Returns:
        (column_name, value) with value as int, or (None, None) when the line
        does not have the expected shape.
    """
    # Strip whitespace and potential trailing commas before matching.
    line = line.strip().rstrip(", ")
    match = re.match(
        r"(.+?)\s+L(\d+)(?:\s*-\s*[LS]?\d+)?\s*:\s*(\d+)",
        line,
        re.IGNORECASE,
    )
    if match:
        _, upper_level, presence = match.groups()
        column_name = f"gpt_{pathology_abbr}_lvl_l{upper_level}"
        return column_name, int(presence)
    return None, None

def parse_level_side_output(line, pathology_abbr):
    """Parse one "<pathology> <level> <side>: <value>" line of an answer.

    Accepted level spellings include "L1-L2", "L1-2", "L5-S1" and a bare
    "L3"; the side must be "left" or "right" (any casing). The column name
    uses the first (upper) level number and the lower-cased side.

    Args:
        line: One line of model output; surrounding whitespace and trailing
            commas are ignored.
        pathology_abbr: Abbreviation used in the column name.

    Returns:
        (column_name, value) with value as int, or (None, None) when the line
        does not have the expected shape - including lines without a side,
        for which no column name can be built.
    """
    line = line.strip().rstrip(", ")
    match = re.match(
        r"(.+?)\s+L(\d+)(?:\s*-\s*[LS]?\d+)?\s*(left|right)\s*:\s*(\d+)",
        line,
        re.IGNORECASE,
    )
    if match:
        _, upper_level, side, presence = match.groups()
        column_name = f"gpt_{pathology_abbr}_lvl_l{upper_level}_{side.lower()}"
        return column_name, int(presence)
    return None, None

def parse_and_update_patient_output(analysis_result, data_store, df, index):
    """Write the patient-level answers of one model response into ``df``.

    The response is split into entries on commas and line breaks. Each entry
    of the form "<pathology>: <value>" whose pathology matches a configured
    name in ``data_store["PathologiesPatient"]`` (case-insensitive, either
    "<name>" or "<name> (<abbr>)" as listed in the prompt) is written to the
    column "gpt_<abbr>_patient" of row ``index``. Other entries are ignored.

    Args:
        analysis_result: Raw model answer; None or "" results in no update.
        data_store: Configuration dict with a "PathologiesPatient" section.
        df: DataFrame that is updated in place.
        index: Row label of the report the answer belongs to.

    Returns:
        None.
    """
    # Map every accepted spelling of a pathology to its output column; on
    # duplicate names the first configured pathology wins.
    column_by_name = {}
    for pathology_info in data_store["PathologiesPatient"].values():
        name = pathology_info["name"].lower()
        abbr = pathology_info["abbr"]
        column_name = f"gpt_{abbr}_patient"
        column_by_name.setdefault(name, column_name)
        column_by_name.setdefault(f"{name} ({abbr.lower()})", column_name)

    # The model may separate entries with commas (as in the prompt example)
    # or put each entry on its own line; accept both.
    for report in re.split(r"[,\n]+", analysis_result or ""):
        match = re.match(r"\s*(.+?)\s*:\s*(\d+)", report)
        if not match:
            continue
        pathology_name, presence_str = match.groups()
        column_name = column_by_name.get(pathology_name.lower())
        if column_name is not None:
            df.at[index, column_name] = int(presence_str)



## Prompting pipeline - main instruction

Query VERSA for level-wise pathologies

In [None]:
# Query the model once per report for the level-wise pathologies and write the
# parsed answers into the matching "gpt_<abbr>_lvl_l<level>" columns.

# The pathology list, the output instruction and the name -> abbreviation
# lookup do not depend on the report, so they are built once.
level_pathologies = list(data_store["PathologiesLevel"].values())
level_prompt_info = ", ".join(f"{info['name']} ({info['abbr']})" for info in level_pathologies)
level_output_instruction = data_store["OutputFormats"]["level"]["output"]
level_abbr_by_name = {}
for info in level_pathologies:
    # On duplicate names the first configured pathology wins.
    level_abbr_by_name.setdefault(info["name"].lower(), info["abbr"])

# "<pathology name> <level>: <value>", e.g. "Disc Pathology L5-S1: 1"
level_entry_pattern = re.compile(r"(.+?) (L\d+-[LS]?\d+)\s*:\s*(\d+)")

for index, row in df.iterrows():
    report_text = row['impressions']
    specific_prompt = f"Given the following pathologies: {level_prompt_info}, {level_output_instruction}\n\nReport: {report_text}\n\n"
    # data_store is passed explicitly: the helper's signature requires it.
    enhanced_prompt = append_guidance_if_needed(report_text, specific_prompt, data_store)

    # NOTE(review): the system message is sent after the user message; chat
    # APIs conventionally expect it first. The order is kept unchanged so that
    # model outputs stay comparable with earlier runs - confirm.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": enhanced_prompt},
            {"role": "system", "content": "You are a medical expert, confident in the interpretation of radiology reports; you do not assume the presence of a pathology if it is not mentioned."}
        ],
        temperature=0.0,
        max_tokens=500,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=["END OF LIST"]
    )
    # `content` is optional in the response; treat a missing answer as empty
    # so the row simply keeps its pd.NA values.
    analysis_result = (response.choices[0].message.content or "").strip()

    for line in analysis_result.split('\n'):
        match = level_entry_pattern.match(line.strip())
        if not match:
            # Lines that are not well-formed entries are skipped rather than
            # aborting the whole run with a ValueError.
            continue
        pathology_name, level, presence_str = match.groups()
        abbr = level_abbr_by_name.get(pathology_name.lower())
        if abbr is None:
            continue
        column_name = normalize_and_convert_to_column_name(level, abbr)
        df.at[index, column_name] = int(presence_str)

Query VERSA for level- and side-wise pathologies

In [None]:
# Query the model once per report for the level- and side-wise pathologies and
# write the parsed answers into the "gpt_<abbr>_lvl_l<level>_<side>" columns.

# The pathology list, the output instruction and the name -> abbreviation
# lookup do not depend on the report, so they are built once.
level_side_pathologies = list(data_store["PathologiesLevelSide"].values())
level_side_prompt_info = ", ".join(f"{info['name']} ({info['abbr']})" for info in level_side_pathologies)
level_side_output_instruction = data_store["OutputFormats"]["level_side"]["output"]
level_side_abbr_by_name = {}
for info in level_side_pathologies:
    # On duplicate names the first configured pathology wins.
    level_side_abbr_by_name.setdefault(info["name"].lower(), info["abbr"])

# "<pathology name> <level> <side>: <value>", e.g. "Foraminal Stenosis L1-L2 right: 2"
level_side_entry_pattern = re.compile(
    r"(.+?) (L\d+-[LS]?\d+) (left|right)\s*:\s*(\d+)", re.IGNORECASE
)

for index, row in df.iterrows():
    report_text = row['impressions']
    specific_prompt = f"Given the following pathologies: {level_side_prompt_info}, {level_side_output_instruction}\n\nReport: {report_text}\n\n"
    # data_store is passed explicitly: the helper's signature requires it.
    enhanced_prompt = append_guidance_if_needed(report_text, specific_prompt, data_store)

    # NOTE(review): the system message is sent after the user message; chat
    # APIs conventionally expect it first. The order is kept unchanged so that
    # model outputs stay comparable with earlier runs - confirm.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": enhanced_prompt},
            {"role": "system", "content": "You are a medical expert, confident in the interpretation of radiology reports."}
        ],
        temperature=0.0,
        max_tokens=500,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=["END OF LIST"]
    )
    # `content` is optional in the response; treat a missing answer as empty
    # so the row simply keeps its pd.NA values.
    analysis_result = (response.choices[0].message.content or "").strip()

    for line in analysis_result.split('\n'):
        match = level_side_entry_pattern.match(line.strip())
        if not match:
            # Lines that are not well-formed entries are skipped rather than
            # aborting the whole run with a ValueError.
            continue
        pathology_name, level, side, presence_str = match.groups()
        abbr = level_side_abbr_by_name.get(pathology_name.lower())
        if abbr is None:
            continue
        # Lower-case the side so "Left"/"Right" land in the pre-initialized
        # "..._left"/"..._right" columns.
        column_name = normalize_and_convert_to_column_name(level, abbr, side.lower())
        df.at[index, column_name] = int(presence_str)
    

Query VERSA for patient pathologies

In [None]:
# Query the model once per report for the patient-level pathologies and write
# the parsed answers into the "gpt_<abbr>_patient" columns.

# The pathology list and the output instruction do not depend on the report,
# so they are built once.
patient_prompt_info = ", ".join(
    f"{info['name']} ({info['abbr']})" for info in data_store["PathologiesPatient"].values()
)
patient_output_instruction = data_store["OutputFormats"]["patient"]["output"]

for index, row in df.iterrows():
    report_text = row['impressions']
    specific_prompt = f"Given the following pathologies: {patient_prompt_info}, {patient_output_instruction}\n\nReport: {report_text}\n\n"
    # data_store is passed explicitly: the helper's signature requires it.
    enhanced_prompt = append_guidance_if_needed(report_text, specific_prompt, data_store)

    # NOTE(review): the system message is sent after the user message; chat
    # APIs conventionally expect it first. The order is kept unchanged so that
    # model outputs stay comparable with earlier runs - confirm.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": enhanced_prompt},
            {"role": "system", "content": "You are a medical expert, confident in the interpretation of radiology reports."}
        ],
        temperature=0.0,
        max_tokens=500,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        stop=["END OF LIST"]
    )
    # `content` is optional in the response; treat a missing answer as empty
    # so the row simply keeps its pd.NA values.
    analysis_result = (response.choices[0].message.content or "").strip()
    parse_and_update_patient_output(analysis_result, data_store, df, index)




In [None]:
# Write the reports together with all extracted "gpt_*" columns to Excel.
save_path = ""  # TODO: define .xlsx path to save data
if not save_path:
    # A placeholder that is valid Python produces a readable error instead of
    # a SyntaxError when the path has not been filled in yet.
    raise ValueError("Set `save_path` to the .xlsx file the results should be written to.")
df.to_excel(save_path, index=False)