In [None]:
import pandas as pd
import re
import ast
import time
import datetime
from shared_functions import get_access_token, call_gpt_with_prompt

## Helper functions

In [None]:
# string_to_date_convertor() converts differently formatted date strings to datetime.date
#
# Each entry pairs a layout regex with the strptime formats to try, in order, for that layout.
# Layouts are checked top to bottom and the first one that matches wins. The order matters
# because the patterns are only anchored at the start of the string: e.g. '01/15/2020' also
# satisfies the 2-digit-year pattern, so the 4-digit-year pattern must be tested first.
_DATE_LAYOUTS = (
    (re.compile(r'\d{1,2}\/\d{1,2}\/\d{4}'), ('%m/%d/%Y',)),
    (re.compile(r'\d{1,2}-\d{1,2}-\d{4}'), ('%m-%d-%Y',)),
    (re.compile(r'\d{1,2}-\d{1,2}\/\d{4}'), ('%m-%d/%Y',)),
    # Two-digit-year layouts are ambiguous: try month-first, then year-first.
    (re.compile(r'\d{1,2}\/\d{1,2}\/\d{1,2}'), ('%m/%d/%y', '%y/%m/%d')),
    (re.compile(r'\d{1,2}-\d{1,2}-\d{1,2}'), ('%m-%d-%y', '%y-%m-%d')),
    (re.compile(r'\d{1,2}-\d{1,2}\/\d{1,2}'), ('%y-%m/%d',)),
    (re.compile(r'\d{4}-\d{1,2}-\d{1,2}'), ('%Y-%m-%d',)),
)


def string_to_date_convertor(string):
    """Parse a date string in one of the supported layouts into a datetime.date.

    Supported layouts (checked in this order): M/D/YYYY, M-D-YYYY, M-D/YYYY,
    M/D/YY (falling back to YY/M/D), M-D-YY (falling back to YY-M-D), YY-M/D
    and YYYY-M-D.

    Args:
        string: The date text to parse.

    Returns:
        The parsed datetime.date.

    Raises:
        ValueError: If the string matches none of the supported layouts, or if it
            matches a layout but strptime rejects it for every format of that layout
            (e.g. an out-of-range month, or trailing text after the date).
    """
    for layout, formats in _DATE_LAYOUTS:
        if not layout.match(string):
            continue
        last_error = None
        for fmt in formats:
            try:
                return datetime.datetime.strptime(string, fmt).date()
            except ValueError as err:
                last_error = err
        # Only the first matching layout is attempted; surface its final parse error.
        raise last_error
    raise ValueError(f"Unrecognized date format: {string!r}")

In [None]:
# regex_search() looks for a pattern (regex_string) anywhere inside another string (original_string), ignoring case
def regex_search(regex_string, original_string):
    """Return the first case-insensitive match of regex_string in original_string, or None."""
    case_insensitive_pattern = re.compile(regex_string, flags=re.IGNORECASE)
    return case_insensitive_pattern.search(original_string)

In [None]:
# check_for_distant_mets() takes in a list of metastatic sites (which in this case is outputted by GPT-4 in a specific format)
# returns True if there is >=1 distant metastatic site within that list
# returns False if there are no distant metastatic sites (only regional metastatic sites, or no sites at all)

# A site mentioning any one of these terms is treated as regional.
_REGIONAL_TERMS = (
    'prost', 'seminal', 'ureter', 'urethra', 'rectum', 'bladder', 'levator',
    'vesical', 'obturator', 'rectal', 'hypogastric', 'sacral', 'promontory',
    'gerota', 'nodal', 'pelvic adenopathy',
)

# A site mentioning both terms of any one pair is treated as regional.
_REGIONAL_TERM_PAIRS = (
    ('pelvis', 'lymph'),
    ('pelvis', 'node'),
    ('pelvic', 'wall'),
    ('pelvic', 'node'),
    ('pelvic', 'nodal'),
)

# 'iliac' marks a site as regional only when none of these qualifiers appear with it.
_ILIAC_NON_REGIONAL_QUALIFIERS = ('common', 'bone', 'crest', 'sacro', 'joint', 'wing')


def _is_regional_site(site):
    """Return True if the site description matches any of the regional keyword rules."""
    def mentions(term):
        return re.search(term, site, re.IGNORECASE) is not None

    if any(mentions(term) for term in _REGIONAL_TERMS):
        return True
    if any(mentions(first) and mentions(second) for first, second in _REGIONAL_TERM_PAIRS):
        return True
    return mentions('iliac') and not any(
        mentions(qualifier) for qualifier in _ILIAC_NON_REGIONAL_QUALIFIERS
    )


def check_for_distant_mets(llm_output):
    """Report whether a bracketed, comma-separated site list contains a distant site.

    The text is stripped of leading/trailing square brackets and split on ', '.
    Every resulting entry that matches none of the regional keyword rules counts
    as a distant metastatic site.

    Args:
        llm_output: Site list as text, e.g. "['pelvic lymph nodes', 'L3 vertebra']".

    Returns:
        True if at least one entry is not regional, otherwise False. An empty list
        ("[]" or "") yields False.
    """
    sites = llm_output.strip('[]').split(', ')
    # Splitting an empty list leaves a single blank entry; a blank entry names no
    # site, so it must not be counted as a distant one.
    return any(
        bool(site.strip(" \t\r\n'\"")) and not _is_regional_site(site)
        for site in sites
    )

## Identifying the index metastasis report and determining timing classification

### __Dataframe creation__

Prior to this step, we ran various SQL queries to pull patient data from the Mayo Clinic Cloud platform, through which we obtained
1) PCa Diagnosis Date from a diagnostic pathology report
2) All eligible [CT, NM Bone] radiology reports (within 45 days before and any time after the date of the diagnostic pathology report)

With this information, we created a dataframe __rad_report_df__, in which each row contains the respective patient information and one radiology report. For example, if a patient had 2 radiology reports, that patient would have two rows in the df. For each patient that has multiple radiology reports, the "Rad Report Date" and "Rad Report Text" columns would be the only ones that differ between rows.

| Patient MRN | PCa Diagnosis Date | Rad Report Date | Rad Report Text|
|------|------|------|------|
| XXXXXXX | MM/DD/YYYY | MM/DD/YYYY | "Rad report text"|

__Patient MRN__: Patient identifier

__PCa Diagnosis Date__: Date of prostate cancer diagnosis

__Rad Report Date__: Date of radiology report

__Rad Report Text__: Text of radiology report

### __Index metastasis identification and timing classification:__

For each patient, the index metastasis report is the first eligible radiology report that shows evidence of distant metastasis from prostate cancer. The date of this report is indicated as the "index metastasis date".

In order to find the index metastasis report, we looped through the rad_report_df to check the radiology report texts in chronological order for each patient. We did this via a two step pipeline.

The first step utilized GPT-4 zero-shot prompting to extract all sites of metastasis (regional and distant) from a report in the form of a list. 

In the second step, this list is fed to a rule-based programming algorithm to check for presence of distant metastasis specifically.

The first report to show distant metastasis is then the index metastasis report, and the loop moves on to the next patient.

Once we know the index metastasis date, we can compare it to the PCa diagnosis date in order to obtain the timing classification. We used a simple rule-based programming algorithm: if the index metastasis date is within 6 months (183 days) of diagnosis, timing is __synchronous__. If the index metastasis date is more than 6 months after diagnosis, timing is __metachronous__.

We constructed a new empty dataframe called __timing_df__, which is filled in once our pipeline has determined which radiology report is the index metastasis report for each patient.

| Patient MRN | PCa Diagnosis Date | Index Metastasis Date | Timing |
|------|------|------|------|
| XXXXXXX | MM/DD/YYYY | MM/DD/YYYY | S or M|

__Patient MRN__: Patient identifier

__PCa Diagnosis Date__: Date of prostate cancer diagnosis

__Index Metastasis Date__: Date of index metastasis report

__Timing__: S or M (synchronous or metachronous; determined using a quick rule-based programming algorithm incorporating diagnosis date and index metastasis date)

We fill in the timing_df using identify_metastatic_sites(), which incorporates both steps of the pipeline mentioned above.

In [None]:
def identify_metastatic_sites(rad_report_df):
    """Find each patient's index metastasis report and classify its timing.

    Rows are visited in dataframe order. For each row, the report text is sent to
    GPT via call_gpt_with_prompt() to extract a list of metastatic sites, and that
    list is passed to check_for_distant_mets(). The first row per patient that shows
    distant metastasis is recorded, and the immediately following rows with the same
    MRN are skipped.

    NOTE(review): only the most recently recorded MRN is remembered, so this assumes
    each patient's rows are contiguous and sorted chronologically -- confirm upstream.

    Args:
        rad_report_df: DataFrame with columns 'Patient MRN', 'PCa Diagnosis Date',
            'Rad Report Date' and 'Rad Report Text'. The two date columns must hold
            strings accepted by string_to_date_convertor().

    Returns:
        DataFrame with columns 'Patient MRN', 'PCa Diagnosis Date',
        'Index Metastasis Date' and 'Timing'. 'Timing' is 'M' when the report is
        dated more than 183 days after diagnosis, otherwise 'S'. Patients with no
        report showing distant metastasis do not appear.
    """
    token = get_access_token()

    # The instructions and system message are identical for every report, so build them once.
    prompt_string = """
        Based on the report, if there is prostate cancer metastasis, 
        output a list of the sites of metastasis (also referred to as lesion, adenopathy) in the following format: 
        ['site 1', 'site 2', 'site 3', 'site 4', 'site 5', ...].
        If there are no sites of prostate cancer metastasis, output N/A.
        If there is no information on prostate cancer metastasis, output N/A.
        Only add a site of metastasis to the list if the report mentions a high or definite likelihood of metastasis to that site.
        For each site, do not assume metastasis if there are words that indicate uncertainty, such as 'indeterminate', 'possible', 'may', 'could', 'potential'.
        For each site, do not assume metastasis if there is only uncertain indication of metastasis such as 'faint choline activity', 
        'nonspecific', 'increased uptake', 'prominent lymph node'.
        Strictly follow the given output formats. Do not provide any additional explanation.
        """

    system_content = "Being an oncologist with specific expertise in prostate cancer, identify presence of metastasis from prostate cancer."

    synchronous_window = datetime.timedelta(days=183)
    timing_columns = ['Patient MRN', 'PCa Diagnosis Date', 'Index Metastasis Date', 'Timing']

    # Collect result rows in a list and build the dataframe once at the end;
    # concatenating inside the loop copies the whole frame on every hit.
    timing_rows = []

    # MRN of the last patient whose index metastasis report was found.
    # None (rather than 0) so that it can never equal a real MRN.
    prev_patient_mrn = None

    for _, row in rad_report_df.iterrows():
        patient_mrn = row['Patient MRN']

        # Index report already found for this patient: skip before doing any other work.
        if patient_mrn == prev_patient_mrn:
            continue

        cancer_dx_date = string_to_date_convertor(row['PCa Diagnosis Date'])
        rad_report_date = string_to_date_convertor(row['Rad Report Date'])
        report_string = row['Rad Report Text']

        user_content = (
            f"Here is a radiology report of a patient with prostate cancer: {report_string}. "
            f"{prompt_string}"
        )

        output = call_gpt_with_prompt(system_content, user_content, token)

        # Re-fetch the access token in case it expired before the loop completed.
        # Keep the fresh token so later reports do not each repeat the failed call and the wait.
        if output == "invalid response":
            time.sleep(2)
            token = get_access_token()
            output = call_gpt_with_prompt(system_content, user_content, token)

        # Ignore surrounding whitespace so that e.g. 'N/A\n' is not mistaken for a site list.
        answer = output.strip()
        if answer in ('N/A', 'invalid response') or not check_for_distant_mets(answer):
            continue

        timing = 'M' if rad_report_date - cancer_dx_date > synchronous_window else 'S'
        timing_rows.append({
            'Patient MRN': patient_mrn,
            'PCa Diagnosis Date': cancer_dx_date,
            'Index Metastasis Date': rad_report_date,
            'Timing': timing,
        })
        prev_patient_mrn = patient_mrn

    return pd.DataFrame(timing_rows, columns=timing_columns)