In [None]:
# !pip install pandas
# !pip install numpy
# !pip install pydicom
# !pip install matplotlib
# !pip install pillow
# !pip install dotenv


# %cd /home/azureuser/cloudfiles/code/rwep_experiments/jsam/mtb_sample_data/libs/sam_mtb_utils/
# !pip install -e .

/mnt/batch/tasks/shared/LS_root/mounts/clusters/alyssa-test/code/rwep_experiments/jsam/mtb_sample_data/libs/sam_mtb_utils
Obtaining file:///mnt/batch/tasks/shared/LS_root/mounts/clusters/alyssa-test/code/rwep_experiments/jsam/mtb_sample_data/libs/sam_mtb_utils
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: sam_mtb_utils
  Building editable for sam_mtb_utils (pyproject.toml) ... [?25ldone
[?25h  Created wheel for sam_mtb_utils: filename=sam_mtb_utils-0.1.0-0.editable-py3-none-any.whl size=2037 sha256=d7ffc77aaf5af764fd54f006bd2255ad5e1818b7ab60921847058e001d7b74ee
  Stored in directory: /tmp/pip-ephem-wheel-cache-pdx4ljv3/wheels/66/b4/58/e314f37d8fadc3229a2a97dd2e52adce692e94c0dab12e7677
Successfully built sam_mtb_utils
Installing collected packa

In [2]:
import pandas as pd
from pathlib import Path

import numpy as np


def read_parquet_from_directory(parquet_dir):
    """
    Reads the Parquet file from the specified directory.

    Args:
        base_path (str): The base path where the directories are located.
        directory_name (str): The name of the directory containing the Parquet file.

    Returns:
        pd.DataFrame: The data from the Parquet file as a pandas DataFrame.
    """
    file_frames = []
    # parquet_dir = base_path / directory_name
    for file in Path(parquet_dir).glob("*.parquet"):
        df = pd.read_parquet(file)
        file_frames.append(df)
    if file_frames:
        return pd.concat(file_frames, ignore_index=True)
    raise FileNotFoundError(f"No Parquet files found in {parquet_dir}")

In [4]:
diagnosis = "/home/azureuser/data/rwesdataforge/protege/2025-06-16-clinical_cohort_2196total+mtb_pathnotes/protege/clinical-cohort/structured-ehr/DD48_DIAGNOSIS"
df = read_parquet_from_directory(diagnosis)
lung_patients = df[df['DX_CODE'] == 'C34']

# Then exclude small cell lung cancer from histology
nsclc_patients1 = lung_patients[
    ~lung_patients['HISTOLOGY'].str.contains('small cell', case=False, na=False)
]

print(f"Number of patients with non-small cell lung cancer: {len(nsclc_patients1)}")
target_patient_ids = nsclc_patients1['PTID'].unique()

Number of patients with non-small cell lung cancer: 1464


In [5]:
parquet_dir="/home/azureuser/data/rwesdataforge/protege/2025-06-16-clinical_cohort_2196total+mtb_pathnotes/protege/clinical-cohort/structured-ehr/"

# Initialize variables for NOTES and TREATMENT dataframes
NOTES = None
TREATMENT = None

if not Path(parquet_dir).exists():
    raise FileNotFoundError(f"The directory {parquet_dir} does not exist.")

for subdir in Path(parquet_dir).iterdir():
    if subdir.is_dir():
        name = str(subdir).split('/')[-1]
        
        # Only process NOTES and TREATMENT directories
        if "NOTES" not in name and "TREATMENT" not in name:
            continue
            
        print(f"Reading Parquet files from {subdir}")
        try:
            file_frames = read_parquet_from_directory(subdir)
            # Filter the DataFrame to include only target patients
            file_frames = file_frames[file_frames['PTID'].isin(target_patient_ids)]
            
            # Collapse the dataframe so that there is only one row per patient, combining all columns   
            file_frames = file_frames.groupby('PTID').agg(
                lambda x: [val for val in x.dropna() if pd.notna(val)] or np.nan
            ).reset_index()
            
            # Add file name to the data frame columns as an "_name" suffix
            file_frames.columns = [f"{col}_{name}" if col != 'PTID' else col for col in file_frames.columns]
            
            if file_frames.empty:
                print(f"No relevant data found in {subdir} for target patients.")
                continue
            
            # Store the specific dataframes we want
            if "TREATMENT" in name:
                TREATMENT = file_frames.copy()
                print(f"Successfully read TREATMENT data: {len(file_frames)} rows")
            elif "NOTES" in name:
                NOTES = file_frames.copy()
                print(f"Successfully read NOTES data: {len(file_frames)} rows")
                
        except FileNotFoundError as e:
            print(e)

# Verify we got both dataframes
if NOTES is None:
    print("Warning: NOTES dataframe was not found")
if TREATMENT is None:
    print("Warning: TREATMENT dataframe was not found")

Reading Parquet files from /home/azureuser/data/rwesdataforge/protege/2025-06-16-clinical_cohort_2196total+mtb_pathnotes/protege/clinical-cohort/structured-ehr/DD48_NOTES
Successfully read NOTES data: 1451 rows
Reading Parquet files from /home/azureuser/data/rwesdataforge/protege/2025-06-16-clinical_cohort_2196total+mtb_pathnotes/protege/clinical-cohort/structured-ehr/DD48_TREATMENT
Successfully read TREATMENT data: 1451 rows


In [6]:
# ANALYSIS OF TOTAL AMOUNT OF NOTES IN A RECORD
print("=" * 60)
print("üìä CLINICAL NOTES ANALYSIS")
print("=" * 60)

notes_series = NOTES["DEID_NOTE_TXT_DD48_NOTES"].dropna()
print(f"üîç Analyzing notes from {len(notes_series)} patients with available note data")

# Count number of non-empty, stripped items in each comma-separated string
note_lengths = notes_series.apply(lambda x: len([i.strip() for i in x if i.strip()]))

# Print summary stats with improved formatting
note_stats = {
    "Total Patients": len(note_lengths),
    "Mean Notes/Patient": f"{note_lengths.mean():.2f}",
    "Std Deviation": f"{note_lengths.std():.2f}",
    "Minimum Notes": int(note_lengths.min()),
    "25th Percentile": f"{note_lengths.quantile(0.25):.1f}",
    "Median Notes": f"{note_lengths.median():.1f}",
    "75th Percentile": f"{note_lengths.quantile(0.75):.1f}",
    "Maximum Notes": int(note_lengths.max()),
    "Patients w/ Zero Notes": f"{(note_lengths == 0).sum()} ({(note_lengths == 0).sum()/len(note_lengths)*100:.1f}%)"
}

print("\nüìà SUMMARY STATISTICS:")
print("-" * 40)
for key, value in note_stats.items():
    print(f"  {key:<20}: {value}")

# Additional insights
total_notes = note_lengths.sum()
patients_with_notes = (note_lengths > 0).sum()
avg_notes_excluding_zeros = note_lengths[note_lengths > 0].mean() if patients_with_notes > 0 else 0

print(f"\nüí° KEY INSIGHTS:")
print("-" * 40)
print(f"  Total Notes Across All Patients: {total_notes:,}")
print(f"  Patients with Notes Available: {patients_with_notes} ({patients_with_notes/len(note_lengths)*100:.1f}%)")
if patients_with_notes > 0:
    print(f"  Avg Notes (excluding zero counts): {avg_notes_excluding_zeros:.2f}")

print("=" * 60)

üìä CLINICAL NOTES ANALYSIS
üîç Analyzing notes from 1451 patients with available note data

üìà SUMMARY STATISTICS:
----------------------------------------
  Total Patients      : 1451
  Mean Notes/Patient  : 61.50
  Std Deviation       : 57.63
  Minimum Notes       : 1
  25th Percentile     : 23.0
  Median Notes        : 46.0
  75th Percentile     : 80.0
  Maximum Notes       : 469
  Patients w/ Zero Notes: 0 (0.0%)

üí° KEY INSIGHTS:
----------------------------------------
  Total Notes Across All Patients: 89,234
  Patients with Notes Available: 1451 (100.0%)
  Avg Notes (excluding zero counts): 61.50


In [7]:
# Reorder every list in each column by ascending order of NOTE_INTERVAL_DD48_NOTES
def reorder_notes_by_interval(notes_df):
    intervals = notes_df["NOTE_INTERVAL_DD48_NOTES"]
    idx_sorted = [np.argsort(interval) for interval in intervals]
    reordered = notes_df.copy()
    for col in notes_df.columns:
        if col == "PTID":
            continue
        reordered[col] = [
            [row[i] for i in idx] if isinstance(row, list) and len(row) == len(idx) else row
            for row, idx in zip(notes_df[col], idx_sorted)
        ]
    return reordered

NOTES_reordered = reorder_notes_by_interval(NOTES)
print(NOTES_reordered.columns)
# Combine lists into comma-separated strings for all columns except PTID
for col in NOTES_reordered.columns:
    if col != "PTID":
        NOTES_reordered[f"{col}_str"] = NOTES_reordered[col].apply(lambda x: ", ".join(map(str, x)) if isinstance(x, list) else x)

# Sort by length of string in DEID_NOTE_TXT_DD48_NOTES
NOTES_reordered_sorted = NOTES_reordered.sort_values(by="DEID_NOTE_TXT_DD48_NOTES_str", key=lambda x: x.str.len(), ascending=True)

Index(['PTID', 'NOTE_INTERVAL_DD48_NOTES', 'NOTE_TYPE_DD48_NOTES',
       'NOTE_SUBTYPE_DD48_NOTES', 'ENCOUNTER_ID_DD48_NOTES',
       'DEID_NOTE_TXT_DD48_NOTES'],
      dtype='object')


In [8]:
import time

print("\n" + "=" * 50)
print("üìè NOTE CHARACTER LENGTH STATISTICS")
print("=" * 50)

start_time = time.time()

# Extract the column
notes_column = NOTES_reordered_sorted["DEID_NOTE_TXT_DD48_NOTES"]

# Step 1: Data availability check
t0 = time.time()
total_entries = len(notes_column)
non_null_entries = notes_column.notna().sum()
print(f"üîç Step 1: Found {non_null_entries}/{total_entries} entries with notes ({non_null_entries / total_entries * 100:.1f}%)")
print(f"‚è±Ô∏è Step 1 time: {time.time() - t0:.2f}s\n")

if non_null_entries == 0:
    print("‚ö†Ô∏è No non-null entries. Exiting analysis.")
else:
    # Step 2: Character length computation
    t1 = time.time()
    char_lengths = notes_column.dropna().apply(lambda x: len(str(x)))
    print(f"üßÆ Step 2: Computed character lengths for {len(char_lengths)} entries")
    print(f"‚è±Ô∏è Step 2 time: {time.time() - t1:.2f}s\n")

    # Step 3: Summary statistics
    t2 = time.time()
    print("üìä Character Length Stats:")
    print(f"   Mean:   {char_lengths.mean():.1f}")
    print(f"   Median: {char_lengths.median():.1f}")
    print(f"   Std:    {char_lengths.std():.1f}")
    print(f"   Min:    {char_lengths.min():,}")
    print(f"   Max:    {char_lengths.max():,}")
    print(f"   25%:    {char_lengths.quantile(0.25):.1f}")
    print(f"   75%:    {char_lengths.quantile(0.75):.1f}")
    print(f"‚è±Ô∏è Step 3 time: {time.time() - t2:.2f}s\n")

print(f"üèÅ Total analysis time: {time.time() - start_time:.2f} seconds")
print("=" * 50)



üìè NOTE CHARACTER LENGTH STATISTICS
üîç Step 1: Found 1451/1451 entries with notes (100.0%)
‚è±Ô∏è Step 1 time: 0.00s



üßÆ Step 2: Computed character lengths for 1451 entries
‚è±Ô∏è Step 2 time: 5.37s

üìä Character Length Stats:
   Mean:   395855.2
   Median: 287123.0
   Std:    367090.6
   Min:    4,542
   Max:    2,818,715
   25%:    141392.5
   75%:    527567.0
‚è±Ô∏è Step 3 time: 0.00s

üèÅ Total analysis time: 5.38 seconds


In [9]:
# ANALYSIS OF INTERVALS OF PATIENT NOTES
print("\n" + "=" * 50)
print("üìÖ PATIENT NOTES INTERVAL ANALYSIS")
print("=" * 50)

interval_series = NOTES_reordered_sorted["NOTE_INTERVAL_DD48_NOTES_str"].dropna().astype(str)

# Convert to lists of floats, ignoring empty values
interval_lists = interval_series.apply(
    lambda x: [float(i.strip()) for i in x.split(',') if i.strip()]
)

# Compute max - min (range) per row
interval_diffs = interval_lists.apply(lambda x: max(x) - min(x) if len(x) > 1 else 0)

# Patient distribution
single_note_patients = (interval_lists.apply(len) <= 1).sum()
multi_note_patients = (interval_lists.apply(len) > 1).sum()

print(f"üìä Data: {multi_note_patients}/{len(interval_series)} patients with multiple notes ({multi_note_patients/len(interval_series)*100:.0f}%)")
print(f"üìà Spans: Mean={interval_diffs.mean():.0f} days, Median={interval_diffs.median():.0f} days, Max={interval_diffs.max():.0f} days")

if multi_note_patients > 0:
    multi_note_spans = interval_diffs[interval_diffs > 0]
    avg_years = multi_note_spans.mean() / 365.25
    max_years = interval_diffs.max() / 365.25
    
    # Time span categories
    short_term = (multi_note_spans <= 30).sum()
    long_term = (multi_note_spans > 365).sum()
    
    print(f"‚è±Ô∏è  Follow-up: Avg={avg_years:.1f} years, Max={max_years:.1f} years")
    print(f"üéØ Duration: {short_term} short-term (‚â§30d), {long_term} long-term (>1yr)")

print("=" * 50)


üìÖ PATIENT NOTES INTERVAL ANALYSIS
üìä Data: 1447/1451 patients with multiple notes (100%)
üìà Spans: Mean=726 days, Median=406 days, Max=3793 days
‚è±Ô∏è  Follow-up: Avg=2.0 years, Max=10.4 years
üéØ Duration: 66 short-term (‚â§30d), 768 long-term (>1yr)


In [10]:
# ANALYSIS OF TREATMENT DATA
print("\n" + "=" * 50)
print("üíä ANTINEOPLASTIC TREATMENT ANALYSIS")
print("=" * 50)

# Data overview
total_patients = len(TREATMENT)
patients_with_treatment_data = TREATMENT["ANTINEOPLASTIC_DD48_TREATMENT"].notna().sum()

# Filter for non-empty treatments
nonzero_mask = TREATMENT["ANTINEOPLASTIC_DD48_TREATMENT"].dropna().apply(
    lambda x: len([i.strip() for i in x if i.strip()]) > 0
)
TREATMENT_filtered = TREATMENT.loc[TREATMENT["ANTINEOPLASTIC_DD48_TREATMENT"].notna()]
TREATMENT_filtered = TREATMENT_filtered[nonzero_mask]
patients_with_actual_treatments = len(TREATMENT_filtered)

print(f"üìä Data: {patients_with_actual_treatments}/{total_patients} patients with treatments ({patients_with_actual_treatments/total_patients*100:.1f}%)")

if patients_with_actual_treatments > 0:
    # Count treatments per patient
    list_lengths = TREATMENT_filtered["ANTINEOPLASTIC_DD48_TREATMENT"].apply(
        lambda x: len([i.strip() for i in x if i.strip()])
    )

    # Key statistics
    print(f"üìà Stats: Mean={list_lengths.mean():.1f}, Median={list_lengths.median():.1f}, Max={list_lengths.max()}")
    
    # Treatment patterns
    single = (list_lengths == 1).sum()
    multiple = (list_lengths > 1).sum()
    complex_cases = (list_lengths > 5).sum()
    
    print(f"üéØ Patterns: {single} single ({single/len(list_lengths)*100:.0f}%), {multiple} multiple ({multiple/len(list_lengths)*100:.0f}%), {complex_cases} complex (6+)")
    print(f"üíä Total treatments: {list_lengths.sum():,}")

else:
    print("‚ö†Ô∏è  No treatment data found")

print("=" * 50)


üíä ANTINEOPLASTIC TREATMENT ANALYSIS
üìä Data: 608/1451 patients with treatments (41.9%)
üìà Stats: Mean=2.8, Median=3.0, Max=8
üéØ Patterns: 93 single (15%), 515 multiple (85%), 22 complex (6+)
üíä Total treatments: 1,717


In [11]:
notes_ordered_reindexed = NOTES_reordered_sorted.reset_index(drop=False)
output_path = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/patient_notes_raw_split.csv"
notes_ordered_reindexed.to_csv(output_path, index=False)
print(f"‚úÖ Saved to {output_path}")



‚úÖ Saved to /home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/patient_notes_raw_split.csv


In [13]:
import os
df_original = pd.read_csv("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/patient_notes_raw.csv", index_col=0)  # make sure index_col is patient ID
df_original.reset_index(inplace=True)  # Reset index to have PTID as a column
id_to_ptid_map = df_original["PTID"].to_dict() # Assuming index is patient ID

# Step 2: Get consistent and inconsistent patient IDs (from filenames)
def get_patient_ids_from_path(path, keyword="patient_", suffix=".json"):
    return {
        fname.split(".")[0].replace(keyword, "")
        for fname in os.listdir(path)
        if fname.startswith(keyword) and fname.endswith(suffix)
    }

consist_path = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/old/consistency_bench/consistency_bench_gpt-4.1/consistent"
inconsist_path = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/old/consistency_bench/consistency_bench_gpt-4.1/inconsistent"

consist_ids = get_patient_ids_from_path(consist_path)
inconsist_ids = get_patient_ids_from_path(inconsist_path)

# Step 3: Combine all relevant IDs
relevant_ids = consist_ids.union(inconsist_ids)
print(f"Found {len(relevant_ids)} relevant patient IDs from consistency benchmark files.")
relevant_ptids = {id_to_ptid_map[int(pid)] for pid in relevant_ids}

df_original.reset_index(inplace=True) 
filtered_df = df_original[df_original["PTID"].isin(relevant_ptids)]

Found 360 relevant patient IDs from consistency benchmark files.


In [14]:
df_split = pd.read_csv("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/patient_notes_raw_split.csv", index_col=0)  # make sure index_col is patient ID
filtered_df_split = df_split[df_split["PTID"].isin(relevant_ptids)]
assert set(filtered_df["PTID"]) == set(filtered_df_split["PTID"]), "Mismatch in PTID values between dataframes"

In [15]:
import ast

def parse_list_str(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            return x  # return original if parsing fails
    else:
        return x

filtered_df_split["DEID_NOTE_TXT_DD48_NOTES"] = filtered_df_split["DEID_NOTE_TXT_DD48_NOTES"].apply(parse_list_str)
filtered_df_split["NOTE_TYPE_DD48_NOTES"]= filtered_df_split["NOTE_TYPE_DD48_NOTES"].apply(parse_list_str)
filtered_df_split["NOTE_INTERVAL_DD48_NOTES"] = filtered_df_split["NOTE_INTERVAL_DD48_NOTES"].apply(parse_list_str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_split["DEID_NOTE_TXT_DD48_NOTES"] = filtered_df_split["DEID_NOTE_TXT_DD48_NOTES"].apply(parse_list_str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_split["NOTE_TYPE_DD48_NOTES"]= filtered_df_split["NOTE_TYPE_DD48_NOTES"].apply(parse_list_str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [16]:
# mercor_ids_consistent = get_patient_ids_from_path("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/Mercor/consistent", "note_", ".txt")
    # mercor_ids_inconsistent = get_patient_ids_from_path("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/Mercor/inconsistent", "note_", ".txt")
    # mercor_ids = mercor_ids_consistent.union(mercor_ids_inconsistent)
    # print(f"Found {len(mercor_ids)} Mercor patient IDs")
    # centaur_ids = get_patient_ids_from_path("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/Centaur/converted_json", "patient_", ".json")
# print(f"Found {len(centaur_ids)} Centaur patient IDs")
linkedinIDs= get_patient_ids_from_path("/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/data_labelling/LinkedIn", "patient_", ".txt")
print(f"Found {len(linkedinIDs)} LinkedIn patient IDs")


Found 21 LinkedIn patient IDs


In [17]:
from datetime import datetime, timedelta

def add_days_to_iso8601(date_str: str, days_to_add: int) -> str:
    # Parse the ISO 8601 date string
    date_obj = datetime.fromisoformat(date_str)
    
    # Add days
    new_date = date_obj + timedelta(days=days_to_add)
    
    # Return in ISO 8601 format
    return new_date.isoformat()

In [18]:
import re
from datetime import datetime, timedelta

def replace_date_tokens(text: str, base_date: str) -> str:
    """
    Replace [DATE: <int>] tokens in the text with an actual date based on base_date + int days.

    Args:
        text (str): The input text containing [DATE: <int>] tokens.
        base_date (datetime): The reference date to compute new dates from.

    Returns:
        str: The text with [DATE: <int>] replaced by actual dates in YYYY-MM-DD format.
    """
    base_date = datetime.fromisoformat(base_date)
    def replacer(match):
        offset = int(match.group(1))
        new_date = base_date + timedelta(days=offset)
        return f"[DATE: {new_date.strftime('%Y-%m-%d')}]"

    return re.sub(r"\[DATE:\s*(-?\d+)\]", replacer, text)

In [19]:
import json

filtered_df_split.reset_index(drop=True, inplace=True)  # Reset index to avoid issues with JSON serialization
mercor_location = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/Mercor/patient_viewer"
centaur_location = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/Centaur/patient_viewer"
linkedin_location = "/home/azureuser/cloudfiles/code/rwep_experiments/alyssa/RLFollow_clean/data/data_labelling/LinkedIn"
mercor_ids=[]
centaur_ids=[]
for i in range(360):
    final_location = None
    if str(i) in mercor_ids:
        final_location = f"{mercor_location}/patient_{i}.json"
    elif str(i) in centaur_ids:
        final_location = f"{centaur_location}/patient_{i}.json"
    elif str(i) in linkedinIDs:
        final_location = f"{linkedin_location}/patient_{i}.json"
    else:
        print(f"Patient ID {i} not found in Mercor or Centaur datasets, skipping...")
        continue
    patient = filtered_df_split.iloc[i]
    index = filtered_df.index[filtered_df['PTID'] == patient['PTID']][0]
    assert index == i, f"Index mismatch for patient {patient['PTID']}: expected {i}, got {index}"
    # Create a JSON object for each patient
    total_notes = len(patient["DEID_NOTE_TXT_DD48_NOTES"])
    all_jsons = []
    date_original = "1880-01-01"
    for j in range(total_notes):
        ptid = patient["PTID"]+"_"+str(j)
        type = patient["NOTE_TYPE_DD48_NOTES"][j]
        date = add_days_to_iso8601(date_original, patient["NOTE_INTERVAL_DD48_NOTES"][j])
        text = patient["DEID_NOTE_TXT_DD48_NOTES"][j]
        text=replace_date_tokens(text,date_original)
        patient_json = {"id": ptid, "type": type, "date": date, "title": "patient notes", "text": text}
        all_jsons.append(patient_json)
    
# Save the JSON objects to a file or process them as needed
    # For example, you can save them to a file named "patient_notes.json"
    with open(final_location, 'w') as json_file:
        json.dump(all_jsons, json_file, indent=4)
    print(f"Saved patient {patient['PTID']} notes to {final_location}")


Patient ID 0 not found in Mercor or Centaur datasets, skipping...
Patient ID 1 not found in Mercor or Centaur datasets, skipping...
Patient ID 2 not found in Mercor or Centaur datasets, skipping...
Patient ID 3 not found in Mercor or Centaur datasets, skipping...
Patient ID 4 not found in Mercor or Centaur datasets, skipping...
Patient ID 5 not found in Mercor or Centaur datasets, skipping...
Patient ID 6 not found in Mercor or Centaur datasets, skipping...
Patient ID 7 not found in Mercor or Centaur datasets, skipping...
Patient ID 8 not found in Mercor or Centaur datasets, skipping...
Patient ID 9 not found in Mercor or Centaur datasets, skipping...
Patient ID 10 not found in Mercor or Centaur datasets, skipping...
Patient ID 11 not found in Mercor or Centaur datasets, skipping...
Patient ID 12 not found in Mercor or Centaur datasets, skipping...
Patient ID 13 not found in Mercor or Centaur datasets, skipping...
Patient ID 14 not found in Mercor or Centaur datasets, skipping...
Patie