# Part II: Transform Imaging Data into OMOP CDM

### Prerequisites
* Download ODBC Driver 18 from web <https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver16>
* OMOP CDM instance with the connection string and authentication information. If your database access limits IP addresses, make sure to add your IP address before running the connection strings.
* Install SQL processing package, i.e., pyodbc.
* Files or a database where the non-image data is stored: demographics, clinical assessments, labs, visits, etc.
* Locate your imaging files: local paths, cloud storage services (e.g., Microsoft Azure)

In [None]:
import requests
import pandas as pd
import pydicom
from pathlib import Path
from urllib3.filepost import encode_multipart_formdata, choose_boundary
from azure.identity import DefaultAzureCredential

In [None]:
# Install the Azure CLI package so the `az` shell commands in the following cells can run
!pip install azure-cli

In [None]:
# Sign in to Azure through the CLI; the access token requested in the next cell depends on this login
!az login

In [None]:
# Run the az command and capture its output
captured_output = !az account get-access-token --resource https://dicom.healthcareapis.azure.com --query accessToken --output tsv

# The output is a special IPython.utils.text.SList object which behaves like a list.
# If the command outputs a single line (like an access token), it will be the first element.
bearer_token = captured_output[2].strip()

In [None]:
# Connection check: send one authenticated query to the studies endpoint and
# display the response object as the cell output.
base_dicomweb_url = 'https://imgextpoc-adnidicom.dicom.azurehealthcareapis.com/v2'
dicomweb_study_query_url = f"{base_dicomweb_url}/studies/"

headers = {}
headers['Accept'] = "application/json"
headers['Authorization'] = f"Bearer {bearer_token}"

response = requests.get(dicomweb_study_query_url, data="", headers=headers)
response

In [None]:
# List the study/series UID pairs available on the DICOM server.
# No paging parameters are sent, so only the server's default page of studies
# is covered (100 studies according to the original note).
def list_series_study_uids(base_url, headers):
    """Return one row per (study, series) pair found on the DICOMweb server.

    Args:
        base_url: Base DICOMweb URL; '/studies' and '/studies/<uid>/series'
            are appended to it.
        headers: HTTP headers sent with every request (authorization, accept).

    Returns:
        A DataFrame with the columns 'StudyUID' and 'SeriesUID'. The columns
        are present even when nothing could be listed, so callers can always
        address them.
    """
    columns = ["StudyUID", "SeriesUID"]
    series_study_data = []

    # Endpoint for fetching a list of studies; this might need to be adjusted based on your server's API
    studies_url = f"{base_url}/studies"
    studies_response = requests.get(studies_url, headers=headers)
    if studies_response.status_code != 200:
        # Report the problem instead of silently returning an empty table.
        print(f"No study list returned for URL: {studies_url}. Status code: {studies_response.status_code}")
        return pd.DataFrame(series_study_data, columns=columns)

    for study in studies_response.json():
        study_uid = study['0020000D']['Value'][0]  # Study Instance UID; adjust based on actual response format

        # Endpoint for fetching series within a study; adjust as needed
        series_url = f"{base_url}/studies/{study_uid}/series"
        series_response = requests.get(series_url, headers=headers)
        if series_response.status_code != 200:
            # Keep going with the remaining studies, but make the gap visible.
            print(f"No series list returned for URL: {series_url}. Status code: {series_response.status_code}")
            continue

        for series in series_response.json():
            series_uid = series['0020000E']['Value'][0]  # Series Instance UID; adjust based on actual response format
            series_study_data.append({"StudyUID": study_uid, "SeriesUID": series_uid})

    # Convert the list of records to a DataFrame
    return pd.DataFrame(series_study_data, columns=columns)

In [None]:
def fetch_all_series_metadata(base_url, headers, series_study_df):
    """Fetch the metadata document of every series listed in *series_study_df*.

    Args:
        base_url: Base DICOMweb URL the metadata path is appended to.
        headers: HTTP headers sent with each request.
        series_study_df: DataFrame whose rows provide 'StudyUID' and 'SeriesUID'.

    Returns:
        A list with the decoded JSON body of each successful request, in row
        order. Requests that fail or return invalid JSON are reported with
        ``print`` and left out.
    """
    series_metadata = []

    for _, row in series_study_df.iterrows():
        study_uid, series_uid = row['StudyUID'], row['SeriesUID']
        metadata_url = f"{base_url}/studies/{study_uid}/series/{series_uid}/metadata"

        response = requests.get(metadata_url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch metadata for URL: {metadata_url}. Status code: {response.status_code}")
            continue

        try:
            decoded = response.json()
        except ValueError:  # JSONDecodeError is a ValueError subclass
            print(f"Invalid JSON response for URL: {metadata_url}")
        else:
            series_metadata.append(decoded)

    return series_metadata

In [None]:
def metadata_to_df(series_metadata, selected_attributes):
    """Flatten selected attributes of the series metadata into a long table.

    Args:
        series_metadata: Iterable of series, each an iterable of dicts that
            map an attribute tag to an element dict (with an optional 'Value'
            list).
        selected_attributes: Tags to keep, written without parentheses or
            commas (e.g. '00080020').

    Returns:
        A DataFrame with one row per kept attribute and the columns 'UID',
        'Attribute Tag' and 'Value'. Only the first entry of each 'Value'
        list is used; an element without 'Value' yields ''.
    """
    # Removes '(', ')' and ',' so '(0008,0020)' becomes '00080020'.
    strip_punctuation = str.maketrans("", "", "(),")

    def first_value(element):
        return element.get("Value", [""])[0]

    records = []
    all_items = (item for series in series_metadata for item in series)

    for item in all_items:
        study_uid = first_value(item.get("0020000D", {}))
        series_uid = first_value(item.get("0020000E", {}))
        uid = f"study/{study_uid}/series/{series_uid}"

        for tag, element in item.items():
            formatted_tag = tag.translate(strip_punctuation)
            if formatted_tag not in selected_attributes:
                continue
            records.append(
                {
                    "UID": uid,
                    "Attribute Tag": formatted_tag,
                    "Value": first_value(element),
                }
            )

    return pd.DataFrame(records)

In [None]:
# Headers for the DICOMweb calls: bearer-token authorization, DICOM JSON responses
headers = {}
headers['Authorization'] = f"Bearer {bearer_token}"
headers['Accept'] = 'application/dicom+json'

base_dicomweb_url = 'https://imgextpoc-adnidicom.dicom.azurehealthcareapis.com/v2'

# Build the table of study/series UID pairs found on the server
series_study_df = list_series_study_uids(base_url=base_dicomweb_url, headers=headers)

In [None]:
# Table dimensions, then the number of distinct studies and distinct series
print(series_study_df.shape)
distinct_study_count = series_study_df.StudyUID.nunique()
distinct_series_count = series_study_df.SeriesUID.nunique()
print(distinct_study_count, distinct_series_count)

In [None]:
series_metadata = fetch_all_series_metadata(base_dicomweb_url, headers, series_study_df)
# fetch_all_series_metadata returns a plain list (one entry per fetched series), which
# has no .head() method; slice it to preview the first few entries instead.
series_metadata[:5]

## Update Procedure_occurrence table

Required Fields: person_id, procedure_occurrence_id (PK), procedure_date, procedure_concept_id, procedure_type_concept_id
* Use patient ID (00100020) to get PTID then use registry_idmap to find the matching person_id
* Convert StudyDate (00080020) to procedure_date
* Study Description (00081030) to procedure_concept_id through Athena lookup - 36713262 (Brain MR)
* procedure_type_concept_id = 32817 - EHR
* Use Series UID to create procedure_occurrence_id

In [None]:
# Extract study-level information (patient ID, study description, study date) from metadata
def fetch_patient_details(base_url, headers):
    """Collect patient ID, study description and study date for each study.

    For every study returned by '<base_url>/studies', the study metadata is
    fetched and the values are read from its first metadata entry.

    Args:
        base_url: Base DICOMweb URL; '/studies' and '/studies/<uid>/metadata'
            are appended to it.
        headers: HTTP headers sent with every request.

    Returns:
        A DataFrame with the columns 'StudyUID', 'PatientID',
        'Study_Description' and 'StudyDate'. An attribute missing from the
        metadata is stored as None. Studies whose metadata cannot be fetched
        or is empty are reported with ``print`` and skipped.
    """
    columns = ["StudyUID", "PatientID", "Study_Description", "StudyDate"]
    patient_data = []

    # Endpoint for fetching a list of studies; this might need to be adjusted based on your server's API
    studies_url = f"{base_url}/studies"
    studies_response = requests.get(studies_url, headers=headers)
    if studies_response.status_code != 200:
        # Report the problem instead of silently returning an empty table.
        print(f"No study list returned for URL: {studies_url}. Status code: {studies_response.status_code}")
        return pd.DataFrame(patient_data, columns=columns)

    for study in studies_response.json():
        study_uid = study['0020000D']['Value'][0]  # Study Instance UID; adjust based on actual response format

        # Endpoint for fetching metadata of a study; adjust as needed
        metadata_url = f"{base_url}/studies/{study_uid}/metadata"
        metadata_response = requests.get(metadata_url, headers=headers)
        if metadata_response.status_code != 200:
            print(f"Failed to fetch metadata for URL: {metadata_url}. Status code: {metadata_response.status_code}")
            continue

        metadata = metadata_response.json()
        if not metadata:
            # Indexing an empty metadata list would raise IndexError.
            print(f"Empty metadata for URL: {metadata_url}")
            continue

        # All three values are taken from the first metadata entry only.
        first_entry = metadata[0]
        patient_data.append({
            "StudyUID": study_uid,
            "PatientID": first_entry.get('00100020', {}).get('Value', [None])[0],
            "Study_Description": first_entry.get('00081030', {}).get('Value', [None])[0],
            "StudyDate": first_entry.get('00080020', {}).get('Value', [None])[0],
        })

    # Convert the list of records to a DataFrame
    return pd.DataFrame(patient_data, columns=columns)

In [None]:
# !!! never commit real credentials; redact before publishing !!!

# Create database connection
import os

import pyodbc

driver = '{ODBC Driver 18 for SQL Server}'
server = 'tcp:ohdsicdm.database.windows.net'
database = 'ohdsicdm'

# Read the credentials from the environment so real values never have to be typed
# into the notebook. The placeholders are used when the variables are not set.
username = os.environ.get('OMOP_DB_USERNAME', '<username>')
password = os.environ.get('OMOP_DB_PASSWORD', '<password>')

conn_str = f'DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password}'
conn = pyodbc.connect(conn_str)
cursor = conn.cursor()

In [None]:
# Update PROCEDURE_OCCURRENCE
sql = '''
    INSERT INTO dbo.procedure_occurrence (procedure_occurrence_id, person_id, procedure_concept_id, procedure_date, procedure_type_concept_id) 
    VALUES (?,?,?,?,?)
    '''
# Column order must match the column list of the INSERT statement.
procedure_columns = [
    'procedure_occurrence_id',
    'person_id',
    'procedure_concept_id',
    'procedure_date',
    'procedure_type_concept_id',
]

try:
    for index, row in patient_details_df.iterrows():
        cursor.execute(sql, *(row[column] for column in procedure_columns))
except Exception:
    # Discard the rows inserted so far; otherwise they stay pending on the shared
    # connection and a later commit would persist a partial load.
    conn.rollback()
    raise
else:
    conn.commit()

## Update Image_occurrence table

* PK, FK: image_occurrence_id, person_id, procedure_occurrence_id, visit_occurrence_id, 
* Concept IDs: anatomic_site_concept_id (0018,0015) - snomed, modality_concept_id (0008,0060)
* wadors_uri, local_path (NA), image_occurrence_date (0008,0022), image_study_UID (0020,000D), series_UID (0020,000E)

In [None]:
# One row per distinct study/series pair. The following cells address these columns as
# 'study_uid' and 'series_uid', so rename them here; without this the merge on
# 'study_uid' raises a KeyError.
image_occurrence_staging = (
    series_attributes[["Study_UID", "Series_UID"]]
    .drop_duplicates()
    .rename(columns={"Study_UID": "study_uid", "Series_UID": "series_uid"})
)
image_occurrence_staging

In [None]:
# Attach the person / procedure columns of the matching study to every series row
study_level_columns = ["StudyUID", "person_id", "procedure_date", "procedure_occurrence_id"]
image_occurrence_staging = pd.merge(
    image_occurrence_staging,
    patient_details_df[study_level_columns],
    how='left',
    left_on="study_uid",
    right_on='StudyUID',
)

# Image occurrence ID: integer code per distinct series UID, shifted to start at 1
series_codes, _ = pd.factorize(image_occurrence_staging['series_uid'])
image_occurrence_staging['image_occurrence_id'] = series_codes + 1

# WADORS-URI: <base>/studies/<study_uid>/series/<series_uid>
base_dicom_uri = 'https://imgextpoc-adnidicom.dicom.azurehealthcareapis.com/v2'
study_uri = base_dicom_uri + '/studies/' + image_occurrence_staging['study_uid']
image_occurrence_staging['wadors_uri'] = study_uri + '/series/' + image_occurrence_staging['series_uid']

# Others: the same fixed concept IDs are applied to every row
# (presumably one anatomic site and one modality for the whole data set -- confirm against the vocabulary)
image_occurrence_staging['anatomic_site_concept_id'] = 4119359
image_occurrence_staging['modality_concept_id'] = 4013636

image_occurrence_staging

In [None]:
# Update IMAGE_OCCURRENCE
sql = '''
    INSERT INTO dbo.image_occurrence (image_occurrence_id, person_id, procedure_occurrence_id, anatomic_site_concept_id, wadors_uri, image_occurrence_date, image_study_uid, image_series_uid, modality_concept_id) 
    VALUES (?,?,?,?,?,?,?,?,?)
    '''
# Staging columns in the order of the INSERT column list. 'procedure_date' supplies
# image_occurrence_date, and the study/series UID columns supply the image_*_uid fields.
image_columns = [
    'image_occurrence_id',
    'person_id',
    'procedure_occurrence_id',
    'anatomic_site_concept_id',
    'wadors_uri',
    'procedure_date',
    'study_uid',
    'series_uid',
    'modality_concept_id',
]

try:
    for index, row in image_occurrence_staging.iterrows():
        cursor.execute(sql, *(row[column] for column in image_columns))
except Exception:
    # Discard the rows inserted so far; otherwise they stay pending on the shared
    # connection and a later commit would persist a partial load.
    conn.rollback()
    raise
else:
    conn.commit()

## Update Measurement table

* Numeric and non-numeric values have different fields
* One Image_feature row has one Measurement row.
* Create one stage table for Measurement and Image_feature input data.
* Required fields: measurement_id, person_id, measurement_concept_id, measurement_date, measurement_type_concept_id, value_as_number, measurement_source_value 

## Update Image_feature table

* PK, FK: image_feature_id, person_id, image_occurrence_id, image_feature_event_id
* Clinical domain: image_feature_event_field_concept_id (1147330 = measurement table)
* image_feature_concept_id: matching tags from the staging table 
* image_feature_type_concept_id: acquisition parameter (add a new concept id? can I use the concept class id?)
* anatomic_site_concept_id: for acquisition parameter, same as (0018,0015) - snomed