In [1]:
import re
import pandas as pd
import math

# BigQuery client used by orate() to run its lookup queries; constructed with
# no explicit arguments.
from google.cloud import bigquery
bigquery_client = bigquery.Client()

# Synapse session used by orate() to read file annotations. login() is called
# with no arguments — NOTE(review): presumably relies on credentials configured
# outside this notebook; confirm before sharing.
import synapseclient
syn = synapseclient.Synapse()
syn.login()

Welcome, Adam Taylor!



In [2]:
# Synapse ID of the file used to demonstrate orate() further down.
test_synid = 'syn24829433'

In [3]:
# Define some helper functions

def days_to_age(days):
    """Convert a count of days into completed years, using 365-day years.

    ``days`` may be anything ``int()`` accepts (an int, a float, or a numeric
    string); it is truncated to an integer before the division.
    """
    whole_days = int(days)
    years = whole_days / 365
    return math.floor(years)

# Smoke check of the helper above. The value is not displayed because this is
# not the last expression in the cell.
days_to_age(10000)

from math import log, floor


def human_format(number):
    """Format ``number`` with a metric suffix and two decimals, e.g. 1500 -> '1.50K'.

    The suffix is the largest of '', 'K', 'M', 'G', 'T', 'P' whose power of
    1000 does not exceed ``abs(number)``.

    Edge cases:
      * 0 and magnitudes below 1 are formatted without a suffix ('0.00', '0.50').
      * Negative numbers keep their sign ('-2.50K').
      * Values beyond the 'P' range stay in 'P' ('1000.00P') instead of
        running off the end of the suffix list.
      * Rounding happens after the suffix is chosen, so 999999 renders as
        '1000.00K'.
    """
    units = ['', 'K', 'M', 'G', 'T', 'P']
    k = 1000.0
    size = abs(number)

    # Pick the magnitude by comparison rather than floor(log(number, k)):
    # log() rejects 0 and negatives, yields a negative index for values below
    # 1, and can fall just short of an exact power of 1000 in floating point.
    magnitude = 0
    while magnitude < len(units) - 1 and size >= k ** (magnitude + 1):
        magnitude += 1

    return '%.2f%s' % (number / k**magnitude, units[magnitude])

def first_lower(s):
    """Return ``s`` with only its first character lower-cased.

    Falsy input (``None`` or an empty string) yields ``None``.
    """
    if s:
        head, tail = s[0], s[1:]
        return head.lower() + tail
    return None

In [8]:
# Define our main oration function

def _fetch_records(table: str, id_column: str, id_value: str) -> dict:
    """Select every row of ``table`` whose ``id_column`` equals ``id_value``.

    The value is bound as a BigQuery query parameter rather than pasted into
    the SQL text. ``table`` and ``id_column`` are interpolated directly
    (identifiers cannot be bound as parameters), so pass trusted constants only.

    Returns the result of ``DataFrame.to_dict()``, i.e.
    ``{column: {row_index: value}}``.
    """
    query = f"""
    SELECT * FROM `{table}`
    WHERE {id_column} = @id_value
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter('id_value', 'STRING', id_value)
        ]
    )
    return bigquery_client.query(query, job_config=job_config).to_dataframe().to_dict()


def orate(synid: str):
    '''Takes a Synapse ID of a HTAN Data File and returns a natural language description of the dataset

    The description is assembled from the file's Synapse annotations plus the
    first matching row of the Biospecimen, Demographics and Diagnosis tables
    in BigQuery. Progress messages are printed before each lookup.

    Args:
        synid: Synapse ID of the data file to describe.

    Returns:
        A one-paragraph description string.

    Raises:
        ValueError: if the parent biospecimen ID does not start with a
            participant ID of the form ``HTA<digits>_<digits>``.
        KeyError: if an expected annotation or table column is missing, or a
            lookup returned no row at index 0.
    '''
    # Get annotations for the requested file (not the notebook's test ID)
    annotations = syn.get_annotations(synid)

    size_x = int(annotations['SizeX'][0])
    size_y = int(annotations['SizeY'][0])
    pixels = size_x * size_y * int(annotations['SizeZ'][0])

    # Get biospecimen information
    biospecimen_id = annotations['HTANParentBiospecimenID'][0]
    print(f'Getting parent biospecimen information from {biospecimen_id}')
    biospecimen = _fetch_records(
        'htan-dcc.combined_assays.Biospecimen', 'HTAN_Biospecimen_ID', biospecimen_id
    )

    # Extract participant ID: the leading "HTA<digits>_<digits>" of the biospecimen ID
    participant_match = re.match(r'HTA\d+_\d+', biospecimen_id)
    if participant_match is None:
        raise ValueError(
            f'Cannot derive a participant ID from biospecimen ID {biospecimen_id!r}'
        )
    participant_id = participant_match.group(0)

    # Get demographics information
    print(f'Getting demographics information from {participant_id}')
    demographics = _fetch_records(
        'htan-dcc.combined_assays.Demographics', 'HTAN_Participant_Id', participant_id
    )

    # Get diagnosis information
    print(f'Getting diagnosis information from {participant_id}')
    diagnosis = _fetch_records(
        'htan-dcc.combined_assays.Diagnosis', 'HTAN_Participant_Id', participant_id
    )

    # NOTE(review): the age below is days_to_age(Collection_Days_from_Index);
    # confirm that the index date is the participant's date of birth.
    oration = (
        f"{annotations['HTANDataFileID'][0]} is a {annotations['ImagingAssayType'][0]} image of a {first_lower(biospecimen['Acquisition_Method_Type'][0])} "
        f"(Biospecimen {biospecimen_id}) "
        f"from a {days_to_age(biospecimen['Collection_Days_from_Index'][0])} {demographics['Gender'][0]} "
        f"(Participant {participant_id}) "
        f"diagnosed with {first_lower(diagnosis['Primary_Diagnosis'][0])}. "
        f"The image contains {annotations['SizeC'][0]} channels, approximately {human_format(pixels)} pixels, and measures "
        f"{math.ceil(size_x*float(annotations['PhysicalSizeX'][0]))} {annotations['PhysicalSizeXUnit'][0]} wide "
        f"by {math.ceil(size_y*float(annotations['PhysicalSizeY'][0]))} {annotations['PhysicalSizeYUnit'][0]} high"
    )
    return oration


In [9]:
# Demo: describe the test file. As the cell's last expression, the returned
# string is displayed below the printed progress messages.
orate(test_synid)

Getting parent biospecimen information from HTA9_1_17
Getting demographics information from HTA9_1
Getting diagnosis information from HTA9_1


'HTA9_1_19362 is a mIHC image of a biopsy (Biospecimen HTA9_1_17) from a 70 female (Participant HTA9_1) diagnosed with infiltrating duct carcinoma NOS. The image contains 12 channels, approximately 8.96M pixels, and measures 1939 µm wide by 1157 µm high'