In [1]:
## Importing data directly from the S3 buckets is not always useful because if the data is uploaded directly
## Instead of having semantically usefile organization, it basically places everything in a single flat directory
## that is basically just organized by a unique ID for the user - which is an integer, and then an alphanumeric string
## I also wind up importing too many non-imaging files and having a single directory with 80,000 files in it
## is unwieldy
from google.cloud import bigquery
from google.cloud import dlp_v2
import girder_client
import secrets as s
import os
from tqdm.autonotebook import tqdm

project_name = "htan-dcc"
%load_ext google.cloud.bigquery

gc = girder_client.GirderClient(apiUrl=s.apiUrl)
gc.authenticate(apiKey=s.dsaApiKey)



{'_id': '5ebee66965f992910fa19570'}

In [2]:
#!gcloud auth application-default login
## Creating a class for girder interactions that allows me to both cache folders and also if it's not cached
## to create it; most improtantly I am allowing you to specify an entire dierctory structure analogous to
## os.makedirs

class dsaFolderCache:
    def __init__(self, gc,debug=False):
        self.gc = gc
        self.folderCache = {}
        self.collectionCache = {}
        self.debug = debug
        self.cacheHits = 0  ## Generate stats to keep track cache hits
        self.totalLookups = 0
        self.apiCalls = 0
        
    def getFolderId( self, dsaFolderPath ):
        ### The dsaFolderPath includes both the collection and subsequent folders
        self.totalLookups+=1
        
        return self.lookupPathInCollection(dsaFolderPath,True)

    ## if nameNotFound essentially..
    def lookupPathInCollection( self, dsaFolderPath,createIfNotFound=False):
        # Check if it exists in the cache and return if find a hit
        if dsaFolderPath in self.folderCache:
            self.cacheHits+=1
            return self.folderCache[dsaFolderPath]
        
        #Now actually query the server to see if it exists..
        try:
            f = gc.get("/resource/lookup?path=/collection/%s" % dsaFolderPath)
            self.apiCalls+=1
            if f:
                self.folderCache[dsaFolderPath] = f
                return f
        except:
            if self.debug: print("Path does not exist",dsaFolderPath)

        if not createIfNotFound:
            return None
        
        #This will split off the last/top level folder and the other part is what is left
        parentPath,folderName = os.path.split(dsaFolderPath)
        if len(parentPath) < 2:
            ## This means I do NOT create a collection 
            return None
        
        parent = self.lookupPathInCollection(parentPath,createIfNotFound)
        if parent is None: # Means I couldn't create it so need to end
            return None
        
        try:
            f = gc.createFolder(parent['_id'],folderName,parentType=parent['_modelType'])
            self.apiCalls +=1
            self.folderCache[dsaFolderPath] = f
            return f
        except:
            #print("Path does not exist",dsaFolderPath)
            return None
        
            ### This is where I fail because I don't know who the child is..
            ## So I must create it...
            ### Must now try and create the path recursively...
            
## This time should now be using cache
dsaFldr = dsaFolderCache(gc,debug=False)


In [3]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
# Explicitly create a credentials object. This allows you to use the same
# credentials for both the BigQuery and BigQuery Storage clients, avoiding
# unnecessary API calls to fetch duplicate authentication tokens.
credentials, your_project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# Make clients.
bqclient = bigquery.Client(credentials=credentials, project='htan-dcc',)
bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)

In [4]:
# Download query results.image_data_bigJoin_T
query_string = """
SELECT  *
FROM `htan-dcc.metadata.image_data_bigJoin_T` 
"""

imageData_df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
)
print(len(imageData_df),"Images in the image_data_bigJoin_T view")

12703 Images in the image_data_bigJoin_T view


In [6]:
## Iterate through the dataframe and create/upload/update the metadata depend on the flags
## The target Collection in this case is 
tgtCollection = 'DCC_SyncBigQuery_ImageData'
tgtCollection_Id = '60bf88ae1e5e1639366f900b'

## The synapse file path does not include the HTAN_Center so I first create a top level folder
## for each HTAN_Center as well as the relevant subdirectories
## Adding the position=0 prevents the annoying behavior where it will sometimes add CR/LF
t = tqdm(imageData_df.itertuples(),position=0)

## I am going to iterate through this twice.. first I am going to make sure all the folder path structure exists
for r in t:
    t.set_description('Total Queries: %s\tTotal API Calls: %s\tTotal cache hits: %s' % (dsaFldr.totalLookups,dsaFldr.apiCalls,dsaFldr.cacheHits),refresh=True)
    fullPath = os.path.join(tgtCollection, r.HTAN_Center, os.path.dirname(r.Synapse_file_path))
    dsaFldr.getFolderId(fullPath)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
## Could combine with above, but in this loop I am going to actually check and see if a file exists and if it doesn't
## import it to the assestore
t = tqdm(imageData_df.itertuples(),position=0)

## I am going to iterate through this twice.. first I am going to make sure all the folder path structure exists
for r in t:
    #print(r)
    ## This time instead of passing the dirname, I pass the full Synapse_file_path
    fullPath =  os.path.join(tgtCollection, r.HTAN_Center, os.path.dirname(r.Synapse_file_path))
    fullPathwFile = os.path.join(tgtCollection, r.HTAN_Center, r.Synapse_file_path) # Also has the filename..
     
    try:
            f = gc.get("/resource/lookup?path=/collection/%s" % fullPathwFile)
    except:
            t.set_description("Uploading %s" % fullPathwFile,refresh=True)
            if r.HTAN_Center in HTAN_Center_to_DSAAssetStoreId:
               DSAassetStoreId =  HTAN_Center_to_DSAAssetStoreId[r.HTAN_Center]
            else:
                print("Assetstore note found for HTAN Center %s " % r.HTAN_Center)
                continue
            
            ## The first part of the importPath is actually the bucket name.. need to ge
            if r.HTAN_Center in HTAN_Center_to_DSAAssetStoreId:
                assetStoreData = HTAN_Center_to_DSAAssetStoreId[r.HTAN_Center]
            else:
                continue
            ## Need to strip off the bucket path prefix
            importPath = r.Bucket_url.replace( assetStoreData['bucketPrefix'],"")
            
            destinationFolder = dsaFldr.getFolderId(fullPath)
            try:
                gc.post("assetstore/%s/import?destinationId=%s&importPath=%s&destinationType=folder" % (assetStoreData['dsaId'],destinationFolder['_id'],importPath))
            except:
                print("Import failed for %s %s %s " % (assetStoreData['dsaId'],destinationFolder['_id'],importPath))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Import failed for 5fa998df51de21dd08ca7df6 60bfc20d1e5e1639366f9050 /3409258/b31ae5ac-cb1b-41be-8eed-63cad936f2c6/reg001_cyc004_ch004_CD4.tif 


In [45]:
imageData_df

Unnamed: 0,SynapseID,HTAN_Participant_ID,HTAN_Center,Ethnicity,Gender,Race,Vital_Status,HTAN_Parent_ID,HTAN_Biospecimen_ID,Site_of_Resection_or_Biopsy,...,PhysicalSizeX,PhysicalSizeXUnit,PhysicalSizeY,PhysicalSizeYUnit,Center,Synapse_file_path,File_size,Bucket_url,GCS_timestamp,S3_timestamp
0,syn24992982,HTA12_7,HTAN WUSTL,not hispanic or latino,male,black or african american,Alive,HTA12_7,HTA12_7_102,Head of pancreas,...,0.5053,µm,0.5053,µm,HTAN WUSTL,h_and_e_level_1/HT115P1 H5.svs,109626893,gs://htan-dcc-washu/h_and_e_level_1/HT115P1 H5...,2021-02-26T00:33:46Z,
1,syn24992984,HTA12_8,HTAN WUSTL,not hispanic or latino,male,white,Alive,HTA12_8,HTA12_8_101,Head of pancreas,...,0.5053,µm,0.5053,µm,HTAN WUSTL,h_and_e_level_1/HT121P1 H3.svs,62768131,gs://htan-dcc-washu/h_and_e_level_1/HT121P1 H3...,2021-02-26T00:33:54Z,
2,syn25701100,HTA12_4,HTAN WUSTL,not hispanic or latino,male,white,Alive,HTA12_4,HTA12_4_102,Head of pancreas,...,341,µm,313,µm,HTAN WUSTL,imc_level_2/batch_1_05102021/HT064B1_H3_A1_A4_...,11964392,gs://htan-dcc-washu/imc_level_2/batch_1_051020...,2021-05-10T18:37:20Z,
3,syn25075429,HTA13_1,HTAN TNP SARDANA,Not Reported,male,white,Alive,HTA13_1_1,HTA13_1_33,Cecum,...,0.6499999762,µm,0.6499999762,µm,HTAN TNP SARDANA,imaging_level_2/WD-76845-029.ome.tif,79860366424,s3://htan-dcc-tnp-sardana/imaging_level_2/WD-7...,,2021-05-13 22:43:00+00:00
4,syn25059248,HTA11_3361,HTAN Vanderbilt,not hispanic or latino,male,white,Alive,HTA11_3361,HTA11_3361_2000001011,Ascending colon,...,0.25,µm,0.25,µm,HTAN Vanderbilt,h_and_e_level_1/HTA11_3361_2000001011611003001...,20170818576,s3://htan-dcc-vanderbilt/3380824/ca48604d-c9ce...,,2021-05-14 04:24:59+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12698,syn24997215,HTA6_1094,HTAN Duke,not hispanic or latino,female,white,Alive,HTA6_1094,HTA6_1094_1,Breast NOS,...,0.488,µm,0.488,µm,HTAN Duke,mibi_imaging_level_1/raw_tifs_masks/Point4409_...,3290,s3://htan-dcc-duke/3419599/5a4aeef8-5df6-4ef2-...,,2021-05-13 22:13:19+00:00
12699,syn24997035,HTA6_1026,HTAN Duke,not hispanic or latino,female,white,Dead,HTA6_1026,HTA6_1026_2,Breast NOS,...,0.488,µm,0.488,µm,HTAN Duke,mibi_imaging_level_1/raw_tifs_masks/Point4125_...,3238,s3://htan-dcc-duke/3419599/0e36a411-2dc6-4cce-...,,2021-05-13 22:11:32+00:00
12700,syn25003695,HTA6_1041,HTAN Duke,not hispanic or latino,female,white,Alive,HTA6_1041,HTA6_1041_1,Breast NOS,...,0.488,µm,0.488,µm,HTAN Duke,mibi_imaging_level_1/raw_tifs_masks/Point2316_...,3178,s3://htan-dcc-duke/3419599/151b1aef-a299-429a-...,,2021-05-13 22:11:42+00:00
12701,syn24999802,HTA6_1073,HTAN Duke,not hispanic or latino,female,white,Alive,HTA6_1073,HTA6_1073_1,Breast NOS,...,0.488,µm,0.488,µm,HTAN Duke,mibi_imaging_level_1/raw_tifs_masks/Point3102_...,3178,s3://htan-dcc-duke/3419599/6c0841c4-3b53-4698-...,,2021-05-13 22:13:42+00:00


In [9]:
imageData_df.Imaging_Assay_Type.value_counts()

MxIF       6467
MIBI       5564
CODEX       385
H&E         123
IMC          78
mIHC         54
t-CyCIF      25
CyCIF         7
Name: Imaging_Assay_Type, dtype: int64

In [10]:
imageData_df.HTAN_Center.value_counts()

HTAN Vanderbilt     6497
HTAN Duke           5564
HTAN Stanford        385
HTAN WUSTL           149
HTAN OHSU             61
HTAN TNP SARDANA      47
Name: HTAN_Center, dtype: int64

In [47]:
### Need to map the HTAN_Center to AssetStore ID...

HTAN_Center_to_DSAAssetStoreId = {'HTAN Vanderbilt': {'dsaId':'5fa9998151de21dd08ca7df8','bucketPrefix':'s3://htan-dcc-vanderbilt/'},
                                 'HTAN Duke':{'dsaId':'603574534a857d00e94c6afa','bucketPrefix':'s3://htan-dcc-duke'},
                                 'HTAN Stanford': {'dsaId':'5fa998df51de21dd08ca7df6','bucketPrefix':'s3://htan-dcc-stanford'},
                                 'HTAN WUSTL': {'dsaId':'5fa999a051de21dd08ca7df9','bucketPrefix':'gs://htan-dcc-washu'},
                                 'HTAN OHSU': {'dsaId':'5fa9989f51de21dd08ca7df4','bucketPrefix':'s3://htan-dcc-ohsu'},
                                 'HTAN TNP SARDANA':{'dsaId':'5faaeb599169619ffd829ab5','bucketPrefix':'s3://htan-dcc-tnp-sardana'}}