In [1]:
import datetime
import glob
import os

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pycytominer

# Resolve the output directory once, relative to where the kernel started.
# A bare os.chdir('./output/') is relative to the *current* directory, so
# re-running this cell would try to enter ./output/output/ and fail.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
OUTPUT_DIR = os.path.join(NOTEBOOK_DIR, 'output')

# Create the folder on first use so the notebook also runs on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.chdir(OUTPUT_DIR)

In [2]:
# return a list of unique plates in dataset
def dividePlates(lst, level=6):
    """Return the sorted, de-duplicated plate names found in ``lst``.

    The plate name is read from a single '/'-separated component of each path.

    Parameters
    ----------
    lst : iterable of str
        Feature-file paths (e.g. the result of ``glob.glob``).
    level : int, optional
        Index into ``p.split('/')`` of the component that holds the plate
        name. The default of 6 matches paths shaped like
        ``/share/data/cellprofiler/automation/results/<plate>/...``.

    Returns
    -------
    list of str
        Unique plate names in ascending order; empty when ``lst`` is empty.

    Raises
    ------
    IndexError
        If a path has fewer than ``level + 1`` components.
    """
    # A set removes duplicates directly; sorted() fixes the output order.
    return sorted({p.split('/')[level] for p in lst})

In [3]:
# Root directory of the CellProfiler results.
path = "/share/data/cellprofiler/automation/results/"

# Glob pattern selecting the plate folders of interest.
feat_folder = path + 'EDC*120*P*'

# One sorted list of feature files per compartment; each pattern descends two
# wildcard directory levels below the plate folder.
cells, cyto, nuclei = (
    sorted(glob.glob(feat_folder + file_pattern))
    for file_pattern in (
        '/*/*/featICF_cells*',
        '/*/*/featICF_cyto*',
        '/*/*/featICF_nuclei*',
    )
)

In [4]:
# Unique plate names, extracted from the paths of the cells feature files.
list_of_plates = dividePlates(cells)
# Optional manual subset of the plates (currently disabled):
#list_of_plates = list_of_plates[0:2] + list_of_plates[4:5]

# Display the plates that the following cells iterate over.
list_of_plates

['EDC-v2-FA-MCF7-120h-P3-L1',
 'EDC-v2-FA-MCF7-120h-P4-L2',
 'EDC-v3-FA-MCF7-120h-P1-L1',
 'EDC-v3-FA-MCF7-120h-P2-L1']

In [5]:
def _latest_feature_file(files, plate, compartment):
    """Return the newest file in ``files`` that belongs to ``plate``.

    Parameters
    ----------
    files : list of str
        Candidate feature-file paths for one compartment.
    plate : str
        Plate name; must equal one whole '/'-separated path component.
    compartment : str
        Label used only in the error message.

    Returns
    -------
    str
        The candidate with the largest ``os.path.getctime`` value.

    Raises
    ------
    FileNotFoundError
        If no file in ``files`` belongs to ``plate``.
    """
    # Compare against whole path components: a substring test (`plate in f`)
    # would let plate '...-L1' also pick up files of plate '...-L10'.
    candidates = [f for f in files if plate in f.split('/')]
    if not candidates:
        # max() on an empty list raises an uninformative ValueError.
        raise FileNotFoundError(
            "no {} feature file found for plate {}".format(compartment, plate)
        )
    # NOTE: getctime is the metadata-change time on Unix and the creation
    # time on Windows; it is used here to select the latest analysis run.
    return max(candidates, key=os.path.getctime)


# plate name -> [nuclei file, cytoplasm file, cells file].
# The order of the three entries is relied upon when the files are loaded.
d = {
    plate: [
        _latest_feature_file(nuclei, plate, 'nuclei'),
        _latest_feature_file(cyto, plate, 'cytoplasm'),
        _latest_feature_file(cells, plate, 'cells'),
    ]
    for plate in list_of_plates
}

d

{'EDC-v2-FA-MCF7-120h-P3-L1': ['/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P3-L1/2963/3110/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P3-L1/2963/3110/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P3-L1/2963/3110/featICF_cells.parquet'],
 'EDC-v2-FA-MCF7-120h-P4-L2': ['/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P4-L2/2918/2765/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P4-L2/2918/2765/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/results/EDC-v2-FA-MCF7-120h-P4-L2/2918/2765/featICF_cells.parquet'],
 'EDC-v3-FA-MCF7-120h-P1-L1': ['/share/data/cellprofiler/automation/results/EDC-v3-FA-MCF7-120h-P1-L1/2966/3149/featICF_nuclei.parquet',
  '/share/data/cellprofiler/automation/results/EDC-v3-FA-MCF7-120h-P1-L1/2966/3149/featICF_cytoplasm.parquet',
  '/share/data/cellprofiler/automation/r

In [6]:
import datetime

# Timestamp taken at the moment this cell runs.
x = datetime.datetime.now()

# Locale-formatted date ("%x") and time ("%X") strings of that timestamp.
date, time = (x.strftime(fmt) for fmt in ("%x", "%X"))

print(date, time)

03/25/23 23:25:38


In [7]:
# Channel names as they appear in the feature columns -> BROAD names.
BROAD_CHANNEL_NAMES = {
    "illumSYTO": "RNA",
    "illumCONC": "ER",
    "illumHOECHST": "DNA",
    "illumPHAandWGA": "AGP",
    "illumMITO": "Mito",
}

collected_df = []

# d maps plate name -> [nuclei file, cytoplasm file, cells file].
# Iterate over the dict items directly so every output is labelled with the
# plate it was actually built from, independent of list_of_plates.
for plate, (nuclei_file, cyto_file, cells_file) in d.items():

    # Dedicated *_df names: rebinding `nuclei` / `cells` here would overwrite
    # the file lists created by the glob cell and break a re-run of the
    # file-selection cell.
    nuclei_df = pd.read_parquet(nuclei_file).add_prefix('Nuclei_').reset_index()
    cytoplasm_df = pd.read_parquet(cyto_file).add_prefix('Cytoplasm_').reset_index()
    cells_df = pd.read_parquet(cells_file).add_prefix('Cells_').reset_index()

    #------------------- MERGE NUCLEI CYTOPLASM AND CELL objects ----------------#

    # step 1: take the mean values of multiple nuclei belonging to one cell.
    # numeric_only=True skips non-numeric columns; without it pandas >= 2.0
    # raises TypeError instead of silently dropping them.
    nuclei_df = nuclei_df.groupby(
        ["Nuclei_Metadata_Barcode", "Nuclei_Metadata_Well", "Nuclei_Metadata_Site", "Nuclei_Parent_cells"]
    ).mean(numeric_only=True)

    # step 2: merge nuclei and cytoplasm objects
    # (the left keys are the index levels produced by the groupby above)
    new_df = pd.merge(
        nuclei_df, cytoplasm_df, how='left',
        left_on=['Nuclei_Metadata_Barcode', 'Nuclei_Metadata_Well', "Nuclei_Metadata_Site", "Nuclei_Parent_cells"],
        right_on=['Cytoplasm_Metadata_Barcode', 'Cytoplasm_Metadata_Well', "Cytoplasm_Metadata_Site", "Cytoplasm_ObjectNumber"],
    )

    # step 3: join cells objects
    new_df = pd.merge(
        new_df, cells_df, how='left',
        left_on=['Cytoplasm_Metadata_Barcode', 'Cytoplasm_Metadata_Well', "Cytoplasm_Metadata_Site", "Cytoplasm_ObjectNumber"],
        right_on=['Cells_Metadata_Barcode', 'Cells_Metadata_Well', "Cells_Metadata_Site", "Cells_ObjectNumber"],
    )

    #------------------- format feature names to BROAD names --------------------------------#
    # Move the three cells metadata columns to the front under their new names.
    first_column = new_df.pop('Cells_Metadata_Barcode')
    second_column = new_df.pop('Cells_Metadata_Well')
    third_column = new_df.pop('Cells_Metadata_Site')

    new_df.insert(0, 'Metadata_plate_map_name', first_column)
    new_df.insert(1, 'Metadata_Well', second_column)
    new_df.insert(2, 'Metadata_Site', third_column)

    # ------------------ add image identifier for metadata and QC -----------------------------#
    # The regex strips a trailing ".0" from the site string
    # (presumably the site is read as a float in some cases -- TODO confirm).
    new_df['ImageID'] = new_df['Metadata_plate_map_name'] + "_" + new_df['Metadata_Well'] + "_s" + new_df['Metadata_Site'].astype(str).replace(r'\.0$', '', regex=True)

    # clean up: drop helper/index columns and the remaining per-object metadata
    new_df = new_df.loc[:, ~new_df.columns.str.contains('Unnamed|index|Cytoplasm_Meta|Cells_Meta|Nuclei_Meta|FileName|PathName|_ImageNumber')]

    # change to BROAD names; regex=False pins plain-substring replacement,
    # because the default of `regex` differs between pandas versions.
    for old_name, broad_name in BROAD_CHANNEL_NAMES.items():
        new_df.columns = new_df.columns.str.replace(old_name, broad_name, regex=False)

    print("plate", plate, "contains a total of", len(new_df), "rows")

    # new
    #new_df.to_csv("level1_agg_{}.csv".format(plate)) # save as csv
    new_df.to_parquet("level1_agg_{}.parquet".format(plate)) # save as parquet

    # save to master df
    collected_df.append(new_df)

dfs = pd.concat(collected_df)
# dfs.to_csv("level1_merge.csv", index=False) # save as csv
dfs.to_parquet("level1_merge.parquet", index=False) # save as parquet

plate EDC-v2-FA-MCF7-120h-P3-L1 contains a total of 356784 rows
plate EDC-v2-FA-MCF7-120h-P4-L2 contains a total of 795017 rows
plate EDC-v3-FA-MCF7-120h-P1-L1 contains a total of 67425 rows
plate EDC-v3-FA-MCF7-120h-P2-L1 contains a total of 203853 rows


In [8]:
# `date` and `time` were captured in an earlier cell, before the plates were
# processed, so printing them alone only repeats the start time.
# Take a fresh timestamp as well so both start and end of the run are shown.
finished = datetime.datetime.now()
print("started: ", date, time)
print("finished:", finished.strftime("%x"), finished.strftime("%X"))

03/25/23 23:25:38
