# PUTATIVE WORKFLOW


## WORKFLOW EDITOR PLUGIN
- FINE-TUNE SEGMENTATIONS
  - export workflow.jsons
    - masks:
      - nuclei
      - cellmask
      - cytoplasm
    - organelles:
      - lyso
      - mito
      - golgi
      - perox
      - ER
      - LD


## BATCHPROCESS WORKFLOW
- BATCH PROCESS
  - load workflow.jsons for: 
  1. masks
    - export: masks .tiff as stack (nuclei, cellmask, cytoplasm)
  2. organelles
    - export individual .tiffs



## NOTEBOOK ~~OR ***FUTURE*** PLUGIN~~
- COLLECT ORGANELLE STATS
  - extract masks.tiffs as individual
    - nuclei, cellmask, cytoplasm
  - collect regionprops for all organelles
    - export .csvs


## NOTEBOOK ~~OR __FUTURE__ PLUGIN~~
- SUMMARIZE STUDY DATA
  - munge .csv to create summary stats across all cells/images




_____________

## TO DO
- add "segmentation name" field instead of copying from workflow.json name


- choose alternate conf_XXX.json location. 
  - strategy:  add to "prebuilt" list from path


  
  ## FILE NAME CONVENTIONS

  raw file name is kept.

  PREFIX = "segmentation name" or regionprop name.  e.g. 
  SUFFIX = "description" i.e. 

In [1]:
### These is the same import code that all of the other notebooks have
# top level imports
from pathlib import Path
import os, sys
from typing import Optional, Union, Dict, List

import numpy as np
import pandas as pd

import napari

### import local python functions in ../infer_subc
sys.path.append(os.path.abspath((os.path.join(os.getcwd(), '..'))))

from infer_subc.core.file_io import (read_czi_image,
                                        export_inferred_organelle,
                                        import_inferred_organelle,
                                        export_tiff,
                                        list_image_files)



from infer_subc.constants import *
from infer_subc.utils.stats import *
from infer_subc.utils.stats_helpers import *



import time
%load_ext autoreload
%autoreload 2



In [2]:
# this will be the example image for testing the pipeline below
# build the datapath
# all the imaging data goes here.
data_root_path = Path("C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis")

# linearly unmixed ".czi" files are here
int_data_path = data_root_path / "test_files"
im_type = ".czi"

# get the list of all files
img_file_list = list_image_files(int_data_path,im_type)

# save output ".tiff" files here
out_data_path = data_root_path / "20230606_out"

if not Path.exists(out_data_path):
    Path.mkdir(out_data_path)
    print(f"making {out_data_path}")

In [3]:
str(int_data_path).split("\\")

['C:',
 'Users',
 'redre',
 'Documents',
 'CohenLab',
 'MSI-3D-analysis',
 '20230606_test_files_practice_analysis',
 'test_files']

In [4]:
### I made a function to make this all cleaner
v = list_image_files(out_data_path,"csv")
for i in range(len(v)):
    print((v[i]).stem.split("-")[0] + "-" + (v[i]).stem.split("-")[1])
stoom = (v[0]).stem.split("-")[0] + "-" + (v[1]).stem.split("-")[1]
stoom
def correction(name : Path):
    return (name).stem.split("-")[0] + "-" + (name).stem.split("-")[1]
correction(v[0])

24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_2_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +

'24hrs-Ctrl +oleicAcid 50uM_2_Unmixing'

In [5]:
# I guess they are using this as an example image, I changed it to the one I used in the previous documents
im_path = Path(img_file_list[1])
im_path

WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi')

## 1. get each unique cells accouding to filename


### extract ID. e.g.

### process each cell & summarize



## 2. aggregate all cells into a database 

In [6]:
# I just wanted to mess with path
print(im_path.stem)
print(im_path.name)
print(im_path.drive)
print(im_path.anchor)

24hrs-Ctrl +oleicAcid 50uM_3_Unmixing
24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi
C:
C:\


In [7]:
# The name method for paths takes in the file name without the path
# cell_ids is the first part of the stem before the dash and with the new data included, there are three
full_name = im_path.name

cell_ids = [ Path(fn).stem.split("-")[0] for fn in img_file_list]
cell_ids = list(set(cell_ids))

masks_postfix = "masks2"
organelle_postfix = ["lyso", "mito","golgi","perox","ER","LD"]


In [8]:
cell_ids

['a24hrs', '24hrs', 'a48hrs']

In [9]:
# MASK process
# 1. get a listof all files based on a "prefix" and "suffix" for a given path
# dump three .tiff from the mask multichannel tiff
# from tifffile import imwrite, imread#, tiffcomment
from infer_subc.core.img import label_uint16
from infer_subc.core.file_io import export_tiff, read_tiff_image
from typing import Union
from pathlib import Path

def _explode_mask(mask_path: Union[Path,str], postfix: str= "masks", im_type: str = ".tiff") -> bool:
    """ 
    TODO: add loggin instead of printing
        append tiffcomments with provenance
    """
    if isinstance(mask_path, str): mask_path = Path(mask_path)
    # load image 
    full_stem = mask_path.stem
    if full_stem.endswith(postfix):
        stem = full_stem.rstrip(postfix)
        image = read_tiff_image(mask_path)
        assert image.shape[0]==3
        
        # make into np.uint16 labels
        nuclei = label_uint16(image[0])
        # export as np.uint8 (255)
        cellmask = image[1]>0            
        cytoplasm = image[2]>0

        # write wasks
        root_stem = mask_path.parent / stem
        # ret1 = imwrite(f"{root}nuclei{stem}", nuclei)
        ret1 = export_tiff(nuclei, f"{stem}nuc", mask_path.parent, None)
        # ret2 = imwrite(f"{root}cellmask{stem}", cellmask)
        ret2 = export_tiff(cellmask, f"{stem}cell", mask_path.parent, None)
        # ret3 = imwrite(f"{root}cytosol{stem}", cytosol)
        ret3 = export_tiff(cytoplasm, f"{stem}cyto", mask_path.parent, None)

        print(f"wrote {stem}-{{nuc,cell,cyto}}")
        return True
    else:
        return False



def _explode_masks(root_path: Union[Path,str], postfix: str= "masks", im_type: str = ".tiff"):
    """  
    TODO: add loggin instead of printing
        append tiffcomments with provenance
    """
    if isinstance(root_path, str): root_path = Path(root_path)
    img_file_list = list_image_files(root_path,im_type, postfix)
    wrote_cnt = 0
    for img_f in img_file_list:
        if _explode_mask(img_f, postfix=postfix, im_type=im_type): wrote_cnt += 1
        else: print(f"failed to explode {img_f}")
    else:
        print(f"how thefark!!! {img_f}")

    print(f"exploded {wrote_cnt*100./len(img_file_list)} pct of {len(img_file_list)} files")
    return wrote_cnt



In [10]:
from infer_subc.utils.batch import explode_masks

### The explode masks function takes in the masks from the batch files and extracts the nuclei, cellmask and cytoplasm masks

cnt = explode_masks(out_data_path, postfix='masks')
cnt

exploded 100.0 pct of 13 files


13

In [11]:


# all the imaging data goes here.
data_root_path = Path("C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis")
# linearly unmixed ".czi" files are here
raw_data_path = data_root_path / "test_files"
# save output ".tiff" files here
int_data_path = data_root_path / "20230606_out"
# save stats here
out_data_path = data_root_path / "20230606_out"


In [12]:
raw_path = raw_data_path
int_path = int_data_path
out_path = out_data_path


if isinstance(raw_path, str): raw_path = Path(raw_path)
if isinstance(int_path, str): int_path = Path(int_path)
if isinstance(out_path, str): out_path = Path(out_path)

img_file_list = list_image_files(raw_path,".czi")

if not Path.exists(out_path):
    Path.mkdir(out_path)
    print(f"making {out_path}")



In [13]:
img_file_list

[WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_2_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/a24hrs-Ctrl +oleicAcid 50uM_10_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/a24hrs-Ctrl +oleicAcid 50uM_4_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/a24hrs-Ctrl_10_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/a24hrs-Ctrl_14_Unmixing.czi'),
 WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test

In [14]:
from typing import Optional, Union, Dict, List

def _find_segmentation_tiff_files(prototype:Union[Path,str], organelles: List[str], int_path: Union[Path,str]) -> Dict:
    """
    find the nescessary image files based on prototype, the organelles involved, and paths
    """

    # raw
    prototype = Path(prototype)
    if not prototype.exists():
        print(f"bad prototype. please choose an existing `test_files` file as prototype")
        return dict()
    # make sure protoype ends with czi

    out_files = {"test_files":prototype}

    int_path = Path(int_path) 
    # raw
    if not int_path.is_dir():
        print(f"bad path argument. please choose an existing path containing organelle segmentations")
        return out_files
    
    # cyto, cellmask
    cyto_nm = int_path / f"{prototype.stem}-cyto.tiff"
    if cyto_nm.exists():
        out_files["cyto"] = cyto_nm
    else:
        print(f"cytosol mask not found.  We'll try to extract from masks ")
        if explode_mask(int_path / f"{prototype.stem}-masks.tiff"): 
            out_files["cyto"] = cyto_nm
        else: 
            print(f"failed to explode {prototype.stem}-masks.tiff")
            return out_files
    
    cellmask_nm = int_path / f"{prototype.stem}-cellmask.tiff"
    if  cellmask_nm.exists():
        out_files["cellmask"] = cellmask_nm
    else:
        print(f"cellmask file not found in {int_path} returning")
        out_files["cellmask"] = None

    # organelles
    for org_n in organelles:
        org_name = Path(int_path) / f"{prototype.stem}-{org_n}.tiff"
        if org_name.exists(): 
            out_files[org_n] = org_name
        else: 
            print(f"{org_n} .tiff file not found in {int_path} returning")
            out_files[org_n] = None
    
    return out_files

    


In [15]:
# This function finds your czi image as well as all of the tiff files
def find_segmentation_tiff_files(prototype:Union[Path,str], organelles: List[str], int_path: Union[Path,str]) -> Dict:
    """
    find the nescessary image files based on protype, the organelles involved, and paths
    """

    # raw
    prototype = Path(prototype)
    if not prototype.exists():
        print(f"bad prototype. please choose an existing `raw` file as prototype")
        return dict()
    # make sure protoype ends with czi

    out_files = {"raw":prototype}

    int_path = Path(int_path) 
    # raw
    if not int_path.is_dir():
        print(f"bad path argument. please choose an existing path containing organelle segmentations")
        return out_files
    
    # cyto, cellmask
    cyto_nm = int_path / f"{prototype.stem}-20230606_testnrm_cyto.tiff"
    if cyto_nm.exists():
        out_files["cyto"] = cyto_nm
    else:
        print(f"cytosol mask not found.  We'll try to extract from masks ")
        if explode_mask(int_path / f"{prototype.stem}-20230606_testnrm_masks.tiff"): 
            out_files["cyto"] = cyto_nm
        else: 
            print(f"failed to explode {prototype.stem}-20230606_testnrm_masks.tiff")
            return out_files
    
    cellmask_nm = int_path / f"{prototype.stem}-20230606_testnrm_cell.tiff"
    if  cellmask_nm.exists():
        out_files["cell"] = cellmask_nm
    else:
        print(f"cellmask file not found in {int_path} returning")
        out_files["cell"] = None

    # organelles
    for org_n in organelles:
        org_name = Path(int_path) / f"{prototype.stem}-20230606_testnrm_{org_n}.tiff"
        if org_name.exists(): 
            out_files[org_n] = org_name
        else: 
            print(f"{org_n} .tiff file not found in {int_path} returning")
            out_files[org_n] = None
    
    if "nuc" not in organelles:
        nuc_nm = int_path / f"{prototype.stem}-20230606_testnrm_nuc.tiff"
        if  nuc_nm.exists():
            out_files["nuc"] = nuc_nm
        else:
            print(f"nuc file not found in {int_path} returning")
            out_files["nuc"] = None



    return out_files

In [16]:
### from infer_subc.utils.batch import find_segmentation_tiff_files
from infer_subc.utils.batch import explode_mask
prototype = 'C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi'

organelles = ["nuc","lyso", "mito","golgi","perox","ER","LD"]

filez = find_segmentation_tiff_files(prototype, organelles, out_data_path)



In [17]:
filez

{'raw': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi'),
 'cyto': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/20230606_out/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing-20230606_testnrm_cyto.tiff'),
 'cell': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/20230606_out/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing-20230606_testnrm_cell.tiff'),
 'nuc': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/20230606_out/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing-20230606_testnrm_nuc.tiff'),
 'lyso': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/20230606_out/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing-20230606_testnrm_lyso.tiff'),
 'mito': WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/2

In [18]:
viewer = napari.Viewer()

In [19]:
scale = read_czi_image("C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/test_files/24hrs-Ctrl +oleicAcid 50uM_3_Unmixing.czi")[1]["scale"]

  d = to_dict(os.fspath(xml), parser=parser, validate=validate)


In [20]:
cyto_nrm = read_tiff_image(filez['cyto'])
nuc_nrm = read_tiff_image(filez['nuc'])
lyso_nrm = read_tiff_image(filez['lyso'])
mito_nrm = read_tiff_image(filez['mito'])
golgi_nrm = read_tiff_image(filez['golgi'])
perox_nrm = read_tiff_image(filez['perox'])
ER_nrm = read_tiff_image(filez['ER'])
LD_nrm = read_tiff_image(filez['LD'])

viewer.add_image((cyto_nrm), scale=scale, blending="additive")
viewer.add_image((nuc_nrm), scale=scale, blending="additive")
viewer.add_image((lyso_nrm), scale=scale, blending="additive")
viewer.add_image((mito_nrm), scale=scale, blending="additive")
viewer.add_image((golgi_nrm), scale=scale, blending="additive")
viewer.add_image((perox_nrm), scale=scale, blending="additive")
viewer.add_image((ER_nrm), scale=scale, blending="additive")
viewer.add_image((LD_nrm), scale=scale, blending="additive")

<Image layer 'LD_nrm' at 0x2b8c70cc790>

In [21]:
from infer_subc.utils.stats_helpers import make_organelle_stat_tables, dump_all_stats_tables
from infer_subc.constants import *
from infer_subc.organelles import *
from infer_subc.core.file_io import read_tiff_image, read_czi_image

# names of organelles we have
organelle_names = ["nuc","lyso", "mito","golgi","perox","ER","LD"]

# get the intensities
organelle_channels = [NUC_CH, LYSO_CH,MITO_CH,GOLGI_CH,PEROX_CH,ER_CH,LD_CH]



In [22]:
# makes the stats tables if the paths exist
# for a list of "prefixes"  collect stats + cross stats masked by cytosol (including nuclei masked by cellmask)

def _dump_all_stats_tables(int_path: Union[Path,str], 
                   out_path: Union[Path, str], 
                   raw_path: Union[Path,str], 
                   organelle_names: List[str]= ["nuclei","golgi","peroxi"], 
                   organelle_chs: List[int]= [NUC_CH,GOLGI_CH, PEROX_CH], 
                    ) -> int :
    """  
    TODO: add loggin instead of printing
        append tiffcomments with provenance
    """

    
    if isinstance(raw_path, str): raw_path = Path(raw_path)
    if isinstance(int_path, str): int_path = Path(int_path)
    if isinstance(out_path, str): out_path = Path(out_path)
    
    img_file_list = list_image_files(raw_path,".czi")

    if not Path.exists(out_path):
        Path.mkdir(out_path)
        print(f"making {out_path}")
        
    for img_f in img_file_list:
        filez = find_segmentation_tiff_files(img_f, organelle_names, int_path)
        img_data,meta_dict = read_czi_image(filez["raw"])

        # load organelles and masks
        cyto_mask = read_tiff_image(filez["cyto"])
        cellmask_obj = read_tiff_image(filez["cell"])



        # create intensities from raw as list
        intensities = [img_data[ch] for ch in organelle_chs]

        # load organelles as list
        organelles = [read_tiff_image(filez[org]) for org in organelle_names]
        
        #get mask (cyto_mask)
        nuclei_obj = organelles[ organelle_names.index("nuc") ]

        n_files = make_organelle_stat_tables(organelle_names, 
                                      organelles,
                                      intensities, 
                                      nuclei_obj,
                                      cellmask_obj,
                                      cyto_mask, 
                                      out_data_path, 
                                      img_f,
                                      n_rad_bins=5,
                                      n_zernike=9)

    return n_files



In [24]:
bb = read_czi_image(data_root_path / "test_files/24hrs-Ctrl +oleicAcid 50uM_2_Unmixing.czi")
aa = read_tiff_image(data_root_path / "20230606_out/24hrs-Ctrl +oleicAcid 50uM_2_Unmixing-20230606_testnrm_lyso.tiff")
cc 
get_summary_stats_3D = ()

  d = to_dict(os.fspath(xml), parser=parser, validate=validate)


NameError: name 'cc' is not defined

In [36]:
# int_path is the path where the tiff files live, out_path is where the stats will live
# raw_path is where the czi files are
def dump_all_stats_tables(int_path: Union[Path,str], 
                   out_path: Union[Path, str], 
                   raw_path: Union[Path,str], 
                   organelle_names: List[str]= ["nuclei","golgi","peroxi"], 
                   organelle_chs: List[int]= [NUC_CH,GOLGI_CH, PEROX_CH], 
                    ) -> int :
    """  
    TODO: add loggin instead of printing
        append tiffcomments with provenance
    """

    # If the paths are strings then they are converted to paths
    if isinstance(raw_path, str): raw_path = Path(raw_path)
    if isinstance(int_path, str): int_path = Path(int_path)
    if isinstance(out_path, str): out_path = Path(out_path)
    
    # The file list is obtained
    img_file_list = list_image_files(raw_path,".czi")
    
    # If the out folder that was stated does not exist, it creates it for you
    if not Path.exists(out_path):
        Path.mkdir(out_path)
        print(f"making {out_path}")
    
    # For each file in img_file_list the tiff files are located and the original czi image (with the key "test_files") is used to define the
    ## image data and meta dictionary
    ## The cytoplasm mask and cellmask are retrieved
    for img_f in img_file_list:
        filez = find_segmentation_tiff_files(img_f, organelle_names, int_path)
        img_data,meta_dict = read_czi_image(filez["raw"])

        # load organelles and masks
        cyto_mask = read_tiff_image(filez["cyto"])
        cellmask_obj = read_tiff_image(filez["cell"])



        # create intensities from raw as list
        intensities = [img_data[ch] for ch in organelle_chs]

        # load organelles as list
        organelles = [read_tiff_image(filez[org]) for org in organelle_names]
        
        #get mask (cyto_mask)
        nuclei_obj = organelles[ organelle_names.index("nuc") ]

        n_files = make_organelle_stat_tables(organelle_names, 
                                      organelles,
                                      intensities, 
                                      nuclei_obj,
                                      cellmask_obj,
                                      cyto_mask, 
                                      out_path, 
                                      img_f,
                                      n_rad_bins=5,
                                      n_zernike=9)

    return n_files

In [42]:

# all the imaging data goes here.
data_root_path = Path("C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis")
# linearly unmixed ".czi" files are here
raw_data_path = data_root_path / "test_files"
# save output ".tiff" files here
int_data_path = data_root_path / "20230606_out"
# save stats here
out_data_path = data_root_path / "20230606_out"

n_files = dump_all_stats_tables(out_data_path, 
                     out_data_path, 
                     raw_data_path, 
                     organelle_names=organelle_names, 
                     organelle_chs=organelle_channels)

n_files

  d = to_dict(os.fspath(xml), parser=parser, validate=validate)
  magnitude = np.sqrt(vr * vr + vi * vi) / pixels.sum()


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\redre\\Documents\\CohenLab\\MSI-3D-analysis\\20230606_test_files_practice_analysis\\20230606_out\\24hrs-Ctrl +oleicAcid 50uM_2_Unmixing-lyso-stats.csv'

# summary statistics

We now need to merge our files


-----------------
##  SUMMARY STATS  
> WARNING: (🚨🚨🚨🚨 WIP)
### normalizations.

- overlaps, normalized by CYTOPLASM, A, and B
- per cell averages, medians, std, and totals

These is all pandas munging and very straightforward tabular manipulation.


In [21]:

### Now we are going to combine those tables

data_root_path = Path(os.path.expanduser("~")) / "Projects/Imaging/data"

# linearly unmixed ".czi" files are here
int_data_path = data_root_path / "out"


In [22]:
# for a list of "prefixes"  collect stats + cross stats masked by cytosol (including nuclei masked by cellmask)

def _summarize_organelle_stats(int_path: Union[Path,str], 
                              organelle_names: List[str]= ["nuclei","golgi","peroxi"]):
    """  
    """
    # write out files... 

    if isinstance(int_path, str): int_path = Path(int_path)

    if not Path.exists(out_path):
        Path.mkdir(out_path)
        print(f"making {out_path}")

    all_stats_df = pd.DataFrame()
    all_cross_stats_df = pd.DataFrame()
    all_proj_stats_df = pd.DataFrame()
    
    for target in organelle_names:
        stat_file_list = sorted( int_path.glob(f"*{target}-stats.csv") )

        stats_df = pd.DataFrame()
        cross_stats_df = pd.DataFrame()
        proj_stats_df = pd.DataFrame()

        for stats_f in stat_file_list:
            stem = stats_f.stem.split("-")[0] + "-" + stats_f.stem.split("-")[1]
            # stats load the csv
            stats = load_stats_csv(out_path,stem, target)
            # projection stats
            proj = load_proj_stats_csv(out_path,stem, target)
            # cross stats
            cross = load_cross_stats_csv(out_path,stem, target)

            stats_df = pd.concat([stats_df,stats],axis=0, join='outer')
            proj_stats_df = pd.concat([proj_stats_df,proj],axis=0, join='outer')
            cross_stats_df = pd.concat([cross_stats_df,cross],axis=0, join='outer')
        

        ## maybe merge into all the possible files?
        # summary_df = pd.DataFrame(index=[f.stem.split("-")[0] for f in stat_file_list])
        # cross_stats_df = pd.DataFrame(index=[f.stem.split("-")[0] for f in stat_file_list])
        # proj_stats_df = pd.DataFrame(index=[f.stem.split("-")[0] for f in stat_file_list])

        summary_df = create_stats_summary(stats_df)
        summary_df.insert(loc=1,column="organelle",value=target)
        cross_summary_df = summarize_cross_stats(cross_stats_df)
        ## cross_summary_df = pivot_cross_stats(cross_stats_df)  #makes a wide version... but has a bug
        cross_summary_df.insert(loc=1,column="organelle",value=target)

        all_stats_df = pd.concat([all_stats_df,summary_df],axis=0)
        all_proj_stats_df = pd.concat([all_proj_stats_df,proj_stats_df],axis=0)
        all_cross_stats_df = pd.concat([all_cross_stats_df,cross_summary_df],axis=0)
    

    return all_stats_df, all_proj_stats_df, all_cross_stats_df
        



In [23]:
def load_stats_csv(in_path: Path, img_id: str, target_org: str) -> pd.DataFrame:
    """ helper to load the basic stats csv: `img_id`-`target_organelle` -stats.csv
    returns pandas DataFrame """
    csv_path = in_path / f"{img_id}-{target_org}-stats.csv"
    stats = pd.read_csv(csv_path, index_col=0,dtype={"ID":str,"organelle":str})
    # need to convert columns *_labels
    list_cols = [col for col in stats.columns if col.endswith('_labels')]
    stats = fix_int_list_cols(stats,list_cols)
    return stats
        

def load_proj_stats_csv(in_path: Path, img_id: str, target_org: str) -> pd.DataFrame:
    """ helper to load  the projection stats csv: `img_id`-`target_organelle` -proj-stats.csv
    returns pandas DataFrame """
    # obj_cols =  ['ID', 'organelle','radial_n_bins','n_z']  # leave alone
    # str_cols = [ 'radial_bins']
    int_cols = ['radial_cm_vox_cnt', 'radial_org_vox_cnt', 'radial_org_intensity', 'radial_n_pix','zernike_n', 'zernike_m', 'z','z_cm_vox_cnt','z_org_vox_cnt', 'z_org_intensity', 'z_nuc_vox_cnt']
    float_cols = ['radial_cm_cv', 'radial_org_cv', 'radial_img_cv','zernike_cm_mag', 'zernike_cm_phs','zernike_obj_mag', 'zernike_obj_phs', 'zernike_nuc_mag','zernike_nuc_phs', 'zernike_img_mag']

    csv_path = in_path / f"{img_id}-{target_org}-proj-stats.csv"
    proj = pd.read_csv(csv_path, index_col=0)
    proj['radial_bins'] = proj['radial_bins'].values.squeeze().tolist()
    # proj = fix_str_list_cols(proj, str_cols)
    proj = fix_int_list_cols(proj, int_cols)
    proj = fix_float_list_cols(proj, float_cols)
    return proj
        

def load_cross_stats_csv(in_path: Path, img_id: str, target_org: str) -> pd.DataFrame:
    """ helper to load  the cross- stats csv: `img_id`-`target_organelle` -cross-stats.csv
    returns pandas DataFrame """
    csv_path = in_path / f"{img_id}-{target_org}-cross-stats.csv"
    cross = pd.read_csv(csv_path, index_col=0)
    return cross

In [105]:
int_path, organelle_names

(WindowsPath('C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis/20230606_out'),
 ['nuc', 'lyso', 'mito', 'golgi', 'perox', 'ER', 'LD'])

In [25]:

from infer_subc.utils.stats_helpers import summarize_organelle_stats, dump_organelle_summary_tables

# all the imaging data goes here.
data_root_path = Path("C:/Users/redre/Documents/CohenLab/MSI-3D-analysis/20230606_test_files_practice_analysis")
# linearly unmixed ".czi" files are here
raw_data_path = data_root_path / "test_files"
# save output ".tiff" files here could be different than out
int_data_path = data_root_path / "20230606_out"
# save stats here
out_data_path = data_root_path / "20230606_out"

#Creates a summary of the data and combines/concatenates them into the corresponding pandas tables

all_stats_df, all_proj_stats_df, all_cross_stats_df = _summarize_organelle_stats( int_path, organelle_names )


In [26]:
all_cross_stats_df

Unnamed: 0,volume_sum,organelle,volume_mean,volume_median,volume_min,volume_max,volume_std,volume_count,equivalent_diameter_sum,equivalent_diameter_mean,...,shell_surface_area_sum,shell_surface_area_mean,shell_surface_area_median,shell_surface_area_min,shell_surface_area_max,shell_surface_area_std,shell_surface_area_count,shell_label__lst,shell_label_a_lst,shell_label_b_lst
0,11229,lyso,98.500000,13.5,1,4825,465.957464,114,411.400217,3.608774,...,22621.086073,186.951125,50.824539,6.928203,4795.984863,495.056522,121,"[2_1, 5_1, 7_1, 9_1, 10_1, 13_1, 4_1, 13_1, 14...","[2.0, 5.0, 7.0, 9.0, 10.0, 13.0, 4.0, 13.0, 14...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,1102,lyso,78.714286,23.5,1,355,102.839254,14,59.139430,4.224245,...,1665.824014,118.987430,60.180458,6.928203,370.871277,124.164551,14,"[22_42, 6_46, 27_75, 32_101, 22_68, 25_18, 42_...","[22.0, 6.0, 27.0, 32.0, 22.0, 25.0, 42.0, 45.0...","[42.0, 46.0, 75.0, 101.0, 68.0, 18.0, 84.0, 3...."
2,1800,lyso,112.500000,48.0,1,423,119.046210,16,83.015993,5.188500,...,3674.489944,229.655621,158.008888,6.928203,548.857422,173.461840,16,"[11_3, 14_5, 22_2, 4_2, 6_3, 22_2, 19_2, 6_2, ...","[11.0, 14.0, 22.0, 4.0, 6.0, 22.0, 19.0, 6.0, ...","[3.0, 5.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, ..."
3,3082,lyso,44.028571,14.0,1,441,80.125259,70,240.222937,3.431756,...,8436.417954,118.822788,56.947926,6.928203,1067.907471,180.562192,71,"[10_4, 4_23, 15_34, 17_25, 18_4, 6_26, 6_4, 16...","[10.0, 4.0, 15.0, 17.0, 18.0, 6.0, 6.0, 16.0, ...","[4.0, 23.0, 34.0, 25.0, 4.0, 26.0, 4.0, 18.0, ..."
4,142,lyso,10.923077,4.0,1,51,14.268039,13,30.887463,2.375959,...,464.462855,35.727912,21.513260,6.928203,92.804329,28.647155,13,"[6_54, 29_55, 38_61, 29_99, 21_67, 6_102, 21_1...","[6.0, 29.0, 38.0, 29.0, 21.0, 6.0, 21.0, 54.0,...","[54.0, 55.0, 61.0, 99.0, 67.0, 102.0, 107.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1794,LD,33.222222,9.5,1,447,84.118664,54,163.387120,3.025687,...,4558.727007,84.420871,40.826969,6.928203,898.259216,155.745454,54,"[1_1, 13_1, 17_1, 18_1, 1_1, 26_1, 28_1, 29_1,...","[1.0, 13.0, 17.0, 18.0, 1.0, 26.0, 28.0, 29.0,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
57,111,LD,12.333333,7.0,3,31,10.839742,9,23.995991,2.666221,...,431.541039,47.949004,32.362869,18.241911,101.189835,32.274704,9,"[14_1, 14_1, 87_1, 94_1, 100_19, 101_19, 88_1,...","[14.0, 14.0, 87.0, 94.0, 100.0, 101.0, 88.0, 9...","[1.0, 1.0, 1.0, 1.0, 19.0, 19.0, 1.0, 20.0, 1.0]"
58,3,LD,3.000000,3.0,3,3,,1,1.789400,1.789400,...,17.049160,17.049160,17.049160,17.049160,17.049160,,1,[120_161],[120.0],[161.0]
59,536,LD,15.764706,8.5,1,173,30.348540,34,88.391752,2.599757,...,1821.901451,53.585337,38.806271,6.928203,391.502563,69.441030,34,"[29_29, 31_15, 33_5, 33_5, 10_37, 42_29, 33_5,...","[29.0, 31.0, 33.0, 33.0, 10.0, 42.0, 33.0, 48....","[29.0, 15.0, 5.0, 5.0, 37.0, 29.0, 5.0, 9.0, 2..."


In [27]:
def _dump_organelle_summary_tables(
                    int_path: Union[Path,str], 
                    out_path: Union[Path, str], 
                    organelle_names: List[str]= ["nuclei","golgi","peroxi"] ) -> int:
    """
    get summary and all cross stats between organelles `a` and `b`
    calls `get_summary_stats_3D`
    """

    if not Path.exists(out_path):
        Path.mkdir(out_path)
        print(f"making {out_path}")


    all_stats_df, all_proj_stats_df, all_cross_stats_df = _summarize_organelle_stats( int_path, organelle_names )

    csv_path = out_path / f"summary-stats.csv"
    all_stats_df.to_csv(csv_path)

    csv_path = out_path / f"summary-proj-stats.csv"
    all_proj_stats_df.to_csv(csv_path)

    csv_path = out_path / f"summary-cross-stats.csv"
    all_cross_stats_df.to_csv(csv_path)

    return 1



In [28]:
n_files = _dump_organelle_summary_tables(out_data_path, 
                     out_data_path, 
                     organelle_names)

n_files

1

Make some wrappers to deal with reading our summary stats into pandas properly.


In [29]:


def load_summary_stats_csv(in_path: Path) -> pd.DataFrame:
    """ helper to load the summary stats csv: summary-stats.csv
    returns pandas DataFrame """
    csv_path = in_path / f"summary-stats.csv"
    summary_df = pd.read_csv(csv_path, index_col=0)
    # need to convert columns *_labels
    list_cols = [col for col in summary_df.columns if "labels" in col] #if col.contains("label")
    summary_df = fix_int_list_cols(summary_df,list_cols)
    return summary_df


def load_summary_proj_stats_csv(in_path: Path) -> pd.DataFrame:
    """ helper to load summary projection stats csv: summary-proj-stats.csv
    returns pandas DataFrame """
    obj_cols =  ['ID', 'organelle','mask','radial_n_bins','n_z']  # leave alone
    str_cols = [ 'radial_bins']
    int_cols = ['radial_cm_vox_cnt', 'radial_org_vox_cnt', 'radial_org_intensity', 'radial_n_pix','zernike_n', 'zernike_m', 'z','z_cm_vox_cnt','z_org_vox_cnt', 'z_org_intensity', 'z_nuc_vox_cnt']
    float_cols = ['radial_cm_cv', 'radial_org_cv', 'radial_img_cv','zernike_cm_mag', 'zernike_cm_phs','zernike_obj_mag', 'zernike_obj_phs', 'zernike_nuc_mag','zernike_nuc_phs', 'zernike_img_mag']

    csv_path = in_path / f"summary-proj-stats.csv"
    proj = pd.read_csv(csv_path, index_col=0)
    proj = fix_str_list_cols(proj, str_cols)
    proj = fix_int_list_cols(proj, int_cols)
    proj = fix_float_list_cols(proj, float_cols)
    return proj
        

def load_summary_cross_stats_csv(in_path: Path) -> pd.DataFrame:
    """ helper to load summary cross- stats csv: summary-cross-stats.csv
    returns pandas DataFrame """

    csv_path = in_path / f"summary-cross-stats.csv"
    summary_df = pd.read_csv(csv_path, index_col=0)

    list_cols = [col for col in summary_df.columns if "label" in col] #if col.contains("label")
    str_list_cols = [col for col in list_cols if "__" in col]
    int_list_cols = [col for col in list_cols if "__" not in col]

    summary_df = fix_str_list_cols(summary_df,str_list_cols)
    summary_df = fix_int_list_cols(summary_df,int_list_cols)

    return summary_df
    


In [1]:

#summary_shell.head()
test = load_summary_stats_csv(out_data_path)
test_proj = load_summary_proj_stats_csv(out_data_path)
test_cross = load_summary_cross_stats_csv(out_data_path)

NameError: name 'load_summary_stats_csv' is not defined

AttributeError: 'DataFrame' object has no attribute 'max_columns'