In [9]:
import numpy as np
import pandas as pd
from aicsimageprocessing import read_ome_zarr
from aicsimageio import transforms, AICSImage
import matplotlib.pyplot as plt
from serotiny.transforms.dataframe.transforms import split_dataframe
from pathlib import Path

In [2]:
# Set seed for dataloader
np.random.seed(42)

In [3]:
df = pd.read_parquet("s3://allencell-hipsc-cytodata/hackathon_manifest_17oct2022.parquet")
print(f'Number of cells: {len(df)}')
print(f'Number of columns: {len(df.columns)}')

Number of cells: 214037
Number of columns: 77


In [4]:
df_sub_fit = df[(df['fits_x']==True) &
               (df['fits_y']==True) &
               (df['fits_z']==True)
              ]
print(f'Number of cells: {len(df_sub_fit)}')

Number of cells: 184328


In [5]:
df_sub_fit_center = df_sub_fit[df_sub_fit['edge_flag'] == 0]
print(f'Number of cells: {len(df_sub_fit_center)}')

Number of cells: 180472


In [6]:
dataset = df_sub_fit_center.drop(df_sub_fit_center[df_sub_fit_center['cell_stage']=='M6M7_single'].index)
print(f'Number of cells: {len(dataset)}')

Number of cells: 176936


In [7]:
dataset['Structure'].value_counts()


mitochondria          20681
nuclear pores         15521
histones              14123
nuclear envelope      10773
nucleoli (GC)         10305
ER (SERCA2)            9352
lysosomes              8736
nucleoli (DFC)         8432
desmosomes             8169
microtubules           7679
plasma membrane        6846
actin bundles          6474
centrioles             5884
gap junctions          5214
Golgi                  5207
actomyosin bundles     5162
adherens junctions     4955
ER (Sec61 beta)        4929
tight junctions        4552
matrix adhesions       2925
actin filaments        2804
nuclear speckles       2321
endosomes              2098
cohesins               2085
peroxisomes            1709
Name: Structure, dtype: int64

In [22]:
# Create dataset csvs of different structures
Path("/home/aicsuser/serotiny_data/").mkdir(parents=True, exist_ok=True)
mitochondria = dataset[dataset['Structure'] == 'mitochondria']
nucleoli = dataset[dataset['Structure'] == 'nucleoli (GC)']
microtubules = dataset[dataset['Structure'] == 'microtubules']
plasma_membrane = dataset[dataset['Structure'] == 'plasma membrane']
actin_filaments = dataset[dataset['Structure'] == 'actin filaments']

datasets = [mitochondria, nucleoli, microtubules, plasma_membrane, actin_filaments]
structures = ['mitochondria', 'nucleoli', 'microtubules', 'plasma_membrane', 'actin_filaments']
for i, data in enumerate(datasets):
    print(structures[i])
    x = split_dataframe(dataframe=data, train_frac=0.7, val_frac=0.2, return_splits=False)
    x.to_csv(f"/home/aicsuser/serotiny_data/{structures[i]}.csv")
all_combi = pd.concat([mitochondria, nucleoli, microtubules,plasma_membrane])
x = split_dataframe(dataframe=all_combi, train_frac=0.7, val_frac=0.2, return_splits=False)
x.to_csv(f"/home/aicsuser/serotiny_data/combined.csv")
print('done')

mitochondria


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[train_ix, "split"] = "train"


nucleoli


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[train_ix, "split"] = "train"


microtubules


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[train_ix, "split"] = "train"


plasma_membrane


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[train_ix, "split"] = "train"


actin_filaments


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[train_ix, "split"] = "train"


done


stratify on structure

['bf',
 'dna',
 'membrane',
 'structure',
 'dna_segmentation',
 'membrane_segmentation',
 'struct_segmentation_roof']