In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import scanpy as sc

import glob
import os
import scrna
import h5py

from micron2 import load_as_anndata
from micron2 import cluster_leiden_cu

import tqdm.auto as tqdm

from matplotlib import pyplot as plt
from matplotlib import rcParams
import time

from micron2.data import staining_border_nonzero

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns

In [None]:
# Region name fragments to exclude, listed separately for each of the three TMAs.
toss_1 = [f'TMA1_reg{region}_v5' for region in (1, 3, 11, 21, 22, 23, 24, 27, 35)]
toss_2 = [f'TMA2_reg{region}_v5' for region in (2, 3, 14, 15, 16, 20, 21, 22, 23, 24, 26, 34)]
toss_3 = [f'TMA3_reg{region}_v5' for region in (1, 2, 7, 8, 14, 15, 20, 21, 23, 25, 31, 32, 33)]
toss_patterns = [*toss_1, *toss_2, *toss_3]


def maybe_toss(x):
    """Return True if any entry of ``toss_patterns`` occurs as a substring of ``x``.

    The trailing ``_v5`` in every pattern keeps e.g. ``reg1`` from matching
    ``reg11``.
    """
    return any(pattern in x for pattern in toss_patterns)

In [None]:
# Collect the preprocessed Bladder *_v5.hdf5 files with glob instead of taking
# column 9 of `ls -l` output, which breaks on paths containing spaces and
# depends on the exact `ls` column layout. Sorted so the order is stable.
datasets = sorted(glob.glob('/storage/codex/preprocessed_data/*Bladder*/*_v5.hdf5'))
# Drop every path containing 'reg0', then every path matching a toss pattern.
datasets = [d for d in datasets if 'reg0' not in d]
datasets = [d for d in datasets if not maybe_toss(d)]
print(len(datasets))
datasets

In [None]:
# Channels for which staining_border_nonzero() is evaluated per cell; each one
# becomes a '<channel>_ringpct' column in the sample's obs table.
ring_channels = ['CD45', 'CD20', 'CD3e', 'CD45RO', 'CD45RA', 'CD8', 'CD4', 'CDH12', 
                 'KRT13', 'KRT17', 'PanCytoK', 'ERBB2']
ring_columns = [f'{ch}_ringpct' for ch in ring_channels]
adatas = []
sample_ids = []
for path in datasets:
    try:
        ad = load_as_anndata(path, 
                             recover_tile_nuclei=False, 
                             as_sparse = False,
                             features_dtype = None
                            )
    except Exception as err:
        # Best-effort: skip files that fail to load, but say why. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print('failed to load', path, repr(err))
        continue
    
    # Apply staining border function
    with h5py.File(ad.uns['source_data'], 'r') as h5f:
        ncells = ad.shape[0]
        h5_ncells = h5f['cells/DAPI'].shape[0]
        # Both counts are printed so a mismatch can be spotted: the loop below
        # indexes the HDF5 arrays by AnnData row position.
        # NOTE(review): presumably these always agree -- confirm.
        print(ncells, h5_ncells)

        # Resolve the dataset handles once rather than on every cell/channel.
        mask_ds = h5f['meta/nuclear_masks']
        channel_ds = [h5f[f'cells/{ch}'] for ch in ring_channels]

        ring_rows = []
        tstart = time.time()
        for i in tqdm.trange(ncells):
            m = mask_ds[i:i+1,:,:]
            ring_rows.append([staining_border_nonzero(ds[i:i+1,:,:], m)
                              for ds in channel_ds])
        tend = time.time()
        print(f'elapsed time: {tend-tstart:3.4f}s')

    # Build the frame in one shot (row i <-> ad.obs_names[i]) instead of
    # assigning row by row through label-based .loc. The reshape keeps the
    # (ncells, n_channels) layout even when ncells == 0.
    ring_positive_pct = pd.DataFrame(
        np.asarray(ring_rows, dtype=np.float32).reshape(ncells, len(ring_channels)),
        index=ad.obs_names,
        columns=ring_columns,
    )
    ad.obs = pd.concat([ad.obs, ring_positive_pct], axis=1)

    adatas.append(ad.copy()) 
    # Sample id is the file name without directory or extension.
    s = os.path.splitext(os.path.basename(path))[0]
    sample_ids.append(s)

# Merge all loaded samples; the originating sample is recorded in obs['sample_id'].
adata = adatas[0].concatenate(adatas[1:], batch_key='sample_id', batch_categories=sample_ids, 
                              index_unique = '-')
print(sample_ids)
adata.raw = adata


In [None]:
# Number of entries collected in sample_ids.
len(sample_ids)

In [None]:
# Drop repeated column labels from each sample's obs table (the first
# occurrence of a label is kept), then rebuild the merged AnnData.
for ad in adatas:
    print(ad.shape, ad.obs.shape)
    df = ad.obs
    is_first_occurrence = ~df.columns.duplicated()
    ad.obs = df.loc[:, is_first_occurrence]

adata = adatas[0].concatenate(
    adatas[1:],
    batch_key='sample_id',
    batch_categories=sample_ids,
    index_unique='-',
)
print(sample_ids)
# Keep a reference to the merged object as its own .raw.
adata.raw = adata

In [None]:
# Per-cell label: the 3rd and 4th '_'-separated fields of the sample id,
# joined by a newline.
sample_id_printing = ['\n'.join(sid.split('_')[2:4]) for sid in adata.obs.sample_id]
adata.obs['sample_id_printing'] = sample_id_printing
# Read the channel list from the last collected sample explicitly instead of
# relying on the loop variable `ad` leaking out of an earlier cell.
adata.uns['channels'] = adatas[-1].uns['channels']

In [None]:
# Display the merged object held in `adata`.
adata

In [None]:
# List the contents of /storage/codex/datasets_v1/.
!ls -lha /storage/codex/datasets_v1/

In [None]:
# Write `adata` to the .h5ad path below.
adata.write("/storage/codex/datasets_v1/bladder_merged.h5ad")

In [None]:
# List the contents of /storage/codex/datasets_v1/ again.
!ls -lha /storage/codex/datasets_v1/