In [1]:
import os
import platform
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from src.functions import get_matrices_from_dfs, CELL_PRE
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf

In [None]:
# Select the input/output locations for the current machine: on macOS
# ("Darwin") both paths live under /Volumes, on any other system the
# /g and /home paths are used.
source_path, target_path = (
    (
        '/Volumes/alexandr/smenon/2022-07-13_Glioblastoma/processed_files',
        '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma',
    )
    if platform.system() == "Darwin"
    else (
        '/g/alexandr/smenon/2022-07-13_Glioblastoma/processed_files',
        '/home/mklein/FDA_project/data/Lx_Glioblastoma',
    )
)

In [3]:
# Every sub-directory of `source_path` is treated as one sample.
# - Only directories are kept, so stray files in the folder (e.g. a macOS
#   `.DS_Store` on a mounted volume) are not handed to the per-sample processing.
# - The names are sorted because `os.listdir` returns entries in arbitrary
#   order; sorting makes the processing order reproducible between runs.
samples = sorted(
    entry
    for entry in os.listdir(source_path)
    if os.path.isdir(os.path.join(source_path, entry))
)

def cell_normalization_Rappez_adata(sampling_prop_matrix, sampling_spec_matrix, adata, raw_adata, sampling_prop_threshold = 0.3, sampling_spec_threshold = 0):
    """Aggregate pixel-level ion intensities into per-cell intensities.

    Each pixel's intensities are divided by that pixel's total overlap with
    all cells; the per-cell value is then the weighted sum of these normalized
    pixel intensities (weights = sampling specificity) divided by the sum of
    the cell's weights.

    Parameters
    ----------
    sampling_prop_matrix : pandas.DataFrame
        Cell-by-pixel overlap values. Its index supplies the cell ids; its
        column sums are aligned with the observations of ``adata``.
    sampling_spec_matrix : pandas.DataFrame
        Cell-by-pixel weights used to combine pixels into cells.
    adata : anndata.AnnData
        Pixel-level intensities; ``adata.var`` is reused for the result.
    raw_adata : anndata.AnnData
        Reference object on cell level. The result is subset/reordered to
        ``raw_adata.obs_names`` and receives a copy of ``raw_adata.obs``.
    sampling_prop_threshold : float, default 0.3
        Pixels whose summed overlap is not strictly greater than this value
        contribute zero intensity.
    sampling_spec_threshold : float, default 0
        Cell/pixel weights that are not strictly greater than this value are
        set to zero.

    Returns
    -------
    anndata.AnnData
        Per-cell intensities with the observations of ``raw_adata``.
    """

    # total overlap of every pixel with all cells (computed once, used for the
    # threshold test and for the normalization below)
    pixel_total_overlap = sampling_prop_matrix.sum(axis = 0)

    # filter out pixels with little overlap with any cell (thus sum of all overlaps)
    pixel_sampling_prop_keep = pixel_total_overlap > sampling_prop_threshold
    # filter out pixels with low contributions to a cell
    pixel_sampling_spec_keep = sampling_spec_matrix > sampling_spec_threshold

    sampling_prop_matrix_filtered = pixel_total_overlap * pixel_sampling_prop_keep
    sampling_spec_matrix_filtered = sampling_spec_matrix * pixel_sampling_spec_keep

    # Mask filtered-out pixels (overlap 0) so that 1/overlap yields NaN instead
    # of a division by zero. np.nan keeps the Series in a float dtype, whereas
    # pd.NA can turn it (and everything computed from it) into object dtype.
    sum_prop_matrix = sampling_prop_matrix_filtered.replace(to_replace=0, value=np.nan)

    # create dataframe for results
    norm_ion_intensities = ad.AnnData(obs=pd.DataFrame({'cell_id': sampling_prop_matrix.index}), var=adata.var)
    # masked pixels become NaN here and are then set to zero contribution
    norm_spots = adata.to_df().multiply(1/sum_prop_matrix, axis=0).replace(np.nan, 0)

    cor_df = sampling_spec_matrix_filtered.replace(np.nan, 0).dot(norm_spots)

    # Weighted average per cell. A cell whose filtered weights sum to zero
    # produces non-finite values here (division by zero).
    # NOTE(review): pixels zeroed by the overlap filter above still contribute
    # their weight to this denominator - confirm that this is intended.
    norm_ion_intensities.X = cor_df.multiply(1/sampling_spec_matrix_filtered.sum(axis=1), axis=0)
    norm_ion_intensities.obs.index = norm_ion_intensities.obs.cell_id.map(lambda x: x.replace(CELL_PRE, ""))

    # Subsetting returns a view; take an explicit copy so that assigning `.obs`
    # does not trigger anndata's implicit view-to-actual conversion (and its
    # warning). `obs` is copied as well so the result does not share the
    # DataFrame object with `raw_adata`.
    norm_ion_intensities = norm_ion_intensities[raw_adata.obs_names].copy()
    norm_ion_intensities.obs = raw_adata.obs.copy()

    return norm_ion_intensities

In [14]:
from src.functions import PIXEL_PRE, correct_intensities_quantile_regression_parallel, get_molecule_normalization_factors
import statistics as st

# Input files of one sample, keyed by their role. The values are relative
# paths that get joined onto a sample's "analysis" directory.
files = dict(
    config='../config.json',
    sm_matrix='ablation_mark_analysis/spatiomolecular_adata.h5ad',
    overlap_regions='overlap_analysis1/overlap.regions.csv',
    mark_regions='overlap_analysis1/ablation_mark.regions.csv',
    cell_regions='overlap_analysis1/cell.regions.csv',
    cell_sm_matrix='single_cell_analysis/spatiomolecular_adata.h5ad',
)

def correct_sample(sample, n_jobs=8):
    """Run the intensity correction for one sample and write the results.

    Reads the sample's region tables and AnnData files from
    ``source_path/<sample>/analysis`` (file names taken from the module-level
    ``files`` mapping), corrects the pixel-level intensities with
    ``correct_intensities_quantile_regression_parallel``, aggregates them to
    cells with ``cell_normalization_Rappez_adata`` and writes both the
    corrected and the original cell-level AnnData to ``target_path/<sample>``.

    Parameters
    ----------
    sample : str
        Name of the sample directory below ``source_path``.
    n_jobs : int, default 8
        Number of workers passed to the quantile-regression correction. When
        this function itself runs inside a parallel pool, the total number of
        workers is the product of both settings.

    Returns
    -------
    tuple
        ``(sample, cell_sm_matrix, corr_cell_sm_matrix)``: the sample name,
        the cell-level AnnData as read from disk, and the corrected one.
    """

    sample_path = os.path.join(source_path, sample, "analysis")
    sample_target_path = os.path.join(target_path, sample)

    # exist_ok avoids the check-then-create race when several workers run
    # concurrently and makes re-runs a no-op
    os.makedirs(sample_target_path, exist_ok=True)

    project_files = {k: os.path.join(sample_path, v) for k, v in files.items()}

    cell_regions = pd.read_csv(project_files['cell_regions'])
    mark_regions = pd.read_csv(project_files['mark_regions'])
    overlap_regions = pd.read_csv(project_files['overlap_regions'])

    overlap_matrix, sampling_spec_matrix = get_matrices_from_dfs(mark_area = mark_regions, cell_area = cell_regions, marks_cell_overlap = overlap_regions)

    # use the already joined paths from `project_files` for all inputs
    sm_matrix = sc.read(project_files['sm_matrix'])
    # prefix the pixel names with PIXEL_PRE
    # (presumably to match the labels used in the overlap matrices - verify against get_matrices_from_dfs)
    sm_matrix.obs_names = PIXEL_PRE + sm_matrix.obs_names
    cell_sm_matrix = sc.read(project_files['cell_sm_matrix'])

    # `to_df()` is called separately for each helper so that neither can be
    # affected by in-place changes the other might make to its input
    total_pixel_overlap, full_pixel_intensities_median = get_molecule_normalization_factors(sm_matrix.to_df(), overlap_matrix, method= st.median)

    corrected_intensities = correct_intensities_quantile_regression_parallel(sm_matrix.to_df(), total_pixel_overlap, full_pixel_intensities_median, reference_ions=sm_matrix.var_names, n_jobs=n_jobs)

    # same pixels/ions/annotations as the input, only X is replaced
    corr_sm_matrix = sm_matrix.copy()
    corr_sm_matrix.X = corrected_intensities

    corr_cell_sm_matrix = cell_normalization_Rappez_adata(sampling_prop_matrix=overlap_matrix, sampling_spec_matrix=sampling_spec_matrix, adata=corr_sm_matrix, raw_adata=cell_sm_matrix)

    corr_cell_sm_matrix.write(os.path.join(sample_target_path, 'cells_spatiomolecular_adata_corrected.h5ad'))
    cell_sm_matrix.write(os.path.join(sample_target_path, 'cells_spatiomolecular_adata.h5ad'))

    return (sample, cell_sm_matrix, corr_cell_sm_matrix)


# Process all samples with 10 joblib workers; each result is the tuple
# returned by `correct_sample`. tqdm reports how many samples were dispatched.
adata_list = Parallel(n_jobs=10)(
    delayed(correct_sample)(sample_name) for sample_name in tqdm(samples)
)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
 57%|█████▋    | 20/35 [01:16<01:06,  4.47s/it]t/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:17, 120.81it/s]

insufficient metabolites: 2045


  2%|▏         | 52/2100 [00:00<00:23, 88.09it/s] 
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]

insufficient metabolites: 2057


  4%|▍         | 88/2100 [00:01<00:30, 66.24it/s]]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:16, 125.02it/s]

insufficient metabolites: 2053


  7%|▋         | 152/2100 [00:02<00:28, 68.25it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]

insufficient metabolites: 2050
insufficient metabolites: 2044


  9%|▉         | 184/2100 [00:02<00:35, 54.24it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:21, 98.18it/s]]

insufficient metabolites: 2068


  8%|▊         | 168/2100 [00:02<00:38, 49.69it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  8%|▊         | 160/2100 [00:02<00:36, 53.50it/s]

insufficient metabolites: 2053


 21%|██        | 440/2100 [00:07<00:35, 47.09it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 11%|█         | 232/2100 [00:04<00:34, 54.02it/s]

insufficient metabolites: 2054


  1%|▏         | 27/2100 [00:00<00:30, 67.43it/s] 
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  2%|▏         | 35/2100 [00:00<00:29, 71.11it/s]]

insufficient metabolites: 2066


 24%|██▍       | 512/2100 [00:10<00:36, 43.87it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 25%|██▍       | 520/2100 [00:09<00:34, 46.13it/s]

insufficient metabolites: 2066


100%|██████████| 35/35 [02:30<00:00,  4.30s/it]t/s]


insufficient metabolites: 2042



  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  2%|▏         | 47/2100 [00:00<00:26, 77.50it/s] 
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:18, 110.93it/s]

insufficient metabolites: 2071


  4%|▍         | 88/2100 [00:01<00:35, 56.75it/s] 
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  5%|▍         | 96/2100 [00:01<00:37, 52.92it/s]]

insufficient metabolites: 2062


  4%|▎         | 75/2100 [00:00<00:28, 71.74it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  5%|▌         | 112/2100 [00:01<00:34, 58.43it/s]

insufficient metabolites: 2078


 12%|█▏        | 248/2100 [00:04<00:34, 53.45it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:16, 123.81it/s]

insufficient metabolites: 2081


 26%|██▌       | 536/2100 [00:09<00:33, 46.03it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 15%|█▌        | 320/2100 [00:06<00:36, 48.90it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 27%|██▋       | 576/2100 [00:10<00:38, 39.72it/s]

insufficient metabolites: 2037
insufficient metabolites: 2053


  2%|▏         | 32/2100 [00:00<00:25, 81.59it/s] 
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]

insufficient metabolites: 2078


 35%|███▍      | 728/2100 [00:12<00:26, 51.49it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 34%|███▎      | 704/2100 [00:13<00:33, 41.53it/s]

insufficient metabolites: 2047


 50%|█████     | 1056/2100 [00:21<00:22, 47.23it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
 52%|█████▏    | 1088/2100 [00:20<00:18, 55.48it/s]

insufficient metabolites: 2070


100%|██████████| 2100/2100 [00:41<00:00, 50.74it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  1%|          | 16/2100 [00:00<00:15, 130.93it/s]

insufficient metabolites: 2082


  3%|▎         | 54/2100 [00:00<00:20, 100.17it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  3%|▎         | 65/2100 [00:00<00:24, 82.19it/s] 

insufficient metabolites: 2064
insufficient metabolites: 2083


 13%|█▎        | 200/1503 [00:03<00:23, 56.01it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  0%|          | 0/2100 [00:00<?, ?it/s]

insufficient metabolites: 2058
insufficient metabolites: 2042


 14%|█▎        | 288/2100 [00:03<00:25, 72.30it/s]
  self._set_dim_df(value, "obs")
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 2100/2100 [00:22<00:00, 91.32it/s] 

In [15]:
# Index the per-sample result tuples by their first element (the sample name):
# one lookup for the second element (uncorrected cell-level data) and one for
# the third element (corrected cell-level data).
adata_dict = {name: raw for name, raw, _ in adata_list}
adata_cor_dict = {name: corrected for name, _, corrected in adata_list}