In [1]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from src.functions import get_matrices_from_dfs, normalize_proportion_ratios
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf

In [2]:
# Root directory with one sub-directory per sample (input) and root directory for the results (output).
source_path = '/Volumes/alexandr/smenon/2022-07-13_Glioblastoma/processed_files'
target_path = '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma'
# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store); keep only directories and sort them so that slices of
# `samples` select the same samples on every run.
samples = sorted(
    entry for entry in os.listdir(source_path)
    if os.path.isdir(os.path.join(source_path, entry))
)

In [3]:
# This is how Martijn performed the calculations.
from src.functions import CELL_PRE


def cell_normalization_Rappez_adata(sampling_prop_matrix, sampling_spec_matrix, adata, raw_adata, sampling_prop_threshold = 0.3, sampling_spec_threshold = 0):
    """Aggregate pixel-level ion intensities into per-cell intensities.

    Every pixel's intensities are divided by the pixel's total overlap (the
    column sums of ``sampling_prop_matrix``). The per-cell value is then the
    ``sampling_spec_matrix``-weighted sum of these normalized pixels divided by
    the cell's total weight.

    Parameters
    ----------
    sampling_prop_matrix : pandas.DataFrame
        Overlap matrix. Its index provides the cell ids and its column sums are
        used as per-pixel total overlap.
        NOTE(review): presumably cells x pixels with column labels matching
        ``adata.obs_names`` -- confirm against ``get_matrices_from_dfs``.
    sampling_spec_matrix : pandas.DataFrame
        Weights used to combine pixels into cells; entries that are not greater
        than ``sampling_spec_threshold`` are zeroed.
        NOTE(review): assumed to share index/columns with
        ``sampling_prop_matrix`` -- confirm.
    adata : anndata.AnnData
        Pixel-level intensities (``adata.to_df()``); ``adata.var`` is reused for
        the result.
    raw_adata : anndata.AnnData
        Reference cell-level object; the result is restricted to and ordered by
        ``raw_adata.obs_names`` and receives a copy of ``raw_adata.obs``.
    sampling_prop_threshold : float
        Pixels whose total overlap is not greater than this value are discarded.
    sampling_spec_threshold : float
        Pixel/cell weights not greater than this value are discarded.

    Returns
    -------
    anndata.AnnData
        Per-cell normalized intensities. Cells whose filtered weights sum to
        zero end up with NaN intensities (0 * inf).
    """
    # Total overlap of each pixel with all cells; pixels at or below the
    # threshold are zeroed and then marked as missing.
    total_pixel_overlap = sampling_prop_matrix.sum(axis = 0)
    pixel_sampling_prop_keep = total_pixel_overlap > sampling_prop_threshold
    sampling_prop_matrix_filtered = total_pixel_overlap * pixel_sampling_prop_keep
    # np.nan (rather than pd.NA) keeps the Series in float dtype, so the
    # arithmetic below never falls back to object dtype.
    sum_prop_matrix = sampling_prop_matrix_filtered.replace(to_replace=0, value=np.nan)

    # Drop pixel/cell pairs with a contribution at or below the threshold.
    pixel_sampling_spec_keep = sampling_spec_matrix > sampling_spec_threshold
    sampling_spec_matrix_filtered = sampling_spec_matrix * pixel_sampling_spec_keep

    # Normalize every pixel by its total overlap; discarded pixels (NaN divisor)
    # contribute zero intensity.
    norm_spots = adata.to_df().multiply(1/sum_prop_matrix, axis=0).fillna(0)

    # Weighted sum of normalized pixels per cell, divided by the cell's total weight.
    cor_df = sampling_spec_matrix_filtered.fillna(0).dot(norm_spots)
    cell_intensities = cor_df.multiply(1/sampling_spec_matrix_filtered.sum(axis=1), axis=0)

    # Container for the results, indexed by cell id without the CELL_PRE prefix.
    norm_ion_intensities = ad.AnnData(obs=pd.DataFrame({'cell_id': sampling_prop_matrix.index}), var=adata.var)
    norm_ion_intensities.X = cell_intensities
    norm_ion_intensities.obs.index = norm_ion_intensities.obs.cell_id.map(lambda x: x.replace(CELL_PRE, ""))

    # Subsetting returns a view; materialize it before assigning `.obs` so that
    # anndata does not have to implicitly convert the view (which emits a warning).
    norm_ion_intensities = norm_ion_intensities[raw_adata.obs_names].copy()
    norm_ion_intensities.obs = raw_adata.obs.copy()

    return norm_ion_intensities

In [11]:
from src.functions import PIXEL_PRE, correct_intensities_quantile_regression_parallel, get_molecule_normalization_factors
import statistics as st


def correct_sample(sample, n_jobs=10):
    """Correct the pixel intensities of one sample and write cell-level results.

    Reads the region tables and the pixel- and cell-level AnnData files from
    ``<source_path>/<sample>/analysis``, corrects the pixel intensities with
    ``correct_intensities_quantile_regression_parallel``, aggregates them to
    cells with ``cell_normalization_Rappez_adata`` and writes both the corrected
    and the original cell-level AnnData to ``<target_path>/<sample>``.

    Parameters
    ----------
    sample : str
        Name of the sample directory inside ``source_path``.
    n_jobs : int
        Forwarded as ``n_jobs`` to
        ``correct_intensities_quantile_regression_parallel``.

    Returns
    -------
    bool
        Always ``True`` once both output files have been written.
    """
    sample_path = os.path.join(source_path, sample, "analysis")
    sample_target_path = os.path.join(target_path, sample)

    # exist_ok avoids the check-then-create race when several workers run at once.
    os.makedirs(sample_target_path, exist_ok=True)

    # Paths relative to the sample's "analysis" directory.
    files = {
        'config': '../config.json',
        'sm_matrix': 'ablation_mark_analysis/spatiomolecular_adata.h5ad',
        'overlap_regions': 'overlap_analysis1/overlap.regions.csv',
        'mark_regions': 'overlap_analysis1/ablation_mark.regions.csv',
        'cell_regions': 'overlap_analysis1/cell.regions.csv',
        'cell_sm_matrix': 'single_cell_analysis/spatiomolecular_adata.h5ad',
    }

    project_files = {k: os.path.join(sample_path, v) for k, v in files.items()}

    cell_regions = pd.read_csv(project_files['cell_regions'])
    mark_regions = pd.read_csv(project_files['mark_regions'])
    overlap_regions = pd.read_csv(project_files['overlap_regions'])

    overlap_matrix, sampling_spec_matrix = get_matrices_from_dfs(mark_area = mark_regions, cell_area = cell_regions, marks_cell_overlap = overlap_regions)

    # Pixel-level data; observation names get the PIXEL_PRE prefix prepended.
    sm_matrix = sc.read(project_files['sm_matrix'])
    sm_matrix.obs_names = PIXEL_PRE + sm_matrix.obs_names
    cell_sm_matrix = sc.read(project_files['cell_sm_matrix'])

    total_pixel_overlap, full_pixel_intensities_median = get_molecule_normalization_factors(sm_matrix.to_df(), overlap_matrix, method= st.median)

    # All ions of the sample are passed as reference ions.
    corrected_intensities = correct_intensities_quantile_regression_parallel(sm_matrix.to_df(), total_pixel_overlap, full_pixel_intensities_median, reference_ions=sm_matrix.var_names, n_jobs=n_jobs)

    # Keep the uncorrected object untouched; store corrected values in a copy.
    corr_sm_matrix = sm_matrix.copy()
    corr_sm_matrix.X = corrected_intensities

    corr_cell_sm_matrix = cell_normalization_Rappez_adata(sampling_prop_matrix=overlap_matrix, sampling_spec_matrix=sampling_spec_matrix, adata=corr_sm_matrix, raw_adata=cell_sm_matrix)

    corr_cell_sm_matrix.write(os.path.join(sample_target_path, 'cells_spatiomolecular_adata_corrected.h5ad'))
    cell_sm_matrix.write(os.path.join(sample_target_path, 'cells_spatiomolecular_adata.h5ad'))

    return True


# Dispatch correct_sample for the first 11 entries of `samples` with n_jobs=10.
selected_samples = samples[0:11]
correction_jobs = (delayed(correct_sample)(name) for name in tqdm(selected_samples))
Parallel(n_jobs=10)(correction_jobs)








100%|██████████| 11/11 [00:00<00:00, 35.58it/s]
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
100%|██████████| 2100/2100 [00:34<00:00, 61.13it/s]
100%|██████████| 2100/2100 [00:34<00:00, 60.76it/s]
 97%|█████████▋| 2040/2100 [00:32<00:00, 69.18it/s]

insufficient metabolites: 
['C20H34O5+H', 'C10H20NO8P+H', 'C10H20NO8P+Na', 'C10H20NO8P+K', 'C20H34O5+K', 'C21H38O3+K', 'C21H38O3+Na', 'C20H34O5+Na', 'C21H38O3+H', 'C21H39O6P+H', 'C21H39O6P+K', 'C21H40O3+Na', 'C21H39O6P+Na', 'C21H40O3+H', 'C21H40O3+K', 'C21H41O6P+K', 'C21H41O6P+H', 'C21H44NO7P+Na', 'C21H41O7P+H', 'C21H41O7P+K', 'C21H44NO7P+H', 'C22H44NO6P+H', 'C21H41O7P+Na', 'C22H44NO6P+K', 'C21H41O6P+Na', 'C21H44NO7P+K', 'C22H44NO6P+Na', 'C22H46NO7P+H', 'C23H36O3+H', 'C22H46NO7P+K', 'C22H46NO7P+Na', 'C23H36O3+K', 'C23H36O3+Na', 'C23H39O7P+K', 'C23H46NO6P+Na', 'C23H46NO6P+K', 'C23H39O7P+Na', 'C23H39O7P+H', 'C23H47O11P+H', 'C23H46NO6P+H', 'C23H47O11P+Na', 'C23H47O11P+K', 'C23H48NO7P+K', 'C23H48NO7P+H', 'C23H48NO7P+Na', 'C24H43O8P+H', 'C24H43O8P+K', 'C24H46NO8P+K', 'C24H43O8P+Na', 'C24H46NO8P+Na', 'C24H46NO8P+H', 'C24H48NO7P+H', 'C24H48NO7P+Na', 'C24H50NO6P+H', 'C24H48NO7P+K', 'C24H50NO6P+Na', 'C24H48NO6P+K', 'C25H44NO7P+H', 'C25H44NO7P+K', 'C24H50NO6P+K', 'C25H44NO7P+Na', 'C25H45O8P+H', 

100%|██████████| 2100/2100 [00:33<00:00, 62.81it/s]
 73%|███████▎  | 1530/2100 [00:23<00:07, 71.88it/s]

insufficient metabolites: 
['C10H20NO8P+K', 'C21H38O3+K', 'C10H20NO8P+H', 'C21H38O3+H', 'C10H20NO8P+Na', 'C20H34O5+H', 'C20H34O5+K', 'C20H34O5+Na', 'C21H38O3+Na', 'C21H39O6P+H', 'C21H39O6P+K', 'C21H39O6P+Na', 'C21H40O3+K', 'C21H40O3+H', 'C21H41O6P+H', 'C21H40O3+Na', 'C21H41O6P+Na', 'C21H41O6P+K', 'C21H41O7P+H', 'C21H41O7P+K', 'C21H44NO7P+H', 'C21H44NO7P+K', 'C21H44NO7P+Na', 'C21H41O7P+Na', 'C22H44NO6P+H', 'C22H44NO6P+Na', 'C22H46NO7P+Na', 'C23H36O3+H', 'C23H36O3+K', 'C22H46NO7P+K', 'C23H39O7P+H', 'C23H36O3+Na', 'C22H46NO7P+H', 'C23H39O7P+K', 'C23H46NO6P+H', 'C23H39O7P+Na', 'C22H44NO6P+K', 'C23H46NO6P+Na', 'C23H47O11P+H', 'C23H47O11P+K', 'C23H46NO6P+K', 'C24H43O8P+Na', 'C23H48NO7P+H', 'C24H46NO8P+H', 'C24H43O8P+K', 'C24H46NO8P+K', 'C24H43O8P+H', 'C23H48NO7P+K', 'C24H46NO8P+Na', 'C23H47O11P+Na', 'C23H48NO7P+Na', 'C24H50NO6P+H', 'C24H48NO7P+K', 'C24H48NO6P+K', 'C24H48NO7P+Na', 'C24H48NO7P+H', 'C24H50NO6P+Na', 'C24H50NO6P+K', 'C25H44NO7P+K', 'C25H44NO7P+H', 'C25H45O8P+K', 'C25H45O8P+H', 'C

 76%|███████▌  | 1600/2100 [00:25<00:07, 65.05it/s]

insufficient metabolites: 
['C20H34O5+Na', 'C10H20NO8P+Na', 'C20H34O5+H', 'C21H38O3+K', 'C21H38O3+H', 'C20H34O5+K', 'C10H20NO8P+H', 'C21H39O6P+H', 'C10H20NO8P+K', 'C21H38O3+Na', 'C21H39O6P+Na', 'C21H40O3+Na', 'C21H40O3+K', 'C21H41O6P+H', 'C21H40O3+H', 'C21H41O7P+H', 'C21H39O6P+K', 'C21H41O6P+Na', 'C21H41O7P+Na', 'C21H41O6P+K', 'C21H44NO7P+H', 'C21H41O7P+K', 'C21H44NO7P+Na', 'C21H44NO7P+K', 'C22H44NO6P+H', 'C22H44NO6P+Na', 'C22H46NO7P+H', 'C22H44NO6P+K', 'C23H36O3+H', 'C23H39O7P+K', 'C22H46NO7P+K', 'C22H46NO7P+Na', 'C23H36O3+K', 'C23H36O3+Na', 'C23H39O7P+H', 'C23H39O7P+Na', 'C23H46NO6P+H', 'C23H46NO6P+K', 'C23H47O11P+H', 'C23H47O11P+Na', 'C23H47O11P+K', 'C24H43O8P+H', 'C23H48NO7P+H', 'C23H46NO6P+Na', 'C23H48NO7P+Na', 'C24H43O8P+K', 'C23H48NO7P+K', 'C24H46NO8P+H', 'C24H48NO6P+K', 'C24H46NO8P+K', 'C24H46NO8P+Na', 'C24H48NO7P+K', 'C24H48NO7P+H', 'C24H48NO7P+Na', 'C24H50NO6P+H', 'C25H44NO7P+H', 'C24H50NO6P+Na', 'C25H44NO7P+Na', 'C25H44NO7P+K', 'C24H50NO6P+K', 'C25H45O8P+H', 'C25H45O8P+K', '

100%|██████████| 2100/2100 [00:34<00:00, 61.13it/s]
 92%|█████████▏| 1930/2100 [00:29<00:02, 71.61it/s]

insufficient metabolites: 
['C20H34O5+K', 'C21H39O6P+H', 'C21H38O3+K', 'C10H20NO8P+K', 'C21H38O3+H', 'C20H34O5+H', 'C21H38O3+Na', 'C10H20NO8P+H', 'C10H20NO8P+Na', 'C20H34O5+Na', 'C21H39O6P+Na', 'C21H39O6P+K', 'C21H40O3+Na', 'C21H41O6P+K', 'C21H41O7P+H', 'C21H40O3+K', 'C21H41O6P+H', 'C21H40O3+H', 'C21H41O6P+Na', 'C21H41O7P+K', 'C21H41O7P+Na', 'C21H44NO7P+H', 'C22H44NO6P+H', 'C21H44NO7P+Na', 'C22H44NO6P+K', 'C21H44NO7P+K', 'C22H46NO7P+K', 'C23H36O3+K', 'C22H46NO7P+Na', 'C22H44NO6P+Na', 'C22H46NO7P+H', 'C23H36O3+Na', 'C23H36O3+H', 'C23H39O7P+H', 'C23H39O7P+K', 'C23H39O7P+Na', 'C23H46NO6P+H', 'C23H46NO6P+Na', 'C23H46NO6P+K', 'C23H48NO7P+K', 'C23H48NO7P+H', 'C23H47O11P+H', 'C23H48NO7P+Na', 'C23H47O11P+K', 'C23H47O11P+Na', 'C24H43O8P+K', 'C24H46NO8P+H', 'C24H43O8P+H', 'C24H48NO6P+K', 'C24H46NO8P+Na', 'C24H46NO8P+K', 'C24H43O8P+Na', 'C24H48NO6P+Na', 'C24H48NO7P+H', 'C24H48NO7P+K', 'C24H50NO6P+K', 'C25H44NO7P+K', 'C24H48NO7P+Na', 'C25H44NO7P+Na', 'C24H50NO6P+H', 'C24H50NO6P+Na', 'C25H44NO7P+H'

100%|██████████| 2100/2100 [00:31<00:00, 67.16it/s]
 99%|█████████▉| 2080/2100 [00:31<00:00, 81.76it/s]

insufficient metabolites: 
['C10H20NO8P+Na', 'C20H34O5+K', 'C20H34O5+Na', 'C10H20NO8P+H', 'C10H20NO8P+K', 'C20H34O5+H', 'C21H38O3+H', 'C21H38O3+K', 'C21H39O6P+H', 'C21H38O3+Na', 'C21H39O6P+K', 'C21H40O3+H', 'C21H39O6P+Na', 'C21H40O3+K', 'C21H40O3+Na', 'C21H41O6P+H', 'C21H41O6P+Na', 'C21H41O7P+H', 'C21H41O6P+K', 'C21H41O7P+K', 'C21H44NO7P+H', 'C21H44NO7P+K', 'C22H44NO6P+H', 'C21H44NO7P+Na', 'C21H41O7P+Na', 'C22H44NO6P+K', 'C22H46NO7P+K', 'C22H46NO7P+Na', 'C22H46NO7P+H', 'C23H36O3+K', 'C23H36O3+H', 'C22H44NO6P+Na', 'C23H36O3+Na', 'C23H39O7P+K', 'C23H46NO6P+H', 'C23H39O7P+H', 'C23H39O7P+Na', 'C23H46NO6P+K', 'C23H47O11P+Na', 'C23H46NO6P+Na', 'C23H47O11P+K', 'C23H47O11P+H', 'C23H48NO7P+H', 'C23H48NO7P+Na', 'C23H48NO7P+K', 'C24H43O8P+K', 'C24H43O8P+H', 'C24H43O8P+Na', 'C24H46NO8P+Na', 'C24H46NO8P+K', 'C24H48NO6P+Na', 'C24H46NO8P+H', 'C24H48NO6P+K', 'C24H48NO7P+H', 'C24H48NO7P+K', 'C24H48NO7P+Na', 'C24H50NO6P+H', 'C24H50NO6P+K', 'C24H50NO6P+Na', 'C25H44NO7P+H', 'C25H44NO7P+Na', 'C25H44NO7P+K'

100%|██████████| 2100/2100 [00:31<00:00, 67.08it/s]
100%|██████████| 2100/2100 [00:32<00:00, 65.49it/s]
100%|██████████| 2100/2100 [00:31<00:00, 66.22it/s]
100%|██████████| 2100/2100 [00:31<00:00, 65.75it/s]
100%|██████████| 2100/2100 [00:31<00:00, 65.96it/s]


insufficient metabolites: 
['C10H20NO8P+K', 'C10H20NO8P+H', 'C10H20NO8P+Na', 'C20H34O5+H', 'C20H34O5+K', 'C20H34O5+Na', 'C21H38O3+Na', 'C21H38O3+H', 'C21H39O6P+K', 'C21H39O6P+Na', 'C21H38O3+K', 'C21H39O6P+H', 'C21H40O3+Na', 'C21H40O3+K', 'C21H41O6P+H', 'C21H41O7P+K', 'C21H40O3+H', 'C21H41O6P+Na', 'C21H41O7P+Na', 'C21H41O6P+K', 'C21H44NO7P+H', 'C21H41O7P+H', 'C21H44NO7P+K', 'C21H44NO7P+Na', 'C22H44NO6P+H', 'C22H44NO6P+Na', 'C22H46NO7P+K', 'C22H44NO6P+K', 'C22H46NO7P+H', 'C22H46NO7P+Na', 'C23H36O3+H', 'C23H36O3+K', 'C23H39O7P+Na', 'C23H39O7P+K', 'C23H36O3+Na', 'C23H46NO6P+H', 'C23H46NO6P+K', 'C23H46NO6P+Na', 'C23H47O11P+H', 'C23H47O11P+K', 'C23H39O7P+H', 'C23H48NO7P+K', 'C23H48NO7P+Na', 'C23H47O11P+Na', 'C24H43O8P+K', 'C24H43O8P+Na', 'C24H46NO8P+H', 'C24H46NO8P+K', 'C24H48NO6P+K', 'C24H43O8P+H', 'C23H48NO7P+H', 'C24H50NO6P+H', 'C24H48NO7P+Na', 'C24H48NO7P+H', 'C24H48NO7P+K', 'C24H46NO8P+Na', 'C24H50NO6P+K', 'C24H50NO6P+Na', 'C25H44NO7P+Na', 'C25H44NO7P+K', 'C25H45O8P+H', 'C25H46NO7P+H', 

  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")
  self._set_dim_df(value, "obs")


KeyboardInterrupt: 

In [None]:
# pd.concat([corr_sm_matrix.to_df().replace(0, np.nan)['C10H20NO8P+K'], sm_matrix.to_df().replace(0, np.nan)['C10H20NO8P+K'], total_pixel_overlap.replace(0, np.nan)], axis=1).dropna()

Unnamed: 0,C10H20NO8P+K,C10H20NO8P+K.1,total_pixel_area
pixel_80,216.316875,406.304199,0.694704
pixel_180,10.714414,103.297676,0.139438
pixel_189,38.253912,144.616760,0.349585
pixel_229,1.212734,117.070709,0.014523
pixel_230,90.328211,117.070709,1.000000
...,...,...,...
pixel_6145,23.144753,254.800949,0.122407
pixel_6162,1.284767,158.389786,0.011423
pixel_6230,172.105505,447.623291,0.504683
pixel_6231,334.563476,482.055847,0.901247
