# High-throughput ion suppression correction

SpaceM datasets are usually stored as annotated data matrices, one file per well. With this notebook, these individual files are corrected for ion suppression at the pixel level and then deconvoluted to the cell level. All resulting files are saved separately by well to the target_path, and the impact of the correction is briefly shown for visual inspection.

In [1]:
import os
import platform
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import statistics as st
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf
import seaborn as sns
import re
from importlib import reload
import sys
sys.path.append('/home/mklein/spacem')
sys.path.append('/home/mklein/FDA_project')

from src.correction import (add_normalization_factors, 
                            correct_quantile_inplace,
                            deconvolution_spacem,
                            get_overlap_data,
                            add_overlap_matrix_spacem
                           )
from src import const 
from SpaceM.lib.modules import (
    overlap_analysis,
    single_cell_analysis_normalization
)

The original data lies on the group's shared data storage. Corrected files will be saved in a separate location, preserving the well-specific folder structure.

In [2]:
# Select the storage roots by operating system: on macOS ("Darwin") the
# shares are addressed under /Volumes, on any other platform under /g and /home.
on_macos = platform.system() == "Darwin"
source_path = (
    '/Volumes/alexandr/smenon/2022-07-13_Glioblastoma/processed_files'
    if on_macos
    else '/g/alexandr/smenon/2022-07-13_Glioblastoma/processed_files'
)
target_path = (
    '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma'
    if on_macos
    else '/home/mklein/FDA_project/data/Lx_Glioblastoma'
)

In [3]:
# Parameters
# Run-level settings. The two paths assigned here replace the values chosen
# by the platform check in the preceding cell.
condition_name = "condition"
well_name = "rowcol"
source_path = "/g/alexandr/smenon/2022-07-13_Glioblastoma/processed_files"
target_path = "/home/mklein/FDA_project/data/Lx_Glioblastoma"


In [4]:
# Collect every folder below source_path that directly contains an 'analysis'
# sub-directory; each entry is stored relative to source_path.
samples = []
for dirpath, dirnames, _ in os.walk(source_path):
    if 'analysis' in dirnames:
        # os.path.relpath strips the prefix literally. The previous re.sub
        # treated source_path as a regular expression, so characters such as
        # '.' or '+' in the path would have been interpreted as regex syntax,
        # and repeated occurrences inside dirpath would have been removed too.
        relative = os.path.relpath(dirpath, source_path)
        # keep the former result ('' instead of '.') when source_path itself matches
        samples.append('' if relative == '.' else relative)
samples

['B1',
 'B2',
 'B3',
 'B4',
 'C1',
 'C2',
 'C3',
 'C4',
 'D1',
 'D2',
 'D3',
 'D4',
 'E1',
 'E2',
 'E3',
 'E4',
 'F1',
 'F2',
 'F3',
 'F4',
 'G1',
 'G2',
 'G3',
 'G4',
 'H2',
 'H3',
 'H4',
 'I1',
 'I2',
 'I3',
 'I4',
 'J1',
 'J2',
 'J3',
 'J4']

In [5]:
# SpaceM output files used for each well. The paths are joined onto the
# well's 'analysis' folder when a sample is processed.
files = dict(
    config='../config.json',
    sm_matrix='ablation_mark_analysis/spatiomolecular_adata.h5ad',
    overlap_regions='overlap_analysis2/overlap.regions.csv',
    mark_regions='overlap_analysis2/ablation_mark.regions.csv',
    cell_regions='overlap_analysis2/cell.regions.csv',
    cell_sm_matrix='single_cell_analysis/spatiomolecular_adata.h5ad',
)

In [6]:
def correct_sample_spacem(sample, n_jobs=6):
    """Correct one well for ion suppression and write all results to disk.

    Reads the region tables and the two AnnData files of ``sample`` from
    ``<source_path>/<sample>/analysis``, runs the quantile-regression
    correction on the pixel-level matrix, deconvolutes both the corrected and
    the uncorrected pixel-level matrix to cell level and writes five ``.h5ad``
    files to ``<target_path>/<sample>``, where any '/' in the sample name is
    replaced by '_'.

    Relies on the module-level names ``source_path``, ``target_path`` and
    ``files``.

    Args:
        sample: Well folder, relative to ``source_path``.
        n_jobs: Number of workers passed on to ``correct_quantile_inplace``.

    Returns:
        Tuple ``(sample, cell_sm_matrix, corr_cell_sm_matrix, sm_matrix,
        corr_sm_matrix)``; ``sample`` is the name with '/' replaced by '_'.
    """
    sample_path = os.path.join(source_path, sample, "analysis")
    # nested well folders are flattened into a single output folder name
    sample = sample.replace('/', '_')

    # exist_ok avoids the check-then-create race when wells run in parallel
    out_dir = os.path.join(target_path, sample)
    os.makedirs(out_dir, exist_ok=True)

    # get appropriate file paths for the processed well
    project_files = {k: os.path.join(sample_path, v) for k, v in files.items()}

    # load required files
    cell_regions = pd.read_csv(project_files['cell_regions'])
    mark_regions = pd.read_csv(project_files['mark_regions'])
    overlap_regions = pd.read_csv(project_files['overlap_regions'])

    sm_matrix = sc.read(project_files['sm_matrix'])
    cell_sm_matrix = sc.read(project_files['cell_sm_matrix'])

    add_overlap_matrix_spacem(sm_matrix, cell_regions, mark_regions, overlap_regions)
    add_normalization_factors(adata=sm_matrix, method=st.median)

    # perform the actual quantile regression
    # NOTE(review): the helper is named *_inplace but its return value is used
    # as the corrected matrix; whether sm_matrix itself is also modified is not
    # visible here. If it is, the "uncorrected" outputs below are affected - confirm.
    corr_sm_matrix = correct_quantile_inplace(
        adata=sm_matrix,
        reference_ions=sm_matrix.var_names,
        correct_intersect=True,
        n_jobs=n_jobs,
    )

    # perform pixel-cell-deconvolution, once on corrected and once on uncorrected pixels
    overlap_data = get_overlap_data(cell_regions, mark_regions, overlap_regions)
    corr_cell_sm_matrix = deconvolution_spacem(
        adata=corr_sm_matrix,
        overlap_data=overlap_data,
        raw_adata=cell_sm_matrix,
    )
    gen_cell_sm_matrix = deconvolution_spacem(
        adata=sm_matrix,
        overlap_data=overlap_data,
        raw_adata=cell_sm_matrix,
    )

    # write the generated files to the dedicated project location.
    outputs = (
        ('am_spatiomolecular_adata_corrected.h5ad', corr_sm_matrix),
        ('am_spatiomolecular_adata.h5ad', sm_matrix),
        ('cells_spatiomolecular_adata_corrected.h5ad', corr_cell_sm_matrix),
        ('cells_spatiomolecular_adata_spacem.h5ad', cell_sm_matrix),
        ('cells_spatiomolecular_adata.h5ad', gen_cell_sm_matrix),
    )
    for file_name, matrix in outputs:
        matrix.write(os.path.join(out_dir, file_name))

    return (sample, cell_sm_matrix, corr_cell_sm_matrix, sm_matrix, corr_sm_matrix)

This is the actual correction pipeline.

In [7]:
# Process all wells with 5 parallel workers. Every list entry is the tuple
# returned by correct_sample_spacem for one well.
process_well = delayed(correct_sample_spacem)
adata_list = Parallel(n_jobs=5)(process_well(sample) for sample in tqdm(samples))
# Serial single-well alternative:
# adata_list = [correct_sample_spacem(sample) for sample in tqdm(['B1'])]

  0%|                                                                                                                                     | 0/35 [00:00<?, ?it/s]

 29%|███████████████████████████████████▍                                                                                        | 10/35 [00:54<02:15,  5.41s/it]

 43%|█████████████████████████████████████████████████████▏                                                                      | 15/35 [01:46<02:29,  7.48s/it]













 57%|██████████████████████████████████████████████████████████████████████▊                                                     | 20/35 [02:36<02:07,  8.49s/it]















 71%|████████████████████████████████████████████████████████████████████████████████████████▌                                   | 25/35 [03:26<01:29,  8.97s/it]













 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 30/35 [04:20<00:48,  9.60s/it]













100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [05:23<00:00, 10.56s/it]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [05:23<00:00,  9.25s/it]




















In [8]:
# reshape data for immediate analysis
# Each entry of adata_list is (well, cell matrix, corrected cell matrix,
# pixel matrix, corrected pixel matrix); build one dict per matrix type,
# keyed by well.
adata_dict = {well_id: cells for well_id, cells, _, _, _ in adata_list}
adata_cor_dict = {well_id: cells_cor for well_id, _, cells_cor, _, _ in adata_list}
am_adata_dict = {well_id: marks for well_id, _, _, marks, _ in adata_list}
am_adata_cor_dict = {well_id: marks_cor for well_id, _, _, _, marks_cor in adata_list}

# Stack the wells of every matrix type; the originating well ends up in
# obs['well'] and is appended to the observation names with '_'.
concat_options = dict(label='well', index_unique="_", merge="first")
am_adata = ad.concat(am_adata_dict, **concat_options)
am_adata_cor = ad.concat(am_adata_cor_dict, **concat_options)
adata = ad.concat(adata_dict, **concat_options)
adata_cor = ad.concat(adata_cor_dict, **concat_options)

Every analysed pixel is characterized by its total overlap with cellular regions. The raw data shows no clear association between this overlap and acquired ion intensities. However, after the ion suppression correction, pixels with smaller overlap clearly have lower corresponding intensities (only shown for one metabolite). 

In [9]:
import warnings
# Ignore every warning category from here on (process-wide filter).
# NOTE(review): presumably meant to keep the plot output clean, but this also
# hides deprecation warnings from pandas/seaborn - consider narrowing it.
warnings.filterwarnings('ignore')

def plot_all_wells(adata, ion='C24H48NO6P+H', col='well', x='total_pixel_overlap'):
    """Plot ``ion`` against ``x`` per ``col`` category and print a median regression.

    Draws one regression panel (``sns.regplot``) per category of ``col`` and
    fits a single quantile regression (q=0.5) of ``ion`` on ``x`` over all
    retained rows, printing the fitted parameters.

    Args:
        adata: Object passed to ``sc.get.obs_df`` to fetch ``col``, ``ion``
            and ``x``.
        ion: Key used as response variable; only rows where it is > 0 are kept.
        col: Categorical column that defines the facets.
        x: Key used as predictor.

    Returns:
        None. The figure is drawn and the parameters are printed as side effects.
    """
    plot_df = sc.get.obs_df(adata, keys=[col, ion, x])
    # Keep only positive values. The explicit copy makes the column assignment
    # below act on an independent frame rather than on a slice of the original.
    plot_df = plot_df[plot_df[ion] > 0].copy()
    # Drop categories that lost all their rows so that no empty facet is drawn.
    # Reassignment replaces remove_unused_categories(inplace=True), whose
    # `inplace` argument was removed in pandas 2.0.
    plot_df[col] = plot_df[col].cat.remove_unused_categories()

    graph = sns.FacetGrid(plot_df, col=col, col_wrap=7)
    graph.map(sns.regplot, x, ion).add_legend()

    # Q("...") quotes the response name so ion formulas containing '+' are
    # read as a column name rather than as formula operators.
    model = smf.quantreg(f'Q("{ion}") ~ {x}', plot_df)
    qrmodel = model.fit(q=0.5)
    print(qrmodel.params)

In [10]:
# Copy the overlap column from the corrected pixel data onto the raw pixel
# data (aligned on the observation index), then plot the raw intensities.
overlap_key = 'total_pixel_overlap'
am_adata.obs[overlap_key] = am_adata_cor.obs[overlap_key]
plot_all_wells(adata=am_adata)

Intercept              639.530579
total_pixel_overlap    285.509090
dtype: float64


In [11]:
# Same plot and median regression for the corrected pixel-level data.
plot_all_wells(adata=am_adata_cor)

Intercept               -7.169836
total_pixel_overlap    723.747873
dtype: float64


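The plots above are based on the total_pixel_overlap measure, as computed within the correction procedure. The AnnData objects provided by SpaceM already contain a measure called am_sampling_ratio, which should have the same meaning as the computed total_pixel_overlap. However, the two measures do not correlate at all. Consequently, the corrected ion intensity data does not show a dependency on am_sampling_ratio (see below). Note: the regression outputs recorded below do not support this statement for the current run. Regressing am_sampling_ratio on total_pixel_overlap yields a slope of 1 and an intercept of approximately 0, and the fit of the corrected intensities against am_sampling_ratio reproduces the coefficients obtained with total_pixel_overlap, so this paragraph should be re-checked against the data.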

In [12]:
# Use am_sampling_ratio as the response: plots it against total_pixel_overlap
# (rows with am_sampling_ratio <= 0 are dropped inside plot_all_wells).
plot_all_wells(adata=am_adata, ion='am_sampling_ratio')

Intercept              7.799576e-16
total_pixel_overlap    1.000000e+00
dtype: float64


In [13]:
# Corrected intensities of the default ion, with am_sampling_ratio as predictor.
plot_all_wells(adata=am_adata_cor, x='am_sampling_ratio')

Intercept             -7.169836
am_sampling_ratio    723.747873
dtype: float64
