# Mx_Coculture: Correction

SpaceM datasets are usually stored as annotated data matrices, separately for individual wells. With this notebook, these individual files are corrected for ion suppression on the pixel level and then deconvoluted to the cell level. All resulting files are saved separately by well to the target_path, and the impact of the correction is briefly shown for visual inspection.

In [None]:
import os
import platform
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import statistics as st
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import statsmodels.formula.api as smf
import seaborn as sns
import re
from importlib import reload
import json
import sys
sys.path.append('/home/mklein/spacem')
sys.path.append('/home/mklein/FDA_project')

from src.correction import (add_normalization_factors, 
                            correct_quantile_inplace,
                            deconvolution_spacem,
                            get_overlap_data,
                            add_overlap_matrix_spacem, 
                            get_reference_pool
                           )
from src import const 
from SpaceM.lib.modules import (
    overlap_analysis,
    single_cell_analysis_normalization
)

%matplotlib inline

The original data lies on the group's shared data storage. Corrected files will be saved in a separate location, preserving the well-specific folder structure.

In [None]:
# Default locations of the input data and the output folder. On macOS
# (platform.system() reports "Darwin") the paths below /Volumes are used,
# on every other system the /g and /home paths.
source_path, target_path = (
    (
        '/Volumes/alexandr/smenon/2022-07-13_Glioblastoma/processed_files',
        '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma',
    )
    if platform.system() == "Darwin"
    else (
        '/g/alexandr/smenon/2022-07-13_Glioblastoma/processed_files',
        '/home/mklein/FDA_project/data/Lx_Glioblastoma',
    )
)

# Minimum overlap ratio used when a well config does not specify one.
deconv_default_min_overlap = 0.0

In [None]:
# Parameters
project = "Mx_Coculture"

# Input data, output folder for corrected files, and analysis folder.
source_path = "/home/mklein/Raw Data/Coculture"
target_path = "/home/mklein/FDA_project/data/Mx_Co_Cultured"
analysis_path = "/home/mklein/FDA_project/analysis/Mx_Coculture"

# Presumably names of annotation columns used by later pipeline steps;
# they are not read in this notebook -- TODO confirm.
condition_name = "condition"
well_name = "rowcol"

# Fallback for 'ablation_marks_min_overlap_ratio' when a well config
# leaves it unset.
deconv_default_min_overlap = 0.3

# Passed as proportion_threshold to the correction and to the
# pre/post-correction comparison plots.
correction_proportion_threshold = 1e-3

notebooks = [
    "pipeline_01_correction.ipynb",
    "pipeline_02_processing.ipynb",
    "pipeline_03_evaluation.ipynb",
]

In [None]:
# Every directory below source_path that directly contains an "analysis"
# sub-folder is treated as one sample. The sample name is the directory path
# relative to source_path (an empty string for source_path itself).
# The prefix is removed with an escaped pattern anchored at the start, so
# regex metacharacters in source_path (".", "+", "(", ...) are taken
# literally and only the leading occurrence is stripped.
samples = [
    re.sub('^' + re.escape(source_path) + '/?', '', dirpath)
    for dirpath, dirnames, _ in os.walk(source_path)
    if 'analysis' in dirnames
]
samples

In [None]:
# Per-sample input files, given relative to the sample's "analysis" folder
# (they are joined onto that folder in correct_sample_spacem).
files = dict(
    config='../config.json',
    sm_matrix='ablation_mark_analysis/spatiomolecular_adata.h5ad',
    overlap_regions='overlap_analysis2/overlap.regions.csv',
    mark_regions='overlap_analysis2/ablation_mark.regions.csv',
    cell_regions='overlap_analysis2/cell.regions.csv',
    cell_sm_matrix='single_cell_analysis/spatiomolecular_adata.h5ad',
)

In [None]:
def assign_average_tpo(am_adata, overlap_data, min_overlap, method=np.mean):
    """Aggregate the ``const.TPO`` values of ablation marks per cell.

    Parameters
    ----------
    am_adata
        Object whose ``obs`` table has a ``const.TPO`` column and is indexed
        by ablation-mark id (the ids are matched as strings against ``am_id``;
        assumes the ``obs`` index holds strings -- TODO confirm).
    overlap_data
        Object exposing ``overlap_regions``, a table with the columns
        ``am_id`` and ``cell_id`` that links marks to cells.
    min_overlap
        Marks whose ``const.TPO`` value is below this are ignored.
        ``None`` is treated as 0.
    method
        Aggregation handed to ``groupby(...).agg``; defaults to ``np.mean``.

    Returns
    -------
    pandas.Series
        Aggregated ``const.TPO`` per ``cell_id`` (ids as strings).
    """
    if min_overlap is None:
        min_overlap = 0

    # Cast the ids on a copy of the two relevant columns instead of writing
    # the converted columns back into the caller's overlap table.
    mark_cell_pairs = overlap_data.overlap_regions[['am_id', 'cell_id']].astype(str)

    merged_df = pd.merge(mark_cell_pairs, am_adata.obs[const.TPO], left_on='am_id', right_index=True)
    merged_df = merged_df[merged_df[const.TPO] >= min_overlap]

    # const.TPO is used throughout (rather than a hard-coded column name) so
    # the selection always matches the column that was merged in above.
    per_cell = merged_df[['cell_id', const.TPO]].groupby('cell_id', group_keys=False).agg(method)
    return per_cell[const.TPO]

In [None]:
def _load_deconvolution_params(config_path):
    """Return the deconvolution settings for one well.

    If ``config_path`` exists, the ``single_cell_analysis`` section of that
    JSON file is returned; a missing or null
    ``ablation_marks_min_overlap_ratio`` is replaced by the notebook-level
    ``deconv_default_min_overlap``. If the file does not exist, a fixed
    fallback dictionary is returned and a notice is printed.
    """
    if not os.path.exists(config_path):
        print('No well config file found. Using default deconvolution parameters.')
        # NOTE(review): this fallback uses 0 rather than
        # deconv_default_min_overlap -- confirm that this is intended.
        return {
            'cell_normalization_method': 'weighted_by_overlap_and_sampling_area',
            'ablation_marks_min_overlap_ratio': 0,
        }

    with open(config_path, encoding='utf-8') as json_file:
        deconv_info = json.load(json_file)['single_cell_analysis']
    # .get() treats an absent key like an explicit null.
    if deconv_info.get('ablation_marks_min_overlap_ratio') is None:
        deconv_info['ablation_marks_min_overlap_ratio'] = deconv_default_min_overlap
    return deconv_info


def correct_sample_spacem(sample):
    """Run correction and pixel-to-cell deconvolution for one sample.

    Reads the files listed in ``files`` from
    ``<source_path>/<sample>/analysis``, applies the quantile-regression
    correction to the ablation-mark matrix, deconvolutes both the uncorrected
    and the corrected matrix to cell level with ``deconvolution_rappez`` and
    attaches a ``list_TPO`` column to both cell-level results.

    The output folder ``<target_path>/<sample name>`` is created, but writing
    the matrices to it is currently disabled (see the commented block before
    the return statement); the results are only returned.

    Parameters
    ----------
    sample
        Sample path relative to ``source_path``. Slashes are replaced by
        underscores to form the sample name used for the output folder and
        in the return value.

    Returns
    -------
    tuple
        ``(sample, gen_cell_sm_matrix, corr_cell_sm_matrix, sm_matrix,
        corr_sm_matrix, deconv_info, cell_sm_matrix)`` where

        * ``gen_cell_sm_matrix`` / ``corr_cell_sm_matrix`` are the cell-level
          matrices deconvoluted here from the uncorrected / corrected marks,
        * ``sm_matrix`` / ``corr_sm_matrix`` are the ablation-mark matrices
          before / after correction as passed to and returned by
          ``correct_quantile_inplace``,
        * ``deconv_info`` holds the deconvolution settings of the well,
        * ``cell_sm_matrix`` is the cell-level matrix loaded from the
          ``single_cell_analysis`` folder.
    """
    from src.correction import deconvolution_rappez, get_matrices_from_dfs, add_matrices

    sample_path = os.path.join(source_path, sample, "analysis")
    sample = sample.replace('/', '_')

    # exist_ok avoids the check-then-create race when several samples are
    # processed in parallel.
    os.makedirs(os.path.join(target_path, sample), exist_ok=True)

    # get appropriate file paths for the processed well
    project_files = {k: os.path.join(sample_path, v) for k, v in files.items()}

    deconv_info = _load_deconvolution_params(project_files['config'])

    # load required files
    cell_regions = pd.read_csv(project_files['cell_regions'])
    mark_regions = pd.read_csv(project_files['mark_regions'])
    overlap_regions = pd.read_csv(project_files['overlap_regions'])

    sm_matrix = sc.read(project_files['sm_matrix'])
    cell_sm_matrix = sc.read(project_files['cell_sm_matrix'])

    add_overlap_matrix_spacem(sm_matrix, cell_regions, mark_regions, overlap_regions)
    add_normalization_factors(adata=sm_matrix, method=st.median)

    # perform the actual quantile regression
    corr_sm_matrix = correct_quantile_inplace(
        adata=sm_matrix,
        reference_ions=get_reference_pool(sm_matrix),
        correct_intersect=True,
        proportion_threshold=correction_proportion_threshold,
        n_jobs=5,
    )

    # perform pixel-cell-deconvolution
    # (deconvolution_spacem(adata=..., overlap_data=overlap_data,
    #  raw_adata=cell_sm_matrix, deconvolution_params=deconv_info) is the
    #  alternative implementation; it is not used at the moment.)
    overlap_data = get_overlap_data(cell_regions, mark_regions, overlap_regions)
    overlap_matrix, sampling_spec_matrix = get_matrices_from_dfs(
        mark_area=mark_regions,
        cell_area=cell_regions,
        marks_cell_overlap=overlap_regions,
    )
    for mark_matrix in (sm_matrix, corr_sm_matrix):
        add_matrices(adata=mark_matrix, overlap_matrix=overlap_matrix, sampling_spec_matrix=sampling_spec_matrix)

    gen_cell_sm_matrix = deconvolution_rappez(sm_matrix, raw_adata=cell_sm_matrix)
    corr_cell_sm_matrix = deconvolution_rappez(corr_sm_matrix, raw_adata=cell_sm_matrix)

    # hand over TPOs to spatiomolecular matrix for downstream analysis
    # (both results receive the same per-cell list, so it is computed once)
    min_overlap = deconv_info['ablation_marks_min_overlap_ratio']
    tpo_lists = assign_average_tpo(sm_matrix, overlap_data, min_overlap, method=lambda x: ";".join(x.astype(str)))
    corr_cell_sm_matrix.obs['list_TPO'] = tpo_lists
    gen_cell_sm_matrix.obs['list_TPO'] = tpo_lists

    # write the generated files to the dedicated project location.
    # (currently disabled)
    # corr_sm_matrix.write(os.path.join(target_path, sample, 'am_spatiomolecular_adata_corrected.h5ad'))
    # sm_matrix.write(os.path.join(target_path, sample, 'am_spatiomolecular_adata.h5ad'))
    # corr_cell_sm_matrix.write(os.path.join(target_path, sample, 'cells_spatiomolecular_adata_corrected.h5ad'))
    # cell_sm_matrix.write(os.path.join(target_path, sample, 'cells_spatiomolecular_adata_spacem.h5ad'))
    # gen_cell_sm_matrix.write(os.path.join(target_path, sample, 'cells_spatiomolecular_adata.h5ad'))
    return (sample, gen_cell_sm_matrix, corr_cell_sm_matrix, sm_matrix, corr_sm_matrix, deconv_info, cell_sm_matrix)

This is the actual correction pipeline.

In [None]:
# Parallel variant over all discovered samples (disabled):
# adata_list = Parallel(n_jobs=7)(delayed(correct_sample_spacem)(sample) for sample in tqdm(samples))
# NOTE(review): only the hard-coded sample 'dataset' is processed here, not
# the `samples` list collected above -- confirm that this is intended.
adata_list = list(map(correct_sample_spacem, tqdm(['dataset'])))

In [None]:
# Split the 7-element result tuples of correct_sample_spacem into one
# dictionary per tuple position, keyed by the sample name (position 0).
gen_adata_dict = {name: gen_cells for name, gen_cells, *_ in adata_list}
adata_cor_dict = {name: cor_cells for name, _, cor_cells, *_ in adata_list}
am_adata_dict = {name: marks for name, _, _, marks, *_ in adata_list}
am_adata_cor_dict = {name: cor_marks for name, _, _, _, cor_marks, *_ in adata_list}
deconv_dict = {name: params for name, *_, params, _ in adata_list}
adata_dict = {name: spacem_cells for name, *_, spacem_cells in adata_list}

# Concatenate the samples; the dictionary key ends up in the 'well' column
# and is appended to the observation names with "_".
# Ablation-mark level, before and after correction:
am_adata = ad.concat(am_adata_dict, label='well', index_unique="_", merge="first")
am_adata_cor = ad.concat(am_adata_cor_dict, label='well', index_unique="_", merge='first')
# Cell level: deconvoluted here (uncorrected / corrected) and as loaded from
# the single_cell_analysis folder:
gen_adata = ad.concat(gen_adata_dict, label='well', index_unique="_", merge="first")
adata_cor = ad.concat(adata_cor_dict, label='well', index_unique="_", merge="first")
adata = ad.concat(adata_dict, label='well', index_unique="_", merge="first")

# One row of deconvolution settings per sample.
deconv_table = pd.DataFrame(deconv_dict).T
deconv_table

The ion suppression correction takes a parameter proportion_threshold to filter the pixels used to compute the quantile regression. In particular, pixels with a sampling proportion lower than the threshold are excluded from this step. In contrast, the actual correction is then performed on all pixels in the dataset. As a consequence, for the majority of the dataset, the dependence of the data on the factor "sampling_proportion" is not completely removed by the correction. Instead, depending on the difference in slope between the complete and thresholded set of pixels, the corrected set of pixels has a positive or negative dependence on the factor "sampling_proportion". The larger the proportion_threshold, the stronger this deviation can get.

In the following figure, these differences are visualized for a subset of ions (with very low/high correction slope) and wells. The top panel shows the logarithmic ion intensity/proportion ratio plotted against the log proportion ratio of the respective pixels. As this is the relationship used to compute the quantile regression, the resulting regression lines can also be shown. The black lines show the quantile regression of the total set of pixels in a well, the red lines for the thresholded set (shown as orange squares). In turn, the blue squares are disregarded in calculating the correction slope. The bottom panel shows the corresponding corrected sets of pixels with black lines again representing the quantile regression on the complete set of pixels.

In [None]:
import warnings

# Suppress all warnings from this point on.
warnings.filterwarnings(action='ignore')

In [None]:
# Copy the const.TPO column of the corrected ablation-mark table onto the
# uncorrected one (pandas aligns the values on the observation index), so
# that both objects carry this column for the comparison below.
am_adata.obs[const.TPO] = am_adata_cor.obs[const.TPO]

In [None]:
# Compare the ablation-mark data before and after correction, using the same
# proportion threshold that was passed to the correction itself.
from src.evaluation import compare_pre_post_correction
compare_pre_post_correction(am_adata, am_adata_cor, proportion_threshold=correction_proportion_threshold)

In [None]:
# Reload the project modules so that edits made on disk since the kernel
# started are picked up without a restart.
import src.const
import src.evaluation

# The constants are reloaded before the evaluation module: reload() does not
# refresh names that another module obtained via "from ... import ...", so a
# module has to be reloaded after the modules it imports from.
# (Whether src.evaluation imports from src.const is not visible here; this
# order is correct either way.)
reload(src.const)
reload(src.evaluation)
from src.evaluation import compare_pre_post_correction

# Same comparison as before, restricted to two explicitly chosen ions.
compare_pre_post_correction(am_adata, am_adata_cor, proportion_threshold=correction_proportion_threshold, ions=['C20H32O2', 'C35H69O8P'])