<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [5]</a>'.</span>

# Evaluation of ion suppression correction

In this notebook, different measures are investigated to quantify the effect of correcting SpaceM ion intensity data for partial pixel-cell overlap.
Moreover, The effects of the correction on different metabolites is visualized.
 

In [1]:
import platform
import os
import multiprocessing
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import seaborn as sns
from sklearn.cluster import KMeans
import re
import outer_spacem as osm
import sys
sys.path.append('/home/mklein/spacem')
sys.path.append('/Volumes/mklein/spacem')
sys.path.append('/home/mklein/FDA_project')
from src.correction import *
from src.evaluation import intermixing, MetaboliteAnalysis

%matplotlib inline
%config InlineBackend.figure_formats = ['retina']

In [2]:
if platform.system() == "Darwin":
    target_path = '/Volumes/mklein/FDA_project/data/Lx_Glioblastoma'
    if True:
        target_path = '/Users/mariusklein/Local_Project_Files/FDA_project/data/Lx_Glioblastoma'

else:
    target_path = '/home/mklein/FDA_project/data/Lx_Glioblastoma'

condition_name = 'condition'
well_name = 'rowcol'
project = 'Lx_Glioblastoma'
analysis_path = target_path

In [3]:
# Parameters
source_path = "/home/mklein/Raw Data/2022-01-31_PancreaticCancer"
target_path = "/home/mklein/FDA_project/data/Lx_Pancreatic_Cancer"
condition_name = "condition"
well_name = "rowcol"
analysis_path = "/home/mklein/FDA_project/analysis/Lx_Pancreatic_Cancer"
notebooks = [
    "pipeline_01_correction.ipynb",
    "pipeline_02_processing.ipynb",
    "pipeline_03_evaluation.ipynb",
]
project = "Lx_Pancreatic_Cancer"


Loading the uncorrected and ISM-corrected dataset from file. Additionally, loading the metadata CSV file to filter out excluded wells.

In [4]:
adata = ad.read(os.path.join(target_path, "gen_batch_sm_matrix.h5ad"))
adata_cor = ad.read(os.path.join(target_path, "corrected_batch_sm_matrix.h5ad"))

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [5]:
metadata_path = os.path.join(target_path, 'metadata.csv')
samples = list(set(adata.obs['well']))

if os.path.exists(metadata_path):
    metadata = pd.read_csv(metadata_path)
    if well_name not in metadata.columns:
        metadata[well_name] = metadata['row'].astype(str) + metadata['col'].astype(str)
    samples = list(metadata[well_name])

def assign_conditions(adata):
    index = adata.obs.index.name
    new_obs = adata.obs.reset_index()
    
    new_obs = pd.merge(new_obs, metadata[[well_name, condition_name]], 
                       how='inner', left_on='well', right_on=well_name).set_index(index)
    
    adata = adata[new_obs.index, :].copy()
    adata.obs = new_obs
    if 'keep_conditions' in globals():
        adata = adata[adata.obs[condition_name].isin(keep_conditions), :].copy()
    return adata

adata = assign_conditions(adata)
adata_cor = assign_conditions(adata_cor)

adata.obs[condition_name].value_counts()

NameError: name 'metadata' is not defined

The loaded datasets are preprocessed in the same way:

- cells need non-zero intensities for at least 10 ions.
- ions need non-zero intensities for at least 200 cells.
- intensties are normalized to TIC and log-transformed (log(x+1))

After that, both datasets are subset to contain the same ions and cells (intersection).

In [None]:
def preprocess(adata):
    
    sc.pp.filter_cells(adata, min_genes=10)
    sc.pp.filter_genes(adata, min_cells=200)
    adata.raw = adata
    # sc.pp.scale(adata)
    sc.pp.normalize_total(adata, target_sum=None)
    # sc.pp.log1p(adata)
    
    adata.var['median_intensity'] = np.median(adata.X, axis=0)
    adata.var['mean_intensity'] = np.mean(adata.X, axis=0)
    # adata_x = adata.X.copy()
    # adata_x[adata_x == 0] = np.nan
    # adata.var['median_intensity_nonzero'] = np.nanmedian(adata_x, axis=0)
    
    
    
preprocess(adata)
preprocess(adata_cor)

print(adata.shape)
print(adata_cor.shape)


In [None]:
included_molecules = adata.var_names.intersection(adata_cor.var_names)
included_cells = adata.obs_names.intersection(adata_cor.obs_names)

def subset_molecules(adata):
    
    return adata[included_cells, included_molecules].copy()

adata = subset_molecules(adata)
adata_cor = subset_molecules(adata_cor)

print(adata.shape)
print(adata_cor.shape)

Before analysis, asserting that the two data files were deconvoluted in the same way. Specifically, the corrected dataframe cannot have non-zero values at positions where the uncorrected dataframe has zero values.

In [None]:
assert not any(pd.Series(np.array((adata.to_df() == 0) & (adata_cor.to_df()!= 0)).flatten()))

In [None]:
adata.obs['correction'] = 'uncorrected'
adata_cor.obs['correction'] = 'ISM correction'

## Effects of the correction on different molecules

The ISM correction is performed per ion on the logarithmized intensity / sampling proportion ratio. The underlying quantile regression can only be computed with a minimum number of datapoints. If an ion has less than 10 datapoints, the quantile regression is instead computed based on a reference pool of ions.
In the following, the resulting slopes by which all ions have been corrected are visualized. Ions that were corrected using the reference pool are shown separately.


In [None]:
grid = sns.FacetGrid(adata_cor.var[['mean_correction_quantreg_slope', 'corrected_only_using_pool']], col='corrected_only_using_pool', hue='corrected_only_using_pool', sharey=False)
grid.map(sns.kdeplot, 'mean_correction_quantreg_slope')
cor_pool = list(adata_cor.var[adata_cor.var['corrected_only_using_pool'] == True].index)

Based on the slopes of the correction but also the logfoldchanges between corrected and uncorrected cells, one can infer the extent of alteration of different metabolites in the correction. These measures not necessarily correlate, thus the degree of correction of ions has to be evaluated on individual datasets.

In [None]:
import src.evaluation
from importlib import reload
reload(src.evaluation)
from src.evaluation import MetaboliteAnalysis

In [None]:
ma_raw = MetaboliteAnalysis(adata=adata, adata_cor=adata_cor, condition_name = condition_name, 
                        obs_columns = ['list_TPO'],
                        var_columns = ['corrected_only_using_pool', 'mean_correction_quantreg_slope', 
                                       'n_cells','median_intensity', 'mean_intensity', 'sum_correction_using_ion_pool'],
                       use_raw = True)

In [None]:
ma_raw.pair_plot(exclude_ref_corrected = False)

In [None]:
ma_raw.volcano_plot(exclude_ref_corrected = False)

In [None]:
sc.pl.rank_genes_groups_tracksplot(ma_raw.conc_adata, groupby='correction', dendrogram=False)

In [None]:
ma_raw.quotient_plot()

In [None]:
ma_raw.top_ion_plot()

In [None]:
_ = ma_raw.save_matrix(save_to_path = analysis_path, safe_to_name = project)

The same analysis is then carried out for the TIC-corrected and log-transformed data: Here, the differences between uncorrected and ISM-corrected data are much more subtle. This corresponds better with the UMAPs further down, as they also show very little noticebly differences between uncorrected and ISM-corrected datasets.

In [None]:
ma = MetaboliteAnalysis(adata=adata, adata_cor=adata_cor, condition_name = condition_name, 
                        obs_columns = ['list_TPO'],
                        var_columns = ['corrected_only_using_pool', 'mean_correction_quantreg_slope', 
                                       'n_cells','median_intensity', 'mean_intensity', 'sum_correction_using_ion_pool'],
                       use_raw = False)

In [None]:
sc.pl.rank_genes_groups_tracksplot(ma.conc_adata, groupby='correction', dendrogram=False)

In [None]:
ma.quotient_plot()

In [None]:
ma.top_ion_plot()

## Comparison of the datasets

In the following, the uncorrected and ISM-corrected datasets are compared using methods of a typical single-cell analysis.

In [None]:
def dimred_pca(adata):
    sc.pp.pca(adata)
    sc.pl.pca_overview(adata, color=['well', condition_name], palette='cividis')

dimred_pca(adata)
dimred_pca(adata_cor)

In [None]:
def dimred_umap(adata, min_dist = 0.5):
    sc.pp.neighbors(adata, n_neighbors=50, metric='cosine')
    sc.tl.umap(adata, min_dist=min_dist, spread=1.0, random_state=1, n_components=2)
    sc.pl.umap(adata, color=['well', condition_name], palette='cividis')
    f = osm.pl.highlight_scatterplot(
        data = adata,
        obsm_key = "X_umap",
        hue = condition_name,
        col = condition_name,
        palette = "cividis",
        trim_axes=True,
        height = 5,
        scatter_kwargs = dict(s=5)
    )

    f.add_legend(markerscale=3)


dimred_umap(adata)
dimred_umap(adata_cor)


In [None]:
# from sklearn.metrics.cluster import completeness_score
# from sklearn.metrics import accuracy_score, silhouette_score
# 
# def kmeans_clust(adata):
#     n_clusters = len(adata.obs[condition_name].value_counts())
#     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(adata.X)
#     adata.obs['kmeans'] = kmeans.labels_.astype(str)
# 
#     sc.tl.leiden(adata, resolution=2)
# 
#     leiden = np.array(adata.obs['leiden'].values)
#     leiden_curated = np.copy(leiden)
#     fc = np.array(adata.obs[condition_name].values)
#     for cluster in np.unique(leiden):
#         labels, counts = np.unique(fc[leiden == cluster], return_counts=True)
#         leiden_curated[leiden == cluster] = str(labels[counts == np.max(counts)][0])
#     adata.obs['leiden_curated'] = leiden_curated
# 
#     sc.pl.umap(adata, color=['kmeans', 'leiden', 'leiden_curated', condition_name], palette='cividis')
#     # print('Leiden acccuracy score: %1.4f' % accuracy_score(y_true = adata.obs[condition_name].replace(['HeLa', 'NIH3T3'], ['0', '1']), y_pred = adata.obs['leiden']))
#     print('Curated leiden acccuracy score: %1.4f' % accuracy_score(y_true = adata.obs[condition_name], y_pred = adata.obs['leiden_curated']))
#     print('KMeans completeness score: %1.4f' % completeness_score(adata.obs[condition_name], adata.obs['kmeans']))
#     print('KMeans silhouette coefficient: %1.4f' % silhouette_score(adata.X, adata.obs['kmeans']))
# 
# kmeans_clust(adata)
# kmeans_clust(adata_cor)
# 

In [None]:
summaries = intermixing({'uncorrected': adata, 'ISM correction': adata_cor}, condition_name = condition_name, measures = ['X_pca', 'X_umap'])

In [None]:
s = intermixing(
    adata_dict = {'uncorrected': adata, 'ISM correction': adata_cor},
    condition_name = condition_name,
    sample_frac=0.1,
    measures =['X_umap', 'X_pca'],
    n_datapoints = 50,
    sample_log = True,
    neighborhood_size = None,
    normalized = False,
    show_table = [],
    n_jobs = multiprocessing.cpu_count()
)

In [None]:
from scipy.integrate import trapz, simps

def auc_intermixing(summary_dict):
    for name, data in summary_dict.items():
        print('Area under the curve for %s: %1.4f'%(name, trapz(data['mean'], data.index) / max(data.index)))


auc_intermixing(s[1])

In [None]:
from sklearn.svm import LinearSVC

def get_svm_margin(adata, size_factor = 1):
    predictors = adata.X * size_factor
    result = adata.obs[condition_name]
    clf = LinearSVC(random_state=0, dual=False)
    clf.fit(predictors, result)  
    margin_df = pd.DataFrame({'condition': clf.classes_, 'margin': 1 / np.sqrt(np.sum(clf.coef_**2, axis=1))})
    #print(margin_df)
    return margin_df

df = pd.merge(get_svm_margin(adata), get_svm_margin(adata_cor, size_factor = np.sum(adata.X) / np.sum(adata_cor.X)), on='condition', suffixes=['_uncorrected', '_ISM_corrected'])

In [None]:
sns.set(rc={"figure.figsize":(12, 5)})
sns.barplot(df.melt(id_vars='condition', var_name='correction', value_name='margin'), x='condition', y='margin', hue='correction')

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

def LDA(adata):
    predictors = adata.X
    result = adata.obs[condition_name]
    model = LinearDiscriminantAnalysis()
    model.fit(predictors, result)  

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
    scores = cross_val_score(model, predictors, result, scoring='accuracy', cv=cv, n_jobs=multiprocessing.cpu_count()-5)
    
    adata.obs['lda'] = model.predict(adata.X)
    sc.pl.umap(adata, color=[condition_name, 'lda'], palette='cividis')
    print("LDA accuracy after 10-fold cross-validation: %1.4f (±%1.4f)" % (np.mean(scores), np.std(scores)))  


LDA(adata) 
LDA(adata_cor) 