# Lx_Glioblastoma: Evaluation

In this notebook, different measures are investigated to quantify the effect of correcting SpaceM ion intensity data for partial pixel-cell overlap.
Moreover, the effects of the correction on different metabolites are visualized.
 

In [None]:
import platform
import os
import multiprocessing
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import seaborn as sns
from sklearn.cluster import KMeans
import re
import outer_spacem as osm
import sys
sys.path.append('/home/mklein/spacem')
sys.path.append('/Volumes/mklein/spacem')
sys.path.append('/home/mklein/FDA_project')
sys.path.append('/Volumes/mklein/FDA_project')
from src.correction import *
from src.evaluation import intermixing, MetaboliteAnalysis

%matplotlib inline

In [None]:
# Parameters
# Notebook-level settings; the later cells read these names as globals.
# NOTE(review): source_path and deconv_default_min_overlap are not referenced by any
# other cell visible in this notebook - presumably kept for parity with other
# notebooks of the pipeline; confirm before removing.
source_path = "/g/alexandr/smenon/2022-07-13_Glioblastoma/processed_files"
# Directory that holds the two .h5ad matrices and the optional metadata.csv.
target_path = "/home/mklein/FDA_project/data/Lx_Glioblastoma"
# Name of the condition column (read from metadata.csv and added to obs).
condition_name = "condition"
# Name of the metadata column identifying a well; it is matched against obs['well'].
well_name = "rowcol"
deconv_default_min_overlap = 0.3
# Output location and project name handed to MetaboliteAnalysis.save_matrix further down.
analysis_path = "/home/mklein/FDA_project/analysis/Lx_Glioblastoma"
project = "Lx_Glioblastoma"


Loading the uncorrected and ISM-corrected dataset from file. Additionally, loading the metadata CSV file to filter out excluded wells.

In [None]:
# Load the uncorrected and the ISM-corrected single-cell matrices.
# `read_h5ad` is called explicitly because the generic `anndata.read` alias is deprecated.
adata = ad.read_h5ad(os.path.join(target_path, "gen_batch_sm_matrix.h5ad"))
adata_cor = ad.read_h5ad(os.path.join(target_path, "corrected_batch_sm_matrix.h5ad"))

In [None]:
# Optional per-well metadata. Note that `metadata` is only defined when the CSV exists.
metadata_path = os.path.join(target_path, 'metadata.csv')
# Default sample list: every well present in the data (built from a set, so the order is arbitrary).
samples = list(set(adata.obs['well']))

if os.path.exists(metadata_path):
    metadata = pd.read_csv(metadata_path)
    # Build the well identifier from the 'row' and 'col' columns if the file does not provide it.
    if well_name not in metadata.columns:
        metadata[well_name] = metadata['row'].astype(str) + metadata['col'].astype(str)
    # With metadata available, the sample list follows the metadata table instead.
    samples = list(metadata[well_name])

def assign_conditions(adata):
    """Annotate cells with their condition and drop cells from unlisted wells.

    The per-well table in the global ``metadata`` is inner-joined onto
    ``adata.obs`` (``obs['well']`` against ``metadata[well_name]``), so cells
    whose well does not occur in the metadata are removed. If a global
    ``keep_conditions`` exists, only cells of those conditions are kept.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset whose ``obs`` contains a ``'well'`` column.

    Returns
    -------
    anndata.AnnData
        An annotated (and possibly smaller) copy; the input object is not modified.

    Raises
    ------
    NameError
        If no ``metadata`` table is defined and ``obs`` has no condition column.
    pandas.errors.MergeError
        If the metadata lists conflicting entries for the same well.
    """
    obs = adata.obs
    meta = globals().get('metadata')

    if meta is None:
        # No metadata.csv was loaded: only usable if the data is already annotated.
        if condition_name not in obs.columns:
            raise NameError(
                "name 'metadata' is not defined and adata.obs has no '%s' column"
                % condition_name
            )
        adata = adata.copy()
    else:
        # Columns left over from an earlier run of this cell would be suffixed
        # with _x/_y by the merge and hide `condition_name`; drop them first.
        stale = [col for col in {well_name, condition_name}
                 if col != 'well' and col in obs.columns]
        index_name = obs.index.name
        flat_obs = obs.drop(columns=stale).reset_index()
        # reset_index() inserts the former index as the first column; looking it
        # up by position also works when the index has no name.
        index_col = flat_obs.columns[0]

        # Exact duplicate rows are harmless; conflicting rows for one well would
        # silently duplicate cells, so the merge is validated as many-to-one.
        wells = meta[[well_name, condition_name]].drop_duplicates()
        new_obs = (
            pd.merge(flat_obs, wells, how='inner', left_on='well', right_on=well_name,
                     validate='many_to_one')
            .set_index(index_col)
            .rename_axis(index_name)
        )

        adata = adata[new_obs.index, :].copy()
        adata.obs = new_obs

    if 'keep_conditions' in globals():
        adata = adata[adata.obs[condition_name].isin(keep_conditions), :].copy()
    return adata

# Annotate both datasets with the same function so they are filtered by the same wells/conditions.
adata = assign_conditions(adata)
adata_cor = assign_conditions(adata_cor)

In [None]:
# Cell counts per well and condition, including row/column totals (margins=True).
pd.crosstab(adata.obs['well'], adata.obs[condition_name], margins=True)

In [None]:
# Ions and cells that occur in both the uncorrected and the corrected dataset.
included_molecules = adata.var_names.intersection(adata_cor.var_names)
included_cells = adata.obs_names.intersection(adata_cor.obs_names)

def subset_molecules(adata):
    """Return a copy of ``adata`` limited to the shared cells and ions.

    Reads the globals ``included_cells`` and ``included_molecules`` computed above.
    """
    shared_view = adata[included_cells, included_molecules]
    return shared_view.copy()

adata, adata_cor = subset_molecules(adata), subset_molecules(adata_cor)

print(adata.shape, adata_cor.shape, sep="\n")

First of all, the loaded datasets are filtered:

- cells need non-zero intensities for at least 10 ions.
- ions need non-zero intensities for at least 200 cells.

After that, the sets are preprocessed in different ways:

- intensities are normalized to TIC and/or log-transformed (log(x+1))

After that, both datasets are subset to contain the same ions and cells (intersection).

In [None]:
def preprocess(adata, min_ions=10, min_cells=200):
    """Filter a dataset in place and add normalized / log-transformed layers.

    All steps modify ``adata`` in place:

    1. Remove cells with fewer than ``min_ions`` ions and ions found in fewer
       than ``min_cells`` cells (scanpy ``filter_cells`` / ``filter_genes``).
    2. Snapshot the filtered data in ``adata.raw`` and ``layers['raw_counts']``.
    3. Add ``layers['norm_counts']`` (``normalize_total`` with its default target
       sum) and ``layers['1e4_norm_counts']`` (target sum 1e4), plus the
       ``log1p`` transform of all three count layers (``log_raw_counts``,
       ``log_norm_counts``, ``1e4_log_norm_counts``).
    4. Set ``adata.X`` to ``log_norm_counts`` and store the per-ion median and
       mean of ``X`` in ``adata.var``.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset to preprocess; modified in place.
    min_ions : int, optional
        Passed to ``sc.pp.filter_cells`` as ``min_genes`` (default 10).
    min_cells : int, optional
        Passed to ``sc.pp.filter_genes`` as ``min_cells`` (default 200).

    Returns
    -------
    None
    """
    sc.pp.filter_cells(adata, min_genes=min_ions)
    sc.pp.filter_genes(adata, min_cells=min_cells)
    adata.raw = adata
    adata.layers["raw_counts"] = adata.X.copy()
    # sc.pp.scale(adata)
    adata.layers["norm_counts"] = sc.pp.normalize_total(adata, layer='raw_counts', target_sum=None, inplace=False)['X']
    adata.layers["1e4_norm_counts"] = sc.pp.normalize_total(adata, layer='raw_counts', target_sum=1e4, inplace=False)['X']

    for counts_layer in ("raw_counts", "norm_counts"):
        adata.layers["log_" + counts_layer] = sc.pp.log1p(adata.layers[counts_layer], copy=True)
    adata.layers["1e4_log_norm_counts"] = sc.pp.log1p(adata.layers["1e4_norm_counts"], copy=True)
    adata.X = adata.layers["log_norm_counts"]

    # np.median / np.mean need a dense array; densify first so that a sparse X
    # does not break the per-ion statistics (no-op for dense data).
    dense_X = adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)
    adata.var['median_intensity'] = np.median(dense_X, axis=0)
    adata.var['mean_intensity'] = np.mean(dense_X, axis=0)
    # adata_x = adata.X.copy()
    # adata_x[adata_x == 0] = np.nan
    # adata.var['median_intensity_nonzero'] = np.nanmedian(adata_x, axis=0)
    
    
# Preprocess both datasets in place, then print their shapes after filtering.
preprocess(adata)
preprocess(adata_cor)

print(adata.shape)
print(adata_cor.shape)


In [None]:
def dimred_umap(adata, layer=None, min_dist=0.5, label=""):
    """Compute PCA, a neighbor graph and a UMAP for ``adata`` and plot the embedding.

    Parameters
    ----------
    adata : anndata.AnnData
        Dataset to embed. scanpy stores PCA, neighbors and UMAP results on it.
    layer : str, optional
        If given, ``adata.X`` is temporarily replaced by ``adata.layers[layer]``
        (the previous ``X`` is parked in ``layers['default_X']``) and restored
        afterwards, also when one of the steps raises.
    min_dist : float, optional
        ``min_dist`` passed to ``sc.tl.umap``.
    label : str, optional
        Title of the condition-coloured UMAP panel.

    Returns
    -------
    None
    """
    if layer is not None:
        adata.layers['default_X'] = adata.X
        adata.X = adata.layers[layer]

    try:
        sc.pp.pca(adata)
        sc.pp.neighbors(adata, n_neighbors=50, metric='cosine')
        sc.tl.umap(adata, min_dist=min_dist, spread=1.0, random_state=1, n_components=2)
        # sc.pp.neighbors(adata, random_state=12345)
        # sc.tl.umap(adata, random_state=12345)
        sc.pl.umap(adata, color=['well', condition_name], palette='cividis', title=['well', label])
        # Second figure: the same embedding with `condition_name` used for both hue and col.
        f = osm.pl.highlight_scatterplot(
            data = adata,
            obsm_key = "X_umap",
            hue = condition_name,
            col = condition_name,
            palette = "cividis",
            trim_axes=True,
            height = 2.5,
            scatter_kwargs = dict(s=5)
        )

        f.add_legend(markerscale=3)
    finally:
        # Always hand back the dataset with its original X, even after an error.
        if layer is not None:
            adata.X = adata.layers['default_X']

In [None]:
def intermixing_layer(adata, adata_cor, condition_name, measures = ('X_pca', 'X_umap'), layer=None):
    """Run the ``intermixing`` evaluation for both datasets on a chosen layer.

    Parameters
    ----------
    adata, adata_cor : anndata.AnnData
        Uncorrected and ISM-corrected dataset.
    condition_name : str
        Name of the condition column, forwarded to ``intermixing``.
    measures : sequence of str, optional
        Forwarded to ``intermixing`` as a fresh list on every call.
    layer : str, optional
        If given, ``X`` of both datasets is temporarily replaced by this layer
        (the previous ``X`` is parked in ``layers['default_X']``) and restored
        afterwards, also when ``intermixing`` raises.

    Returns
    -------
    None
        The value returned by ``intermixing`` is discarded, as before.
    """
    if layer is not None:
        # Look up both layers first so that a wrong layer name fails before any X is swapped.
        layer_X, layer_X_cor = adata.layers[layer], adata_cor.layers[layer]
        adata.layers['default_X'] = adata.X
        adata.X = layer_X
        adata_cor.layers['default_X'] = adata_cor.X
        adata_cor.X = layer_X_cor

    try:
        intermixing({'uncorrected': adata, 'ISM correction': adata_cor}, condition_name = condition_name, measures = list(measures))
    finally:
        if layer is not None:
            adata.X = adata.layers['default_X']
            adata_cor.X = adata_cor.layers['default_X']

    return
    

The different options for scaling and transforming the data are shown in the following:

1. raw values
2. log transformation
3. TIC normalization
4. TIC normalization and log transformation
5. normalization to a fixed count (10^4)
6. normalization to a fixed count (10^4) and log transformation

The normalization to a fixed count has a slightly different effect than TIC normalization. The former normalizes all counts per cell to the given target sum so that all cells from the uncorrected and the corrected set are scaled to this count. In contrast, the latter retains the differences between the datasets by normalizing to the median count across cells in a dataset. Across modes of normalization, the corrected dataset shows a better separation visually and using the intermixing metric. However, this effect is very subtle after the data was log-transformed.

In [None]:
# dimred_umap(adata, layer='raw_counts')
# dimred_umap(adata_cor, layer='raw_counts')
# intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='raw_counts')

In [None]:
# dimred_umap(adata, layer='log_raw_counts')
# dimred_umap(adata_cor, layer='log_raw_counts')
# intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='log_raw_counts')

In [None]:
# dimred_umap(adata, layer='norm_counts')
# dimred_umap(adata_cor, layer='norm_counts')
# intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='norm_counts')

In [None]:
# UMAP and intermixing on the 'log_norm_counts' layer (normalized with target_sum=None, then log1p).
dimred_umap(adata, layer='log_norm_counts', label="uncorrected")
dimred_umap(adata_cor, layer='log_norm_counts', label="ISM correction")
intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='log_norm_counts')

In [None]:
# dimred_umap(adata, layer='1e4_norm_counts')
# dimred_umap(adata_cor, layer='1e4_norm_counts')
# intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='1e4_norm_counts')

In [None]:
# UMAP and intermixing on the '1e4_log_norm_counts' layer (normalized to a target sum of 1e4, then log1p).
dimred_umap(adata, layer='1e4_log_norm_counts', label="uncorrected")
dimred_umap(adata_cor, layer='1e4_log_norm_counts', label="ISM correction")
intermixing_layer(adata, adata_cor, condition_name, measures = ['X_pca', 'X_umap'], layer='1e4_log_norm_counts')

Before analysis, asserting that the two data files were deconvoluted in the same way. Specifically, the corrected dataframe cannot have non-zero values at positions where the uncorrected dataframe has zero values.

In [None]:
# The corrected data must not contain signal where the uncorrected data is zero.
# The check runs vectorised on the boolean frame instead of iterating over every
# cell x ion entry in Python, and it reports what failed.
assert not ((adata.to_df() == 0) & (adata_cor.to_df() != 0)).to_numpy().any(), \
    "corrected dataset has non-zero values where the uncorrected dataset is zero"

In [None]:
# Tag every cell with the dataset it comes from; 'correction' is used as the grouping column in later plots.
adata.obs['correction'] = 'uncorrected'
adata_cor.obs['correction'] = 'ISM correction'

## Effects of the correction on different molecules

The ISM correction is performed per ion on the logarithmized intensity / sampling proportion ratio. The underlying quantile regression can only be computed with a minimum number of datapoints. If an ion has fewer than 10 datapoints, the quantile regression is instead computed based on a reference pool of ions.
In the following, the resulting slopes by which all ions have been corrected are visualized. Ions that were corrected using the reference pool are shown separately.

**For simplicity, not all modes of preprocessing are investigated in this step. Only the non-transformed raw counts and the TIC-normalized and log-transformed counts are included here.**


In [None]:
# Histograms of the per-ion correction slopes, one panel per value of
# 'corrected_only_using_pool' (independent y axes).
grid = sns.FacetGrid(adata_cor.var[['mean_correction_quantreg_slope', 'corrected_only_using_pool']], col='corrected_only_using_pool', hue='corrected_only_using_pool', sharey=False)
grid.map(sns.histplot, 'mean_correction_quantreg_slope', bins=30)
# Names of the ions flagged as pool-corrected.
# NOTE(review): cor_pool is not used by any other cell visible in this notebook.
cor_pool = list(adata_cor.var[adata_cor.var['corrected_only_using_pool'] == True].index)
# Cell output: number of ions per value of the flag.
adata_cor.var['corrected_only_using_pool'].value_counts()

In [None]:
def median_iqr(data):
    """Print and return the median and the interquartile range of ``data``.

    The quartiles are computed with the 'midpoint' rule: when a quartile falls
    between two data points, their average is used.

    Parameters
    ----------
    data : array-like
        Numeric values.

    Returns
    -------
    tuple
        ``(median, iqr)`` with ``iqr = Q3 - Q1``.
    """
    median = np.median(data)

    # First (Q1) and third (Q3) quartile
    try:
        # NumPy >= 1.22 calls the quantile rule `method`; `interpolation` is deprecated there.
        q1, q3 = np.percentile(data, [25, 75], method='midpoint')
    except TypeError:
        # Older NumPy versions only know the `interpolation` keyword.
        q1, q3 = np.percentile(data, [25, 75], interpolation='midpoint')

    # Interquartile range (IQR)
    iqr = q3 - q1

    print("Median: %1.3f, (%1.4f)"%(median, iqr))

    return median, iqr

# Median and IQR of the correction slopes across all ions.
median_iqr(adata_cor.var['mean_correction_quantreg_slope'])

Based on the slopes of the correction but also the logfoldchanges between corrected and uncorrected cells, one can infer the extent of alteration of different metabolites in the correction. These measures do not necessarily correlate; thus, the degree of correction of ions has to be evaluated on individual datasets.

In [None]:
import src.evaluation
from importlib import reload
reload(src.evaluation)
from src.evaluation import MetaboliteAnalysis

In [None]:
# Metabolite analysis of uncorrected vs. corrected data with use_raw = True.
# obs_columns / var_columns name the annotation columns handed to MetaboliteAnalysis;
# presumably they must already exist on the datasets - verify against src.evaluation.
ma_raw = MetaboliteAnalysis(adata=adata, adata_cor=adata_cor, condition_name = condition_name, 
                        obs_columns = ['list_TPO'],
                        var_columns = ['corrected_only_using_pool', 'mean_correction_quantreg_slope', 
                                       'n_cells','median_intensity', 'mean_intensity', 'sum_correction_using_ion_pool'],
                       use_raw = True)

In [None]:
# Pair plot of the raw-count analysis; exclude_ref_corrected = False presumably keeps
# the pool-corrected ions in the plot - TODO confirm in MetaboliteAnalysis.pair_plot.
ma_raw.pair_plot(exclude_ref_corrected = False)

In [None]:
# Pairwise relations between 'scores', correction slope and 'n_cells', coloured by slope.
# The diagonal KDEs override the hue (hue=None) and are drawn in a single grey colour.
sns.pairplot(ma_raw.impact_ions_filtered[['scores', 'mean_correction_quantreg_slope', 'n_cells']], 
                         hue='mean_correction_quantreg_slope',
                         vars=['scores', 'mean_correction_quantreg_slope', 'n_cells'],
                         height=2, diag_kind="kde", diag_kws={'hue': None, 'color':"grey"} 
                        )

In [None]:
# Volcano plot of the raw-count analysis; exclude_ref_corrected = False presumably keeps
# the pool-corrected ions in the plot - TODO confirm in MetaboliteAnalysis.volcano_plot.
ma_raw.volcano_plot(exclude_ref_corrected = False)

In [None]:
# Tracks plot grouped by the 'correction' label, without a dendrogram.
# NOTE(review): assumes MetaboliteAnalysis already stored rank_genes_groups results on conc_adata - confirm.
sc.pl.rank_genes_groups_tracksplot(ma_raw.conc_adata, groupby='correction', dendrogram=False)

In [None]:
# Number of ';'-separated entries in 'list_TPO' per cell (presumably one entry per ablation mark).
adata_cor.obs['n_pixels'] = [i.count(';')+1 for i in adata_cor.obs['list_TPO']]
# One example cell per pixel count, limited to the four smallest counts. The fixed seed
# keeps the selection, and with it the plots discussed below, reproducible across runs.
strat_cell_list = list(adata_cor.obs.groupby('n_pixels').sample(n=1, random_state=1).index)[:4]
strat_cell_list

The following plot shows the metabolic profiles of sampled cells generated from 1 and increasing numbers of ablation marks. The first row has the uncorrected raw ion intensities, the second row the corrected raw ion intensities and the third row the correction ratio/quotient between the two ( $correction\_ratio = \frac{I_{corrected}}{I_{uncorrected}}$ ). Most ions have the same correction ratio in a given cell, some have a higher ratio (smaller slope, less ISM correction) and some have a lower ratio (steeper slope, stronger ISM correction). The distribution of ion-specific correction ratios in the same cells is shown, separately for self-corrected and pool-corrected ions, in density plots underneath the metabolic profiles. Black horizontal and vertical lines show the sampling proportion of all ablation marks that were combined to the respective cell. Especially from the density plots, it becomes obvious that the majority of ions have correction ratios that colocalize with the pixels' sampling ratios. This can be explained by the fact that most ions had a correction slope of ~-1, and the pool-corrected ions all had the same slope close to -1. Thus, these ions are down-corrected by multiplying with ~1 times their sampling proportion. Since many ions seem to occur only in one of the underlying ablation marks, the distribution of correction ratios has prominent peaks and few values between them.

In [None]:
# Quotient plot (raw-count analysis) for the example cells selected above.
ma_raw.quotient_plot(show_cells=strat_cell_list)

In [None]:
# Top-ion plot of the raw-count analysis, with the method's default settings.
ma_raw.top_ion_plot()

In [None]:
# Write the raw-count analysis to `analysis_path`; the return value is discarded on purpose.
# NOTE(review): the keyword is spelled `safe_to_name` - confirm it matches the
# signature of MetaboliteAnalysis.save_matrix and is not a typo for `save_to_name`.
_ = ma_raw.save_matrix(save_to_path = analysis_path, safe_to_name = project, save_figures=6)

Analogous to the differential expression analysis in this notebook, the raw ion intensities of the uncorrected and corrected dataset can be loaded into Metenrichr, an R package by Martijn Molenaar (corresponding shiny app by Bishoy Wadie), to perform DEA and subsequent enrichment analysis for lipid ontology terms.

For that, the following settings were used:

- Polarization mode: Negative
- DEA with wilcoxon test
- Number of bootstraps: 100
- Minimum number of annotations: 3
- adjusted p-value cutoff: 0.05

As the corrected dataset is compared to the uncorrected one, a positive enrichment corresponds to metabolites that have been increased/not changed in the correction and a negative enrichment to metabolites that have been decreased by ion suppression correction.

<p align="center">
  <img 
    style="display: block; 
           margin-left: auto;
           margin-right: auto;
           width: 70%;" align="center" width="800" src="metenrichr_LION_neg_wilcox_B100_A3_q0.05.png">
</p>

The same analysis is then carried out for the TIC-corrected and log-transformed data: Here, the differences between uncorrected and ISM-corrected data are much more subtle. This corresponds better with the UMAPs further down, as they also show very few noticeable differences between uncorrected and ISM-corrected datasets.

In [None]:
# Explicitly point X of both datasets at the 'log_norm_counts' layer before the second analysis.
adata.X = adata.layers['log_norm_counts']
adata_cor.X = adata_cor.layers['log_norm_counts']

In [None]:
# Same analysis as `ma_raw`, but with use_raw = False, i.e. presumably on the current X
# (set to 'log_norm_counts' in the cell above) - verify against src.evaluation.
ma = MetaboliteAnalysis(adata=adata, adata_cor=adata_cor, condition_name = condition_name, 
                        obs_columns = ['list_TPO'],
                        var_columns = ['corrected_only_using_pool', 'mean_correction_quantreg_slope', 
                                       'n_cells','median_intensity', 'mean_intensity', 'sum_correction_using_ion_pool'],
                       use_raw = False)

In [None]:
# Tracks plot for the second analysis object, grouped by the 'correction' label, without a dendrogram.
# NOTE(review): assumes MetaboliteAnalysis already stored rank_genes_groups results on conc_adata - confirm.
sc.pl.rank_genes_groups_tracksplot(ma.conc_adata, groupby='correction', dendrogram=False)

After TIC normalization and log transformation, the distribution of correction ratios is shifted and scaled down. Thus, the ratios no longer co-localize with the pixels' sampling proportions, and the differences between datasets are smaller than for the raw data.

In [None]:
# Quotient plot of the second analysis for the same example cells as in the raw-count analysis.
ma.quotient_plot(show_cells=strat_cell_list)

In [None]:
# Top-ion plot of the second analysis, with the method's default settings.
ma.top_ion_plot()

The different degree of correction of metabolites could be reflected in the spatial distribution on the corresponding ion images. To this end, the ion images of the two metabolites with the highest (center column) and the two with the lowest (right column) correction slopes are shown in the following figure (from well H3, condition Naive_WT). The corresponding dataset is available on [metaspace](https://metaspace2020.eu/annotations?db_id=658&grp=5727e852-e1dd-11e8-9d75-5fefa7059e48&ds=2022-07-16_08h56m51s&fdr=0.5&q=C24H48NO6P%20%7C%20C37H71O8P%20%7C%20C42H82NO8P%20%7C%20C41H80NO8P&row=4).

<p align="center">
  <img 
    style="display: block; 
           margin-left: auto;
           margin-right: auto;
           width: 70%;" align="center" width="800" src="ion_images_glioblastoma.png">
</p>


## Comparison of the datasets

In the following, the uncorrected and ISM-corrected datasets are compared using methods of a typical single-cell analysis. Unless specified otherwise, the data was preprocessed using TIC normalization and log transformation.

In [None]:
def dimred_pca(adata):
    """Run a PCA on ``adata`` and show scanpy's PCA overview plots.

    The plots are coloured by 'well' and by the global ``condition_name``.
    The PCA result is stored on ``adata`` by scanpy; nothing is returned.
    """
    sc.pp.pca(adata)
    sc.pl.pca_overview(adata, color=['well', condition_name], palette='cividis')

# PCA overview for the uncorrected and the corrected dataset.
dimred_pca(adata)
dimred_pca(adata_cor)

In [None]:
# UMAP on the current X of each dataset (no layer swap, default min_dist, empty panel title).
dimred_umap(adata)
dimred_umap(adata_cor)

In [None]:
# from sklearn.metrics.cluster import completeness_score
# from sklearn.metrics import accuracy_score, silhouette_score
# 
# def kmeans_clust(adata):
#     n_clusters = len(adata.obs[condition_name].value_counts())
#     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(adata.X)
#     adata.obs['kmeans'] = kmeans.labels_.astype(str)
# 
#     sc.tl.leiden(adata, resolution=2)
# 
#     leiden = np.array(adata.obs['leiden'].values)
#     leiden_curated = np.copy(leiden)
#     fc = np.array(adata.obs[condition_name].values)
#     for cluster in np.unique(leiden):
#         labels, counts = np.unique(fc[leiden == cluster], return_counts=True)
#         leiden_curated[leiden == cluster] = str(labels[counts == np.max(counts)][0])
#     adata.obs['leiden_curated'] = leiden_curated
# 
#     sc.pl.umap(adata, color=['kmeans', 'leiden', 'leiden_curated', condition_name], palette='cividis')
#     # print('Leiden acccuracy score: %1.4f' % accuracy_score(y_true = adata.obs[condition_name].replace(['HeLa', 'NIH3T3'], ['0', '1']), y_pred = adata.obs['leiden']))
#     print('Curated leiden acccuracy score: %1.4f' % accuracy_score(y_true = adata.obs[condition_name], y_pred = adata.obs['leiden_curated']))
#     print('KMeans completeness score: %1.4f' % completeness_score(adata.obs[condition_name], adata.obs['kmeans']))
#     print('KMeans silhouette coefficient: %1.4f' % silhouette_score(adata.X, adata.obs['kmeans']))
# 
# kmeans_clust(adata)
# kmeans_clust(adata_cor)
# 

In [None]:
# Intermixing evaluation of both datasets on the PCA and UMAP embeddings, with default settings.
summaries = intermixing({'uncorrected': adata, 'ISM correction': adata_cor}, condition_name = condition_name, measures = ['X_pca', 'X_umap'])

In [None]:
# Second intermixing run with explicit settings, parallelised over all available CPU cores.
# The result is kept in `s`; its second element (s[1]) is passed to auc_intermixing below.
# NOTE(review): the meaning of sample_frac / n_datapoints / sample_log / neighborhood_size /
# normalized / show_table is defined in src.evaluation.intermixing - not visible here.
s = intermixing(
    adata_dict = {'uncorrected': adata, 'ISM correction': adata_cor},
    condition_name = condition_name,
    sample_frac=0.1,
    measures =['X_umap', 'X_pca'],
    n_datapoints = 50,
    sample_log = True,
    neighborhood_size = None,
    normalized = False,
    show_table = [],
    n_jobs = multiprocessing.cpu_count()
)

In [None]:
try:
    # SciPy >= 1.6 names; the old `trapz` / `simps` were removed in SciPy 1.14.
    from scipy.integrate import trapezoid, simpson
except ImportError:
    # Older SciPy versions only provide the legacy names.
    from scipy.integrate import trapz as trapezoid, simps as simpson

# Keep the names the original import provided, in case other cells rely on them.
trapz, simps = trapezoid, simpson

def auc_intermixing(summary_dict):
    """Print the normalized area under the 'mean' curve of every summary.

    For each entry, the column ``'mean'`` is integrated over the summary's
    index with the trapezoidal rule and divided by the largest index value.

    Parameters
    ----------
    summary_dict : dict
        Maps a dataset name to a table with a ``'mean'`` column and a numeric index.

    Returns
    -------
    None
        The results are only printed.
    """
    for name, data in summary_dict.items():
        area = trapezoid(np.asarray(data['mean']), x=np.asarray(data.index))
        print('Area under the curve for %s: %1.4f'%(name, area / max(data.index)))


# s[1] is the second element of the intermixing result computed above; it is used here as the summary dict.
auc_intermixing(s[1])

In [None]:
from sklearn.svm import LinearSVC

def analyse_svm_margin(adata, adata_cor, condition_name, layer=None):
    """Compare linear-SVM margins of the uncorrected and corrected data in a bar plot.

    A ``LinearSVC`` is fitted per dataset to predict ``obs[condition_name]``
    from ``X``. The margin of each class is ``1 / ||w||`` of its weight vector.
    The corrected data is rescaled by ``sum(X_uncorrected) / sum(X_corrected)``
    before fitting so that both datasets have the same total intensity.

    Parameters
    ----------
    adata, adata_cor : anndata.AnnData
        Uncorrected and ISM-corrected dataset.
    condition_name : str
        ``obs`` column with the class labels.
    layer : str, optional
        If given, ``X`` of both datasets is temporarily replaced by this layer
        (the previous ``X`` is parked in ``layers['default_X']``) and restored
        afterwards, also when fitting or plotting raises.

    Returns
    -------
    matplotlib.axes.Axes
        The axes of the bar plot.
    """
    print(layer)
    if layer is not None:
        # Look up both layers first so that a wrong layer name fails before any X is swapped.
        layer_X, layer_X_cor = adata.layers[layer], adata_cor.layers[layer]
        adata.layers['default_X'] = adata.X
        adata.X = layer_X
        adata_cor.layers['default_X'] = adata_cor.X
        adata_cor.X = layer_X_cor

    def get_svm_margin(adata, condition_name, size_factor = 1):
        """Fit a LinearSVC on ``adata.X * size_factor`` and return one margin per class."""
        predictors = adata.X * size_factor
        result = adata.obs[condition_name]
        clf = LinearSVC(random_state=0, dual=False)
        clf.fit(predictors, result)
        margins = 1 / np.sqrt(np.sum(clf.coef_**2, axis=1))
        if margins.shape[0] == 1:
            # With two classes sklearn fits a single hyperplane (coef_ has one row),
            # so both classes share that margin.
            margins = np.repeat(margins, len(clf.classes_))
        margin_df = pd.DataFrame({'condition': clf.classes_, 'margin': margins})

        #print(margin_df)
        return margin_df

    try:
        size_factor = np.sum(adata.X) / np.sum(adata_cor.X)

        df = pd.merge(get_svm_margin(adata, condition_name),
                      get_svm_margin(adata_cor, condition_name, size_factor = size_factor),
                      on='condition', suffixes=['_uncorrected', '_ISM_correction'])
        sns.set(rc={"figure.figsize":(5, 5)})
        sns.set_style("white")
        # `data=` is passed by keyword so the call does not depend on seaborn's positional order.
        ax = sns.barplot(data=df.melt(id_vars='condition', var_name='correction', value_name='margin'),
                         x='condition', y='margin', hue='correction')
        #ax.set_title('Comparison of SVM margins')
        ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    finally:
        # Always hand back both datasets with their original X.
        if layer is not None:
            adata.X = adata.layers['default_X']
            adata_cor.X = adata_cor.layers['default_X']

    return ax
   

In [None]:
# SVM margins on the '1e4_log_norm_counts' layer (target sum 1e4, then log1p).
analyse_svm_margin(adata, adata_cor, condition_name, layer='1e4_log_norm_counts')

In [None]:
# SVM margins on the 'log_norm_counts' layer (normalized with target_sum=None, then log1p).
analyse_svm_margin(adata, adata_cor, condition_name, layer='log_norm_counts')