# Results analysis

In [None]:
import os
import json
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import outer_spacem as osm
import sys
sys.path.append('/home/mklein/FDA_project')
from src.ion_suppression import ISC
from src import const
from src import evaluation as ev


%matplotlib inline
# Default parameters

In [None]:
# Instantiate ISC (from src.ion_suppression) with the SpaceM dataset path and
# the full config; it is used below to fetch the SpaceM metadata.
# NOTE(review): `config` is not defined in the visible cells - presumably it is
# injected into the parameters cell at run time; confirm.
isc = ISC(config['runtime']['spacem_dataset_path'], config)

## Dimensionality reduction

In [None]:
# Load the uncorrected ("generated") and the corrected datasets from the output
# folder. `ad.read` is a deprecated alias of `ad.read_h5ad`, so the explicit
# reader is used here.
adata = ad.read_h5ad(os.path.join(config['runtime']['out_folder'], config['output']['file_names']['generated_adata']))
adata_cor = ad.read_h5ad(os.path.join(config['runtime']['out_folder'], config['output']['file_names']['corrected_adata']))

# Metadata table; assign_conditions() reads its sample and population columns.
metadata = isc.get_spacem_metadata()

def assign_conditions(adata):
    """Attach the sample/population annotation from ``metadata`` to ``adata.obs``.

    The observation table is inner-joined with the ``const.SAMPLE_COL`` and
    ``const.POPULATION_COL`` columns of the module-level ``metadata`` frame.
    No join key is given, so pandas joins on the columns both frames share.
    Because the join is an inner join, cells without a matching metadata row
    are dropped.

    Parameters
    ----------
    adata : AnnData
        Dataset whose ``obs`` should be annotated. It is not modified.

    Returns
    -------
    AnnData
        A copy of ``adata`` restricted to the cells that survived the join,
        with the merged table as its ``obs``.
    """
    original_index_name = adata.obs.index.name

    obs = adata.obs.reset_index()
    # reset_index() inserts the former index as the first column. For an
    # unnamed index pandas invents the column name, so read it back instead of
    # assuming that `index.name` is a usable column label (it may be None).
    index_col = obs.columns[0]

    merged = pd.merge(
        obs,
        metadata[[const.SAMPLE_COL, const.POPULATION_COL]],
        how='inner',
    ).set_index(index_col)
    # Restore the original (possibly missing) index name.
    merged.index.name = original_index_name

    annotated = adata[merged.index, :].copy()
    annotated.obs = merged
    return annotated

# Replace both datasets by annotated copies that carry the sample and
# population columns in `obs`.
adata = assign_conditions(adata)
adata_cor = assign_conditions(adata_cor)

In [None]:
# Count the cells per sample and population in the uncorrected dataset;
# margins=True adds row and column totals.
pd.crosstab(adata.obs[const.SAMPLE_COL], adata.obs[const.POPULATION_COL], margins=True)

In [None]:
# Variables and observations present in both the uncorrected and the corrected
# dataset; subset_molecules() reads these module-level names.
included_molecules = adata.var_names.intersection(adata_cor.var_names)
included_cells = adata.obs_names.intersection(adata_cor.obs_names)

def subset_molecules(adata):
    """Return a copy of ``adata`` limited to the shared cells and molecules.

    The selection uses the module-level ``included_cells`` and
    ``included_molecules`` indexes; the input object is left untouched.
    """
    shared_view = adata[included_cells, included_molecules]
    return shared_view.copy()

# Uncorrected dataset: remember the shape, then keep only the shared entries.
adata_shape = adata.shape
adata = subset_molecules(adata)

# Corrected dataset: same procedure.
adata_cor_shape = adata_cor.shape
adata_cor = subset_molecules(adata_cor)

# Report how the subsetting changed each dataset (before -> after).
print('uncorrected set: %s -> %s' % (adata_shape, adata.shape))
print('corrected set: %s -> %s' % (adata_cor_shape, adata_cor.shape))

First of all, the loaded datasets are filtered:

- cells need non-zero intensities for at least 10 ions.
- ions need non-zero intensities for at least 200 cells.

After that, the sets are preprocessed in different ways:

- intensities are normalized to TIC and/or log-transformed (log(x+1))

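Both datasets are also subset to contain the same ions and cells (intersection). Note that in this notebook the intersection is taken in the preceding cell, i.e. before the filtering and preprocessing below.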

In [None]:
def preprocess(adata):
    """Filter ``adata`` in place and add normalized and log-transformed layers.

    Steps, in order:

    1. Keep cells with at least 10 detected variables and variables detected
       in at least 200 cells.
    2. Store the filtered object in ``adata.raw`` and a copy of its matrix in
       the ``"raw_counts"`` layer.
    3. Normalize ``"raw_counts"`` twice, with ``target_sum=None`` into
       ``"norm_counts"`` and with ``target_sum=1e4`` into
       ``"1e4_norm_counts"``, then store ``log1p`` of each in
       ``"log_norm_counts"`` and ``"1e4_log_norm_counts"``.
    4. Make ``"1e4_log_norm_counts"`` the active ``X`` and record the
       per-variable median and mean of ``X`` in ``adata.var``.

    The function modifies ``adata`` and returns ``None``.
    """
    sc.pp.filter_cells(adata, min_genes=10)
    sc.pp.filter_genes(adata, min_cells=200)

    adata.raw = adata
    adata.layers["raw_counts"] = adata.X.copy()

    # (normalized layer, log-transformed layer, target sum)
    layer_specs = (
        ("norm_counts", "log_norm_counts", None),
        ("1e4_norm_counts", "1e4_log_norm_counts", 1e4),
    )

    # Create both normalized layers first, then both log layers, so the
    # layers are inserted in the same order as before.
    for norm_key, _, target in layer_specs:
        normalized = sc.pp.normalize_total(
            adata, layer='raw_counts', target_sum=target, inplace=False
        )
        adata.layers[norm_key] = normalized['X']

    for norm_key, log_key, _ in layer_specs:
        adata.layers[log_key] = sc.pp.log1p(adata.layers[norm_key], copy=True)

    adata.X = adata.layers["1e4_log_norm_counts"]

    adata.var['median_intensity'] = np.median(adata.X, axis=0)
    adata.var['mean_intensity'] = np.mean(adata.X, axis=0)
    
# Filter and normalize both datasets in place (preprocess() returns nothing).
preprocess(adata)
preprocess(adata_cor)

# Each dataset is filtered independently, so print both resulting shapes to
# compare them.
print(adata.shape)
print(adata_cor.shape)


In [None]:
def dimred_umap(adata, layer=None, min_dist=0.5):
    """Compute PCA, neighbors and a 2-D UMAP for ``adata`` and plot the result.

    Two figures are drawn: the scanpy UMAP colored by sample and population,
    and an ``outer_spacem`` highlight scatterplot with one panel per
    population, based on ``obsm["X_umap"]``.

    Parameters
    ----------
    adata : AnnData
        Dataset to embed. The scanpy calls store their results on this object.
    layer : str, optional
        If given, this layer temporarily replaces ``adata.X`` during the
        computation. The previous ``X`` is kept in the ``"default_X"`` layer,
        which remains on ``adata`` afterwards, and is restored at the end.
    min_dist : float, optional
        Passed to ``sc.tl.umap``.

    Returns
    -------
    None
    """
    if layer is not None:
        adata.layers['default_X'] = adata.X
        adata.X = adata.layers[layer]

    try:
        sc.pp.pca(adata)
        sc.pp.neighbors(adata, n_neighbors=50, metric='cosine')
        sc.tl.umap(adata, min_dist=min_dist, spread=1.0, random_state=1, n_components=2)
        sc.pl.umap(adata, color=[const.SAMPLE_COL, const.POPULATION_COL], palette='cividis')
        f = osm.pl.highlight_scatterplot(
            data=adata,
            obsm_key="X_umap",
            hue=const.POPULATION_COL,
            col=const.POPULATION_COL,
            palette="cividis",
            trim_axes=True,
            height=5,
            scatter_kwargs=dict(s=5),
        )

        f.add_legend(markerscale=3)
    finally:
        # Restore the original matrix even if embedding or plotting failed;
        # otherwise later cells would silently operate on the swapped layer.
        if layer is not None:
            adata.X = adata.layers['default_X']

In [None]:
def intermixing_layer(adata, adata_cor, condition_name, measures=None, layer=None):
    """Run ``ev.intermixing`` on the uncorrected and corrected datasets.

    Parameters
    ----------
    adata : AnnData
        Uncorrected dataset (labelled ``'uncorrected'``).
    adata_cor : AnnData
        Corrected dataset (labelled ``'ISM correction'``).
    condition_name : str
        ``obs`` column forwarded to ``ev.intermixing`` as ``condition_name``.
        Previously this argument was ignored and ``const.POPULATION_COL`` was
        always used.
    measures : list of str, optional
        Forwarded to ``ev.intermixing``. Defaults to ``['X_pca', 'X_umap']``.
    layer : str, optional
        If given, this layer temporarily replaces ``X`` in both datasets while
        the metric is computed. The previous ``X`` is kept in the
        ``"default_X"`` layer, which remains on the objects afterwards, and is
        restored at the end.

    Returns
    -------
    Whatever ``ev.intermixing`` returns. Previously the result was discarded
    and ``None`` was returned.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if measures is None:
        measures = ['X_pca', 'X_umap']

    datasets = (adata, adata_cor)

    if layer is not None:
        # Look up the layer in both datasets before touching either, so a
        # missing layer cannot leave only one dataset swapped.
        replacements = [dataset.layers[layer] for dataset in datasets]
        for dataset, replacement in zip(datasets, replacements):
            dataset.layers['default_X'] = dataset.X
            dataset.X = replacement

    try:
        summaries = ev.intermixing(
            {'uncorrected': adata, 'ISM correction': adata_cor},
            condition_name=condition_name,
            measures=measures,
        )
    finally:
        # Restore the original matrices even if the evaluation failed.
        if layer is not None:
            for dataset in datasets:
                dataset.X = dataset.layers['default_X']

    return summaries
    

The different options for scaling and transforming the data are shown in the following:

1. TIC normalization and log transformation
2. normalization to a fixed count (10^4) and log transformation

The normalization to a fixed count has a slightly different effect than TIC normalization. The former normalizes all counts per cell to the given target sum so that all cells from the uncorrected and the corrected set are scaled to this count. In contrast, the latter retains the differences between the datasets by normalizing to the median count across cells in a dataset. Across modes of normalization, the corrected dataset shows a better separation, both visually and according to the intermixing metric. However, this effect is very subtle once the data has been log-transformed.

### Uncorrected dataset, normalized to TIC

In [None]:
# Embed and plot the uncorrected dataset using the log-transformed layer that
# was normalized with target_sum=None.
dimred_umap(adata, layer='log_norm_counts')

### Corrected dataset, normalized to TIC

In [None]:
# Embed and plot the corrected dataset using the log-transformed layer that
# was normalized with target_sum=None.
dimred_umap(adata_cor, layer='log_norm_counts')

### Performance metrics, normalized to TIC

In [None]:
# Separation metrics on the log-transformed layer normalized with target_sum=None.
intermixing_layer(adata, adata_cor, const.POPULATION_COL, measures = ['X_pca', 'X_umap'], layer='log_norm_counts')
# `condition_name` is not defined at notebook level in the visible cells; pass
# the same population column that is used for the intermixing metric above.
ev.analyse_svm_margin(adata, adata_cor, const.POPULATION_COL, layer='log_norm_counts')

### Uncorrected dataset, normalized to a fixed count

In [None]:
# Embed and plot the uncorrected dataset using the log-transformed layer that
# was normalized to a fixed target sum of 1e4.
dimred_umap(adata, layer='1e4_log_norm_counts')

### Corrected dataset, normalized to a fixed count

In [None]:
# Embed and plot the corrected dataset using the log-transformed layer that
# was normalized to a fixed target sum of 1e4.
dimred_umap(adata_cor, layer='1e4_log_norm_counts')

### Performance metrics, normalized to a fixed count

In [None]:
# Separation metrics on the log-transformed layer normalized to a target sum of 1e4.
intermixing_layer(adata, adata_cor, const.POPULATION_COL, measures = ['X_pca', 'X_umap'], layer='1e4_log_norm_counts')
# The function lives in the `ev` module (it is never imported by bare name),
# and `condition_name` is not defined at notebook level in the visible cells;
# pass the same population column that is used for the intermixing metric above.
ev.analyse_svm_margin(adata, adata_cor, const.POPULATION_COL, layer='1e4_log_norm_counts')