## Notebook to merge the multiVI clustering info onto full anndata object
This notebook creates an anndata object where all features can be inspected relative to the clustering, for manual curation of cell-type labels. The multiVI anndata files only contain the top 20% of high-variance features.

In [None]:
# print the date/time at the start of the notebook run
!date

#### import libraries

In [None]:
import scanpy as sc
from anndata import AnnData
import numpy as np
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming
project = 'aging_phase2'  # prefix shared by the input and output file names below

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'
sc.settings.figdir = f'{figures_dir}/'  # point scanpy's figure directory at figures_dir

# in files
raw_anndata_file = f'{quants_dir}/{project}.raw.h5ad'
multivi_anndata_file = f'{quants_dir}/{project}.multivi.cellassign.h5ad'

# out files
new_full_anndata_file = f'{quants_dir}/{project}.full.h5ad'

# variables
DEBUG = False  # when True, cells additionally display samples of the obs/var tables

### load data

#### load the multiVI and CellAssign anndata file

In [None]:
%%time
# read the multiVI + CellAssign anndata file and print a summary of the object
adata_multivi = sc.read_h5ad(multivi_anndata_file)
print(adata_multivi)
if DEBUG:
    display(adata_multivi.obs.head())

#### load the raw anndata file

In [None]:
%%time
# read the raw (all features) anndata file and print a summary of the object
adata_raw = sc.read_h5ad(raw_anndata_file)
print(adata_raw)
if DEBUG:
    display(adata_raw.obs.head())

#### make sure we are using the same cells
If there are additional cells from the reference, drop those.

In [None]:
# keep only the multiVI cells whose obs names also occur in the raw object
in_raw = adata_multivi.obs_names.isin(adata_raw.obs_names)
adata_multivi = adata_multivi[in_raw]
print(adata_multivi)
if DEBUG:
    display(adata_multivi.obs.head())

### identify some of the attributes that are appropriate to copy over

#### differences in the obs

In [None]:
# collect the obs column names of each object once
multivi_obs_cols = set(adata_multivi.obs.columns)
raw_obs_cols = set(adata_raw.obs.columns)
# obs columns present in only one of the two objects
diff_cols = multivi_obs_cols.symmetric_difference(raw_obs_cols)
print(f'different: {diff_cols}')
# obs columns present in both objects
shared_cols = multivi_obs_cols.intersection(raw_obs_cols)
print(f'shared: {shared_cols}')

#### need to transfer these for obs
'RefCluster', 'Cell_type', 'sctypes', 'leiden_MultiVI', 'bakken', 'pangloadb'

#### differences in the var
expect many here that aren't appropriate to transfer as they are computed

In [None]:
# var columns present in only one of the two objects
diff_cols = set(adata_multivi.var.columns) ^ set(adata_raw.var.columns)
print(f'different: {diff_cols}')
# var columns present in both objects
shared_cols = set(adata_multivi.var.columns) & set(adata_raw.var.columns)
print(f'shared: {shared_cols}')

#### don't need to transfer any of the difference in the var columns

#### don't need to transfer any of the layers

#### check the following multi-dimensional and graph-based observation annotations (obsm & obsp) as well as the unstructured data (uns) annotations

- uns: 'leiden', 'leiden_MultiVI_colors', 'neighbors', 'phase1_celltype_colors', 'phase1_cluster_colors', 'sample_id_colors', 'umap'
- obsm: 'MultiVI_latent', 'X_umap'
- obsp: 'connectivities', 'distances'

In [None]:
# names of the multiVI annotations to inspect before deciding what to transfer
uns_items = ['leiden', 'leiden_MultiVI_colors', 'neighbors', 'phase1_celltype_colors',
             'phase1_cluster_colors', 'sample_id_colors', 'umap']
obsm_items = ['MultiVI_latent', 'X_umap']
obsp_items = ['connectivities', 'distances']

# uns entries are small, so print their content as well as their type
for item_name in uns_items:
    item = adata_multivi.uns[item_name]
    print(item_name, type(item), item)
    # an exact `type(...) is ndarray` test is False for ndarray subclasses and
    # sparse matrices, so report the shape of anything that exposes one
    if hasattr(item, 'shape'):
        print(item.shape)

# obsm/obsp entries are per-cell arrays/matrices: print only type and shape
for attr_name, item_names in (('obsm', obsm_items), ('obsp', obsp_items)):
    container = getattr(adata_multivi, attr_name)
    for item_name in item_names:
        item = container[item_name]
        print(item_name, type(item))
        if hasattr(item, 'shape'):
            print(item.shape)

### transfer the following attributes from the multiVI object to the raw object
1. obs : ['RefCluster', 'Cell_type', 'sctypes', 'leiden_MultiVI', 'bakken', 'pangloadb']
2. uns: ['leiden', 'leiden_MultiVI_colors', 'neighbors', 'phase1_celltype_colors', 'phase1_cluster_colors', 'sample_id_colors', 'umap']
3. obsm : ['MultiVI_latent', 'X_umap']
4. obsp : ['connectivities', 'distances']

Prior to transferring, we need to filter out the small number of cells that appear to have been removed during the multiVI analysis: 232388 (raw) - 232256 (multivi) = 132 (filtered)

#### filter to the same cell observations 

In [None]:
print(f'obs indices equal: {adata_raw.obs.index.equals(adata_multivi.obs.index)}')
# Select the raw cells by the multiVI obs names rather than with an `isin` mask:
# a mask keeps the raw object's own row order, so the two objects could hold the
# same cells in a different order, which would misalign the positional
# obsm/obsp arrays transferred later. Indexing by name yields the multiVI order.
# `.copy()` turns the resulting view into a real object so the annotation
# assignments that follow write to it directly.
adata_raw = adata_raw[adata_multivi.obs.index].copy()
print(adata_raw)
print(f'post filter, obs indices equal: {adata_raw.obs.index.equals(adata_multivi.obs.index)}')
if DEBUG:
    display(adata_raw.obs.head())

#### transfer the specified obs attributes

In [None]:
# The transfers in this notebook assume both objects hold the same cells in the
# same order. Stop here if they do not, instead of silently skipping the obs
# transfer and carrying on with an incomplete object.
if not adata_raw.obs.index.equals(adata_multivi.obs.index):
    raise ValueError('obs indices of the raw and multiVI objects differ; '
                     'cannot transfer the multiVI annotations')
for obs_col in ['RefCluster', 'Cell_type', 'leiden_MultiVI', 'sctypes', 'bakken', 'pangloadb']:
    adata_raw.obs[obs_col] = adata_multivi.obs[obs_col]
print(adata_raw)
if DEBUG:
    display(adata_raw.obs.sample(10))

#### transfer the uns attributes

In [None]:
# unstructured annotations (clustering/neighbors/umap parameters and colour palettes) to carry over
uns_keys = ('leiden', 'leiden_MultiVI_colors', 'neighbors',
            'phase1_celltype_colors', 'phase1_cluster_colors',
            'sample_id_colors', 'umap')
for uns_key in uns_keys:
    source_value = adata_multivi.uns[uns_key]
    adata_raw.uns[uns_key] = source_value
print(adata_raw)

#### transfer the obsm attributes

In [None]:
# obsm arrays are matched to cells by row position only, so both objects must
# list the same cells in the same order before copying them across
if not adata_raw.obs_names.equals(adata_multivi.obs_names):
    raise ValueError('obs indices of the raw and multiVI objects differ; '
                     'cannot transfer obsm arrays')
for obsm_item in ['MultiVI_latent', 'X_umap']:
    adata_raw.obsm[obsm_item] = adata_multivi.obsm[obsm_item]
print(adata_raw)

#### transfer the obsp attributes

In [None]:
# obsp matrices are cell-by-cell and matched by position only, so both objects
# must list the same cells in the same order before copying them across
if not adata_raw.obs_names.equals(adata_multivi.obs_names):
    raise ValueError('obs indices of the raw and multiVI objects differ; '
                     'cannot transfer obsp matrices')
for obsp_item in ['connectivities', 'distances']:
    adata_raw.obsp[obsp_item] = adata_multivi.obsp[obsp_item]
print(adata_raw)

### typical data prep stuff

In [None]:
%%time
# drop genes with fewer than 3 total counts
sc.pp.filter_genes(adata_raw, min_counts=3)
# stash the un-normalized matrix in a layer before X is normalized in place
adata_raw.layers['counts'] = adata_raw.X.copy()  # preserve counts
# normalize each cell to a total of 1e4, then log1p-transform X
sc.pp.normalize_total(adata_raw, target_sum=1e4)
sc.pp.log1p(adata_raw)
# adata_raw.raw = adata_raw  # freeze the state in `.raw`
# flag (subset=False: do not drop) the top 2000 variable genes, computed from
# the preserved 'counts' layer rather than the normalized X
sc.pp.highly_variable_genes(adata_raw, n_top_genes=2000, subset=False, layer='counts', 
                            flavor='seurat_v3')
print(adata_raw)
if DEBUG:
    display(adata_raw.obs.sample(10))
    display(adata_raw.var.sample(10))    

### save the new full anndata object

In [None]:
%%time
# write the annotated full-feature object to the output h5ad file
adata_raw.write(new_full_anndata_file)

### try some of the typical visualizations

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
              else 'seaborn-bright')
# enter the style first, then the rc overrides, so the explicit figure
# size/dpi always win; both are restored when the block exits
with plt.style.context(style_name), rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
    sc.pl.umap(adata_raw, color=['leiden_MultiVI'],
               frameon=False, legend_loc='on data')

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
              else 'seaborn-talk')
# the 'talk' style sets its own figure.figsize, so enter the style first and
# the rc overrides second to keep the requested 8x8 figure
with plt.style.context(style_name), rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
    sc.pl.umap(adata_raw, color=['age'],
               frameon=False)

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
              else 'seaborn-talk')
# the 'talk' style sets its own figure.figsize, so enter the style first and
# the rc overrides second to keep the requested 8x8 figure
with plt.style.context(style_name), rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
    sc.pl.umap(adata_raw, color=['Cell_type'],
               frameon=False, legend_loc='on data')

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
              else 'seaborn-talk')
# the 'talk' style sets its own figure.figsize, so enter the style first and
# the rc overrides second to keep the requested 8x8 figure
with plt.style.context(style_name), rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
    sc.pl.umap(adata_raw, color=['RefCluster'],
               frameon=False, legend_loc='on data')

In [None]:
def plot_gene_in_umap(adata: AnnData, gene: str):
    """Plot the UMAP embedding of `adata` coloured by a single gene.

    Parameters
    ----------
    adata : AnnData
        Object passed to `sc.pl.umap`.
    gene : str
        Feature name to colour by; it is looked up in `adata.var.index`.

    Returns
    -------
    None
        The figure is drawn as a side effect. If `gene` is not in
        `adata.var.index`, a message is printed and nothing is plotted.
    """
    if gene not in adata.var.index:
        print(f'{gene} not present')
        return
    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
    # the old names were later removed; use whichever name is installed
    style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
                  else 'seaborn-talk')
    # the 'talk' style sets its own figure.figsize, so enter the style first
    # and the rc overrides second to keep the requested 8x8 figure
    with plt.style.context(style_name), rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
        sc.pl.umap(adata, color=gene)

- neuron, SNAP25
- GABAergic, GAD1
- Glutamatergic, GRIN1
- Microglia, CSF1R
- Astrocyte, GFAP
- Oligodendrocyte, PLP1

In [None]:
# draw one UMAP per gene of interest, in the listed order
genes_to_plot = ('SNAP25', 'GAD1', 'GRIN1', 'CSF1R',
                 'GFAP', 'PLP1', 'LRRK2', 'SNCA')
for gene_name in genes_to_plot:
    plot_gene_in_umap(adata_raw, gene_name)

In [None]:
# genes shown in the dot plots
markers = ['SNAP25', 'GAD1', 'GRIN1', 'CSF1R', 'GFAP', 'PLP1', 'LRRK2', 'SNCA']
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
              else 'seaborn-talk')
# enter the style first, then the rc overrides, so the explicit figure
# size/dpi are not replaced by the style's own values
with plt.style.context(style_name), rc_context({'figure.figsize': (12, 12), 'figure.dpi': 200}):
    sc.pl.dotplot(adata_raw, markers, groupby='leiden_MultiVI',
                  color_map='Blues', swap_axes=True, use_raw=False,
                  standard_scale='var')

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
style_name = ('seaborn-v0_8-talk' if 'seaborn-v0_8-talk' in plt.style.available
              else 'seaborn-talk')
# enter the style first, then the rc overrides, so the explicit figure
# size/dpi are not replaced by the style's own values
with plt.style.context(style_name), rc_context({'figure.figsize': (12, 12), 'figure.dpi': 200}):
    sc.pl.dotplot(adata_raw, markers, groupby='Cell_type',
                  color_map='Blues', swap_axes=True, use_raw=False,
                  standard_scale='var')

In [None]:
# print the date/time at the end of the notebook run
!date