In [1]:
import pandas as pd
import scanpy as sc

from MarkerCount.hicat import HiCAT, show_summary 
from MarkerCount.hicat import plot_marker_expression_profile

MarkerCount / HiCAT source code: https://github.com/combio-dku


### Load scRNA-seq data from CCA

In [2]:
# Directory holding the input data, relative to the working directory.
add = 'data/'
lung_h5ad = add + 'CCA_Lung.h5ad'

# Read the full lung dataset into an AnnData object.
adata = sc.read(lung_h5ad)

# Bare expression: display the AnnData summary as the cell output.
adata

AnnData object with n_obs × n_vars = 482351 × 36601
    obs: 'n_counts', 'n_genes', 'mito', 'Dataset', 'Organ_orig', 'Organ', 'Sample', 'Patient', 'Subtype', 'Tissue_site', 'Tissue', 'CancerType', 'DataOrgTis', 'PtOrgTis', 'SpOrgTis', 'leiden', 'cnv_status', 'cnv_leiden', 'cnv_score', 'cell_type_major'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'chromosome', 'start', 'end', 'gene_id', 'gene_name'
    uns: 'anno_2212_colors', 'anno_cat_colors', 'cnv', 'cnv_leiden_colors', 'cnv_neighbors', 'cnv_status_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

In [3]:
# Boolean mask over all cells: True where the major cell type is 'Endothelial'.
is_endothelial = adata.obs['cell_type_major'] == 'Endothelial'

# Independent copy of the endothelial cells, so later edits to adata_endo
# do not touch the full dataset.
adata_endo = adata[is_endothelial].copy()

# Bare expression: display the subset summary as the cell output.
adata_endo

AnnData object with n_obs × n_vars = 9990 × 36601
    obs: 'n_counts', 'n_genes', 'mito', 'Dataset', 'Organ_orig', 'Organ', 'Sample', 'Patient', 'Subtype', 'Tissue_site', 'Tissue', 'CancerType', 'DataOrgTis', 'PtOrgTis', 'SpOrgTis', 'leiden', 'cnv_status', 'cnv_leiden', 'cnv_score', 'cell_type_major'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'chromosome', 'start', 'end', 'gene_id', 'gene_name'
    uns: 'anno_2212_colors', 'anno_cat_colors', 'cnv', 'cnv_leiden_colors', 'cnv_neighbors', 'cnv_status_colors', 'hvg', 'leiden', 'neighbors', 'pca', 'umap'
    obsm: 'X_cnv', 'X_cnv_pca', 'X_cnv_umap', 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'cnv_neighbors_connectivities', 'cnv_neighbors_distances', 'connectivities', 'distances'

In [4]:
# Expression matrix of the endothelial subset as a DataFrame.
X_endo = adata_endo.to_df()

## Marker file to use
mkr_file = 'DB/cell_markers_EC.tsv'

# Annotation targets handed to HiCAT; the tissue and exclusion lists are left empty.
target_tissues = []  # ['Immune', 'General']
target_cell_types = ['Endothelial cell']
to_exclude = []

# All keyword options for the HiCAT call, collected in one place.
# NOTE(review): log_transformed=False presumably tells HiCAT that X_endo is not
# log-transformed -- confirm this matches how adata.X was preprocessed.
hicat_options = dict(
    log_transformed=False,
    target_tissues=target_tissues,
    target_cell_types=target_cell_types,
    minor_types_to_exclude=to_exclude,
    mkr_selector='100000',
    N_neighbors_minor=31,
    N_neighbors_subset=1,
    Clustering_algo='lv',
    Clustering_resolution=1,
    Clustering_base='pca',
    N_pca_components=15,
    cbc_cutoff=0.01,
    Target_FPR=0.05,
    pval_th=0.05,
    cycling_cell=False,
    copy_X=False,
    verbose=False,
)

# HiCAT returns a prediction table (df_pred, which carries a 'cell_type_subset'
# column used below) and a summary object.
df_pred, summary = HiCAT(X_endo, mkr_file, **hicat_options)


In [5]:
# Number of cells assigned to each predicted subset label.
df_pred['cell_type_subset'].value_counts()

In [6]:
# Keep only the subset-level labels and re-key them with the obs index of
# adata_endo. The relabelling is positional: row i of df_pred becomes row i of
# adata_endo.obs.
types_df = df_pred[['cell_type_subset']].set_axis(adata_endo.obs.index, axis=0)

# Store the labels as a 'cell_type_subset' column of the endothelial obs table.
adata_endo.obs.loc[types_df.index, ['cell_type_subset']] = types_df

In [7]:
# Merge the endothelial subset labels back into the full dataset.
#
# 'cell_type_subset' starts as a copy of the major cell types and must be a
# plain string column: if it inherited a categorical dtype from
# 'cell_type_major', writing subset names that are not existing categories
# would raise a TypeError. The cast therefore happens BEFORE the copy.
# ('cell_type_major' itself is still converted to str, as in the original cell.)
adata.obs['cell_type_major'] = adata.obs['cell_type_major'].astype(str)
adata.obs['cell_type_subset'] = adata.obs['cell_type_major'].copy()
adata_endo.obs['cell_type_subset'] = adata_endo.obs['cell_type_subset'].astype(str)

# Rows of the full dataset that are still labelled with the major type.
is_endo = adata.obs['cell_type_subset'] == 'Endothelial'

# The assignment below is positional (.values), so make sure the selected rows
# are exactly the cells of adata_endo, in the same order.
assert adata.obs.index[is_endo.to_numpy()].equals(adata_endo.obs.index), \
    "adata_endo cells do not match the 'Endothelial' rows of adata"

adata.obs.loc[is_endo, 'cell_type_subset'] = adata_endo.obs['cell_type_subset'].values

In [8]:
# Cell counts per label after merging the endothelial subsets into the full dataset.
subset_labels = adata.obs['cell_type_subset']
subset_labels.value_counts()

In [9]:
# Save the full dataset, now carrying the 'cell_type_subset' column, to a new file.
path = 'data/'
subset_h5ad = path + 'CCA_Lung_subset.h5ad'
adata.write(subset_h5ad)