In [11]:
import scanpy as sc
import pandas as pd

# get marker genes

In [3]:
# Load the AnnData object stored at './4.ann.h5ad' (presumably the output of
# the preceding, annotation step of this numbered pipeline -- confirm) and
# show its summary so the available obs/var/uns fields are visible.
adata = sc.read('./4.ann.h5ad')
adata

AnnData object with n_obs × n_vars = 40350 × 4000
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier', 'batch', 'leiden_res', 'cluster', 'cluster_name'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'hvg', 'leiden_res', 'leiden_res_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_pca_raw', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [6]:
# Show the distinct cluster annotations that the marker ranking will be grouped by.
adata.obs['cluster_name'].unique().tolist()

['ISC/EB-prol',
 'EC-like-2',
 'EC',
 'ISC/EB',
 'Cardia-2',
 'EE-3',
 'Cardia-1',
 'EE-1',
 'EC-like-1',
 'EE-2',
 'EC-like-3',
 '26',
 'HC',
 'VM',
 'FBC',
 '25']

In [7]:
# Rank genes for every 'cluster_name' group with a Wilcoxon rank-sum test.
# pts=True additionally stores the fraction of expressing cells per group,
# which the marker table built later in this notebook relies on.
# The result is stored under its own .uns key instead of scanpy's default one.
sc.tl.rank_genes_groups(
    adata,
    groupby="cluster_name",
    method="wilcoxon",
    pts=True,
    key_added='rank_genes_groups.wilcoxon',
)

In [8]:
# Filter the Wilcoxon ranking stored under 'rank_genes_groups.wilcoxon' and
# write the filtered copy to a separate .uns key, leaving the original intact.
# No thresholds are passed, so scanpy's default fold-change / expression
# fraction cut-offs apply (see the scanpy docs for the exact values).
sc.tl.filter_rank_genes_groups(
    adata,
    key="rank_genes_groups.wilcoxon",
    key_added="rank_genes_groups.wilcoxon.filter",
)

In [9]:
# Inspect the unfiltered Wilcoxon result: run parameters plus the per-group
# tables (e.g. 'pts') that were written to .uns by the ranking step.
adata.uns['rank_genes_groups.wilcoxon']

{'params': {'groupby': 'cluster_name',
  'reference': 'rest',
  'method': 'wilcoxon',
  'use_raw': True,
  'layer': None,
  'corr_method': 'benjamini-hochberg'},
 'pts':                 ISC/EB  ISC/EB-prol  Cardia-1  Cardia-2        EC  EC-like-1  \
 LOC110674347  0.002252     0.001762  0.003091  0.006073  0.017579   0.010849   
 LOC110674348  0.000000     0.000160  0.000000  0.000000  0.000088   0.000000   
 LOC5575839    0.051158     0.018421  0.018547  0.024107  0.061616   0.032113   
 LOC5575838    0.000322     0.000160  0.000000  0.000184  0.000439   0.000434   
 LOC5575837    0.201416     0.090982  0.120556  0.162311  0.439044   0.231882   
 ...                ...          ...       ...       ...       ...        ...   
 CFI06_mgt20   0.001931     0.000160  0.003091  0.002760  0.012393   0.008245   
 CFI06_mgr01   0.614865     0.388916  0.608964  0.649245  0.896106   0.839867   
 CFI06_mgt21   0.000000     0.000000  0.000000  0.000000  0.000000   0.000000   
 CFI06_mgt22   0.0000

In [12]:
# Build one tidy marker-gene table (one row per retained gene per cluster)
# from the filtered Wilcoxon ranking.
#
# All statistics are read from the SAME filtered result so the columns cannot
# drift apart. Per the scanpy docs, filter_rank_genes_groups keeps the layout
# of the original result and only sets the *names* of rejected genes to NaN,
# so "name is not NaN" is the explicit criterion for a retained row.
filtered = adata.uns['rank_genes_groups.wilcoxon.filter']

# Rank-ordered tables: one column per cluster, one row per rank position.
name = pd.DataFrame(filtered['names'])
padj = pd.DataFrame(filtered['pvals_adj'])
lgFC = pd.DataFrame(filtered['logfoldchanges'])
scores = pd.DataFrame(filtered['scores'])

# Gene-indexed tables (available because the ranking was run with pts=True):
# fraction of expressing cells inside each cluster / in the remaining cells.
pts = filtered['pts']
pts_rest = filtered['pts_rest']

dfs = []
for cluster in name.columns:
    # The four rank-ordered columns share the same positional index, so they
    # line up row by row.
    stats = pd.DataFrame(
        {
            'names': name[cluster],
            'pvals_adj': padj[cluster],
            'logfoldchanges': lgFC[cluster],
            'scores': scores[cluster],
        }
    )
    # Drop rejected genes explicitly instead of relying on NaN index labels
    # failing to match in the merge below.
    stats = stats[stats['names'].notna()].set_index('names')

    pts_df = pd.concat([pts[[cluster]], pts_rest[[cluster]]], axis=1)
    pts_df.columns = ['pts', 'pts_rest']

    # Attach the expression fractions by gene name.
    df = stats.merge(pts_df, left_index=True, right_index=True, how='inner')
    # NOTE: this column holds the 'cluster_name' label, not a Leiden
    # resolution; the historical column name is kept so that consumers of
    # the exported CSV keep working.
    df['leiden_res'] = cluster

    dfs.append(df)

markg = pd.concat(dfs, axis=0)
# Group rows by cluster label and put the strongest markers (highest score) first.
markg = markg.sort_values(['leiden_res', 'scores'], ascending=False)

markg

Unnamed: 0_level_0,pvals_adj,logfoldchanges,scores,pts,pts_rest,leiden_res
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LOC5580173,1.005996e-125,13.764691,24.259724,0.811258,0.006817,VM
LOC5573031,1.106460e-120,13.413625,23.748140,0.794702,0.008465,VM
LOC5580231,5.354777e-119,13.659195,23.567343,0.788079,0.006417,VM
LOC5576127,6.167170e-97,13.081758,21.294367,0.711921,0.003970,VM
LOC5568444,2.881767e-94,12.833956,20.993767,0.701987,0.004619,VM
...,...,...,...,...,...,...
LOC5573103,1.263470e-02,2.202772,3.612218,0.256545,0.133445,25
LOC5568355,5.356542e-02,1.280174,3.126205,0.450262,0.358575,25
LOC5567637,8.355446e-02,1.249147,2.962382,0.335079,0.273139,25
LOC5566119,9.247537e-02,1.141814,2.922727,0.345550,0.289026,25


In [13]:
# Re-display the object to confirm that both ranking results
# ('rank_genes_groups.wilcoxon' and its '.filter' copy) are now present in .uns.
adata

AnnData object with n_obs × n_vars = 40350 × 4000
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier', 'batch', 'leiden_res', 'cluster', 'cluster_name'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'hvg', 'leiden_res', 'leiden_res_colors', 'log1p', 'neighbors', 'pca', 'umap', 'rank_genes_groups.wilcoxon', 'rank_genes_groups.wilcoxon.filter'
    obsm: 'X_pca', 'X_pca_harmony', 'X_pca_raw', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [15]:
# Export the marker table; the gene-name index ('names') is written along
# with the statistic columns.
markg.to_csv('./5.marker_genes.csv')