In [1]:
import scvi
import scanpy as sc
import anndata
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Read in data

In [2]:
# Root directory holding one sub-folder of QC output per species.
# Change this single constant if the data are moved.
QC_ROOT = '/public21/home/sc90258/huangying/analysis/12.pan_mosquito/20250318_replot_mosq'

# Per-species QC'd AnnData files (one sub-folder per species code).
aae_path = f'{QC_ROOT}/aae/1.QC.h5ad'
alb_path = f'{QC_ROOT}/alb/1.QC.h5ad'
cxtri_path = f'{QC_ROOT}/cxtri/1.QC.h5ad'
cxpip_path = f'{QC_ROOT}/cxpip/1.QC.h5ad'

In [3]:
def _read_with_suffix(path, tag):
    """Read one AnnData file and append ``-<tag>`` to every cell name.

    The suffix keeps cell names distinguishable per species once the
    objects are concatenated.
    """
    loaded = sc.read(path)
    loaded.obs_names = [f'{barcode}-{tag}' for barcode in loaded.obs_names]
    return loaded


aae = _read_with_suffix(aae_path, 'aae')
alb = _read_with_suffix(alb_path, 'alb')
cxtri = _read_with_suffix(cxtri_path, 'cxtri')
cxpip = _read_with_suffix(cxpip_path, 'cxpip')

In [4]:
# Homolog table: one row per gene group, one column of gene IDs per species
# (columns used later: 'aae', 'alb', 'cxpip', 'cxtri').
homolog_csv = '/public21/home/sc90258/huangying/analysis/12.pan_mosquito/20240830_homo_gene/mosquito_all_homog.csv'
homog = pd.read_csv(homolog_csv)
homog

Unnamed: 0,aae,alb,cxpip,cxtri
0,LOC110673977,LOC109409628,LOC120416324,Ctri05375
1,LOC110673980,LOC115258594,LOC120427986,Ctri05761
2,LOC110673981,LOC109427304,LOC120415083,Ctri12482
3,LOC110673983,LOC109402949,LOC120425061,Ctri03408
4,LOC110673984,LOC134285458,LOC120428736,Ctri02287
...,...,...,...,...
8167,LOC5580326,LOC109420085,LOC120419188,Ctri05472
8168,LOC5580327,LOC109420086,LOC120419195,Ctri05473
8169,LOC5580329,LOC109420083,LOC120419194,Ctri05470
8170,LOC5580330,LOC109420082,LOC120422788,Ctri05866


In [5]:
def gname_to_aae(adata, sp, homolog_table=None):
    """Subset ``adata`` to genes with an aae homolog and rename them to it.

    Parameters
    ----------
    adata
        AnnData-like object whose ``var_names`` are gene IDs of species ``sp``.
    sp
        Column of the homolog table holding the gene IDs used by ``adata``
        (e.g. ``'alb'``, ``'cxtri'``, ``'cxpip'``).
    homolog_table
        DataFrame with at least the columns ``sp`` and ``'aae'``. Defaults to
        the notebook-level ``homog`` table when omitted.

    Returns
    -------
    A new, independent object containing only the genes found in column
    ``sp``, in their original order, with ``var_names`` replaced by the
    matching ``'aae'`` IDs. ``adata`` itself is not modified.
    """
    table = homog if homolog_table is None else homolog_table
    # Species gene ID -> aae gene ID.
    sp_to_aae = dict(table[[sp, 'aae']].values)

    # Single pass: the kept genes drive both the subsetting and the renaming.
    kept_genes = [gene for gene in adata.var_names if gene in sp_to_aae]

    # Copy explicitly so the rename acts on a real object rather than relying
    # on a view being converted implicitly when var_names is assigned.
    adata_homo = adata[:, kept_genes].copy()
    adata_homo.var_names = [sp_to_aae[gene] for gene in kept_genes]

    return adata_homo

In [6]:
# Show the aae object; it is merged as-is below (not passed through
# gname_to_aae), since the other species are renamed to aae gene IDs.
aae

AnnData object with n_obs × n_vars = 9239 × 19269
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [7]:
# Keep only alb genes listed in the homolog table and rename them to their
# aae counterparts, then display the result.
alb_homo = gname_to_aae(alb, 'alb')
alb_homo

AnnData object with n_obs × n_vars = 17538 × 8166
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [8]:
# Keep only cxtri genes listed in the homolog table and rename them to their
# aae counterparts, then display the result.
cxtri_homo = gname_to_aae(cxtri, 'cxtri')
cxtri_homo

AnnData object with n_obs × n_vars = 5548 × 8172
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [9]:
# Keep only cxpip genes listed in the homolog table and rename them to their
# aae counterparts, then display the result.
# NOTE(review): this variable ends in "_homog" while its siblings end in "_homo".
cxpip_homog = gname_to_aae(cxpip, 'cxpip')
cxpip_homog

AnnData object with n_obs × n_vars = 9844 × 8166
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

# Merge the per-species AnnData objects

In [10]:
# Stack the four species cell-wise. join="inner" keeps only genes present in
# every object; the mapping keys become the per-cell values of obs['batch'].
species_objects = {
    'aae': aae,
    'alb': alb_homo,
    'cxtri': cxtri_homo,
    'cxpip': cxpip_homog,
}
adata = anndata.concat(species_objects, join="inner", label="batch")
adata

AnnData object with n_obs × n_vars = 42169 × 8161
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier', 'batch'

In [11]:
# Check the merged cell names; each carries the species suffix added at load time.
adata.obs_names

Index(['CGATCGGTA_AACGTCCAA_AACAAGTGG-aae',
       'CGCCAATGA_AACGTCCAA_AACAAGTGG-aae',
       'TGCCTGATC_AACGTCCAA_AACAAGTGG-aae',
       'GAGCAGCTT_ACACCAACG_AACAAGTGG-aae',
       'TCCGTATCA_ACACCAACG_AACAAGTGG-aae',
       'AACGGACCT_ACCAGGTCA_AACAAGTGG-aae',
       'GCCTACGAT_ACCGTACTC_AACAAGTGG-aae',
       'TAACCTACC_ACCGTACTC_AACAAGTGG-aae',
       'GCGAGTAAC_ACCTGCTAC_AACAAGTGG-aae',
       'TGAGCGAAG_AGACGAAGT_AACAAGTGG-aae',
       ...
       'TGTGGACACTTACACGACCACCAAGTC-cxpip',
       'TGTGGACACTTACACGACCGGAATGCT-cxpip',
       'TGTGGACACTTACACGACCGTCAACTG-cxpip',
       'TGTGGACACTTCGAGGATGCCACATTG-cxpip',
       'TGTGGACACTTGAGACAGAAGATCTGC-cxpip',
       'TGTGGACACTTGCCGTCACTCTAACAC-cxpip',
       'TGTGGACACTTGCCGTCAGAGTGATCT-cxpip',
       'TGTGGACACTTGCCGTCAGAGTTCGTC-cxpip',
       'TGTGGACACTTGGTGACCCCGTGTCAA-cxpip',
       'TGTGGACACTTGGTGACCGGAGAGGAA-cxpip'],
      dtype='object', length=42169)

# Preprocessing data (normalize and log-transform)

In [12]:
# Keep the count information intact for scvi-tools models: the steps below
# overwrite adata.X in place, so a copy is stored in its own layer first.
adata.layers["counts"] = adata.X.copy()
# Normalize every cell to a common total (target_sum=1e6). Per scanpy's
# documentation, exclude_highly_expressed=True leaves very highly expressed
# genes out of the per-cell normalization factor.
sc.pp.normalize_total(
    adata, 
    target_sum=1e6, 
    exclude_highly_expressed=True
)
# Log-transform the normalized values in place (log(1 + x)).
sc.pp.log1p(adata)
# Store the normalized, log-transformed values in .raw to keep them safe.
adata.raw = adata

In [13]:
# Inspect the merged object after preprocessing.
adata

AnnData object with n_obs × n_vars = 42169 × 8161
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'mt_outlier', 'genes_outlier', 'batch'
    uns: 'log1p'
    layers: 'counts'

In [14]:
# Per-cell species label written by anndata.concat(..., label="batch").
adata.obs['batch']

CGATCGGTA_AACGTCCAA_AACAAGTGG-aae      aae
CGCCAATGA_AACGTCCAA_AACAAGTGG-aae      aae
TGCCTGATC_AACGTCCAA_AACAAGTGG-aae      aae
GAGCAGCTT_ACACCAACG_AACAAGTGG-aae      aae
TCCGTATCA_ACACCAACG_AACAAGTGG-aae      aae
                                     ...  
TGTGGACACTTGCCGTCACTCTAACAC-cxpip    cxpip
TGTGGACACTTGCCGTCAGAGTGATCT-cxpip    cxpip
TGTGGACACTTGCCGTCAGAGTTCGTC-cxpip    cxpip
TGTGGACACTTGGTGACCCCGTGTCAA-cxpip    cxpip
TGTGGACACTTGGTGACCGGAGAGGAA-cxpip    cxpip
Name: batch, Length: 42169, dtype: category
Categories (4, object): ['aae', 'alb', 'cxtri', 'cxpip']

In [15]:
# Persist the merged object as an .h5ad file; the relative path resolves
# against the current working directory.
adata.write('1.pre_merge.h5ad')