In [26]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

import scanpy as sc
import anndata as ad
import scvi


In [27]:
# Unique, non-missing gene symbols from the human GO table. Later cells use
# this array to keep only PBMC / pancreas genes that appear in it.
known_genes = (
    pd.read_csv('../data/goterms/human_go.csv')['Gene name']
    .dropna()   # remove missing symbols before de-duplicating
    .unique()
)

### PBMCs (Seurat v4 CITE-seq)

In [3]:
# Load the Seurat v4 PBMC CITE-seq dataset through scvi-tools' built-in loader
# (per the log below, the h5ad is reused from data/ when already downloaded).
pbmcs = scvi.data.pbmc_seurat_v4_cite_seq()

[34mINFO    [0m File data/pbmc_seurat_v4.h5ad already downloaded                                    


  res = method(*args, **kwargs)
  res = method(*args, **kwargs)
  res = method(*args, **kwargs)
  res = method(*args, **kwargs)


[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"orig.ident"[0m[1m][0m                                          
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.X                                                             
[34mINFO    [0m Using protein expression from adata.obsm[1m[[0m[32m'protein_counts'[0m[1m][0m                          
[34mINFO    [0m Using protein names from columns of adata.obsm[1m[[0m[32m'protein_counts'[0m[1m][0m                    
[34mINFO    [0m Successfully registered anndata object containing [1;36m152094[0m cells, [1;36m20729[0m vars, [1;36m24[0m      
         batches, [1;36m1[0m labels, and [1;36m217[0m proteins. Also registered [1;36m0[0m extra categorical covariates 
         and [1;36m0[0m extra continuous covariates.                                                  
[34mINFO    [0m Please do not further mo

In [23]:
# Build the PBMC reference / query split.
#
# For each donor P1..P8 one time point ('0', '3' or '7') is held out as the
# query set; every other cell goes to the reference. Highly variable genes are
# selected on the reference only and the query is restricted to the same genes.

PBMC_OBS_COLUMNS = ['orig.ident', 'lane', 'donor', 'time',
                    'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase']
PBMC_QUERY_SEED = 0  # fixed so the held-out donor/time pairs are reproducible

# Keep genes present in `known_genes`. A boolean mask preserves the original
# gene order (a set intersection would reorder columns differently in every
# Python session), and `.copy()` materialises the subset so the `.obs`
# assignments below do not operate on a view.
pbmcs = pbmcs[:, pbmcs.var_names.isin(known_genes)].copy()
pbmcs.obs = pbmcs.obs[PBMC_OBS_COLUMNS].copy()

# One randomly chosen time point per donor defines the query set.
query_rng = np.random.default_rng(PBMC_QUERY_SEED)
query = {('P%s' % i, str(query_rng.choice(['0', '3', '7']))) for i in range(1, 9)}

donor_col = pbmcs.obs['donor']
time_col = pbmcs.obs['time']
lane_col = pbmcs.obs['lane']

# Iterate the columns directly instead of `series[i]`: integer keys on a
# label-indexed Series rely on a deprecated positional fallback and are slow.
query_inds = np.fromiter(
    ((donor, time) in query for donor, time in zip(donor_col, time_col)),
    dtype=bool,
    count=pbmcs.n_obs,
)

# Batch label = donor_time_lane, used as the batch key for HVG selection.
pbmcs.obs['batch'] = (
    donor_col.astype(str) + '_' + time_col.astype(str) + '_' + lane_col.astype(str)
).to_numpy()

pbmcs_ref = pbmcs[~query_inds].copy()
pbmcs_query = pbmcs[query_inds].copy()

# HVGs are computed on the reference only so the query does not influence
# feature selection; `subset=True` trims `pbmcs_ref` in place.
# The 'seurat_v3' flavor expects count data in `.X`.
sc.pp.highly_variable_genes(
    pbmcs_ref,
    n_top_genes=2000,
    batch_key='batch',
    flavor='seurat_v3',
    subset=True
)

# Align the query to the reference's selected genes (same order).
pbmcs_query = pbmcs_query[:, pbmcs_ref.var_names].copy()


Trying to set attribute `.obs` of view, copying.
  df.loc[: int(n_top_genes), 'highly_variable'] = True


In [31]:
# Persist the PBMC reference and query splits as h5ad files.
for _pbmc_split, _pbmc_path in (
    (pbmcs_ref, '../data/scRNAseq-expts/pbmc_ref.h5ad'),
    (pbmcs_query, '../data/scRNAseq-expts/pbmc_query.h5ad'),
):
    _pbmc_split.write_h5ad(_pbmc_path)

... storing 'batch' as categorical
... storing 'batch' as categorical


### Pancreas
Reference/query setup adapted from the scvi-tools scArches tutorial:
https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scarches_scvi_tools.html

In [32]:
# Build the pancreas reference / query split.
#
# Cells sequenced with smartseq2 or celseq2 form the query; all other
# technologies form the reference. HVGs are selected on the reference only.
url = "https://figshare.com/ndownloader/files/24539828"
adata = sc.read("pancreas.h5ad", backup_url=url)

# Keep genes present in `known_genes`. The boolean mask preserves gene order
# (a set intersection would yield a session-dependent column order), and
# `.copy()` avoids assigning `.X` on a view.
adata = adata[:, adata.var_names.isin(known_genes)].copy()

# Round expression values to integers before the count-based HVG selection.
adata.X = adata.X.round()

# Boolean mask over cells: True for query technologies.
query = adata.obs['tech'].isin(["smartseq2", "celseq2"]).to_numpy()

adata_ref = adata[~query].copy()
adata_query = adata[query].copy()

# `subset=True` trims `adata_ref` in place to the selected genes.
# The 'seurat_v3' flavor expects count data in `.X`.
sc.pp.highly_variable_genes(
    adata_ref,
    n_top_genes=2000,
    batch_key="tech",
    flavor='seurat_v3',
    subset=True
)

# Align the query to the reference's selected genes (same order).
adata_query = adata_query[:, adata_ref.var_names].copy()


  df.loc[: int(n_top_genes), 'highly_variable'] = True


In [35]:
# Persist the pancreas reference and query splits as h5ad files.
panc_outputs = {
    '../data/scRNAseq-expts/panc_ref.h5ad': adata_ref,
    '../data/scRNAseq-expts/panc_query.h5ad': adata_query,
}
for _panc_path, _panc_split in panc_outputs.items():
    _panc_split.write_h5ad(_panc_path)

### Brain (Allen 10X / SmartSeq reference, Vizgen MERFISH)

In [4]:
# Parquet files for the three brain datasets, keyed by a short platform tag.
path_dict = {
    'tenx': '../data/vizgen/Allen_10X_2020.gzip',
    'smrt': '../data/vizgen/Allen_SmartSeq_2020.gzip',
    'vzgn': '../data/vizgen/Vizgen_MERFISH.gzip',
}

# Load each table and move its index into regular columns via reset_index().
tenx, smrt, vzgn = (
    pd.read_parquet(path_dict[tag]).reset_index()
    for tag in ('tenx', 'smrt', 'vzgn')
)

In [5]:
# Metadata (non-gene) columns carried into `.obs` of the reference AnnData.
index_cols = [
    'source',
    'level_0',
    'Level_3-subclass_label',
    'Level_5-cluster_label',
    'size',
]

# Columns present in all three tables, minus the metadata columns, in sorted order.
shared_genes = sorted(
    set(tenx.columns)
    .intersection(vzgn.columns, smrt.columns)
    .difference(index_cols)
)

In [17]:
# Tag every row with the platform it came from (overwrites any existing 'source').
for _frame, _platform in ((tenx, '10X'), (smrt, 'SmartSeq')):
    _frame['source'] = _platform

In [42]:
def _as_reference_adata(frame):
    """Wrap one expression table as an AnnData over `shared_genes`.

    The gene columns become a sparse CSR matrix, the `index_cols` metadata
    (indexed by 'level_0') becomes `.obs`, and `.var` is indexed by gene name.
    """
    return ad.AnnData(
        csr_matrix(frame[shared_genes].values),
        obs=frame[index_cols].set_index('level_0'),
        var=pd.DataFrame(index=shared_genes),
    )


# Stack the 10X and SmartSeq tables into a single reference object.
ref_full = ad.concat([_as_reference_adata(tenx), _as_reference_adata(smrt)])


In [59]:
# Stratified 80/20 train/validation split of the brain reference.
#
# Cells are grouped by (source, Level-3 subclass, Level-5 cluster) and 80% of
# each group (rounded down) is sampled without replacement for training; the
# remainder goes to validation. Groups too small to yield a training cell
# (e.g. a single cell) end up entirely in validation.
BRAIN_SPLIT_SEED = 0       # fixed so the split is reproducible across runs
BRAIN_TRAIN_FRACTION = 0.8

split_rng = np.random.default_rng(BRAIN_SPLIT_SEED)

train_inds = []
valid_inds = []
strata = ref_full.obs.groupby(
    ['source', 'Level_3-subclass_label', 'Level_5-cluster_label'],
    observed=True,  # skip empty category combinations if columns are categorical
)
for _, members in strata:
    cell_ids = members.index.to_numpy()
    n_train = int(len(cell_ids) * BRAIN_TRAIN_FRACTION)

    # Sample positions rather than labels so validation keeps the original
    # within-group order (no set iteration) and repeated obs names, if any,
    # are not silently collapsed.
    in_train = np.zeros(len(cell_ids), dtype=bool)
    in_train[split_rng.choice(len(cell_ids), n_train, replace=False)] = True

    train_inds.extend(cell_ids[in_train])
    valid_inds.extend(cell_ids[~in_train])


In [154]:
# Wrap the Vizgen MERFISH table as an AnnData over the shared gene panel.
merfish_counts = csr_matrix(vzgn[shared_genes].values)
# The first 11 columns are used as per-cell metadata.
# NOTE(review): confirm that exactly 11 leading columns are non-gene fields.
merfish_obs = vzgn.iloc[:, :11]
brain_merfish = ad.AnnData(
    merfish_counts,
    obs=merfish_obs,
    var=pd.DataFrame(index=shared_genes),
)



In [156]:
# Persist the brain reference (train), query (validation) and MERFISH objects.
#
# `brain_query` was not defined anywhere in the notebook, so this cell failed
# with a NameError on a fresh kernel. It is built here from the held-out
# validation cells of the train/valid split.
# NOTE(review): confirm this matches the definition from the original session.
brain_ref = ref_full[train_inds].copy()
brain_query = ref_full[valid_inds].copy()

brain_ref.write_h5ad('../data/scRNAseq-expts/brain_ref.h5ad')
brain_query.write_h5ad('../data/scRNAseq-expts/brain_query.h5ad')
brain_merfish.write_h5ad('../data/scRNAseq-expts/brain_merfish.h5ad')

... storing 'source' as categorical
... storing 'Level_3-subclass_label' as categorical
... storing 'Level_5-cluster_label' as categorical
... storing 'source' as categorical
... storing 'Level_3-subclass_label' as categorical
... storing 'Level_5-cluster_label' as categorical
... storing 'slice' as categorical
... storing 'replicate' as categorical
