In [1]:
import warnings
# Silence FutureWarning messages. The filter is installed before the remaining imports,
# so it is already active while those packages are loaded.
warnings.simplefilter(action='ignore', category=FutureWarning)
import os

import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import adjusted_mutual_info_score as AMI

from scipy import sparse
from matplotlib import rcParams
# Notebook-wide matplotlib defaults, applied in a single update.
rcParams.update({
    'pdf.fonttype': 42,  # enables correct plotting of text
    'figure.figsize': (12, 12),
})
import seaborn as sns
from scipy import sparse

sc.settings.verbosity =0

In [5]:
adata = sc.read('../MODALITY_INTEGRATION/objects/Integration_2ndQC_metadata.h5ad', compression='gzip')

In [6]:
adata

AnnData object with n_obs × n_vars = 179290 × 38224
    obs: 'sample', 'batch', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Deepscore_external', 'Doublet_Detection_doublets_scRNA', 'Doublet_Detection_doublets_score_scRNA', 'Doublet_Detection_doublets_scRNA5p', 'Doublet_Detection_doublets_score_scRNA5p', 'AMULET_doublets', 'AMULET_pval', 'AMULET_qval', 'Doublet_Detection_doublets_snRNA', 'Doublet_Detection_doublets_score_snRNA'
    uns: 'batch_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [7]:
mod = 'scRNA5p'

In [10]:
adata.X = adata.layers['counts'].copy()

In [11]:
adata.obs['batch'].value_counts()

scRNA      89047
snRNA      78978
scRNA5p    11265
Name: batch, dtype: int64

In [12]:
adata = adata[adata.obs['batch'].isin([mod])].copy()

In [13]:
adata.obs

Unnamed: 0,sample,batch,n_counts,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_rb,pct_counts_rb,Deepscore_external,Doublet_Detection_doublets_scRNA,Doublet_Detection_doublets_score_scRNA,Doublet_Detection_doublets_scRNA5p,Doublet_Detection_doublets_score_scRNA5p,AMULET_doublets,AMULET_pval,AMULET_qval,Doublet_Detection_doublets_snRNA,Doublet_Detection_doublets_score_snRNA
AAACCTGGTTTGTTGG-1-0-scRNA5p,lib_36,scRNA5p,857.0,548,857.0,113.0,13.185531,0.0,0.000000,Unclassified,0.0,0.469203,0.0,0.469203,False,,,,
AAAGATGGTCTAGGTT-1-0-scRNA5p,lib_36,scRNA5p,843.0,456,843.0,265.0,31.435350,1.0,0.118624,Unclassified,0.0,5.593929,0.0,5.593929,False,,,,
AACTCTTTCAGTACGT-1-0-scRNA5p,lib_36,scRNA5p,927.0,481,927.0,307.0,33.117584,1.0,0.107875,Unclassified,0.0,0.366766,0.0,0.366766,False,,,,
ACCTTTAGTCTCTCTG-1-0-scRNA5p,lib_36,scRNA5p,4606.0,1992,4606.0,713.0,15.479809,13.0,0.282241,Unclassified,0.0,5.606707,0.0,5.606707,False,,,,
ACGGGCTCATGGTTGT-1-0-scRNA5p,lib_36,scRNA5p,1495.0,594,1495.0,742.0,49.632107,1.0,0.066890,Unclassified,0.0,5.539242,0.0,5.539242,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTCCTCTCGGAAACG-1-3-scRNA5p,lib_09,scRNA5p,14882.0,3069,14882.0,5019.0,33.725307,25.0,0.167988,PT,0.0,0.947069,0.0,0.947069,False,,,,
TTTGCGCAGTGGCACA-1-3-scRNA5p,lib_09,scRNA5p,1799.0,599,1799.0,807.0,44.858257,3.0,0.166759,PT,0.0,0.203022,0.0,0.203022,False,,,,
TTTGGTTCAATCCGAT-1-3-scRNA5p,lib_09,scRNA5p,1563.0,773,1563.0,68.0,4.350608,2.0,0.127959,PT,0.0,0.088040,0.0,0.088040,False,,,,
TTTGGTTTCAAAGTAG-1-3-scRNA5p,lib_09,scRNA5p,1460.0,698,1460.0,171.0,11.712329,1.0,0.068493,PT,0.0,0.587635,0.0,0.587635,False,,,,


In [14]:
obs = pd.read_csv('../MODALITY_INTEGRATION/Multi_obs.csv', index_col=0)

  obs = pd.read_csv('../MODALITY_INTEGRATION/Multi_obs.csv', index_col=0)


In [15]:
obs['batch'].unique()

array(['snRNA', 'scRNA', 'scRNA5p'], dtype=object)

In [16]:
# Restrict the metadata table to the rows of the modality being processed.
mod_rows = obs['batch'] == mod
obs = obs[mod_rows]
# Keep only the part of each index label before the first underscore.
# Presumably this strips a suffix so the labels match adata.obs.index — verify.
obs.index = [barcode.partition('_')[0] for barcode in obs.index]

In [17]:
# Flag each cell whose barcode also appears in the metadata table's index.
# This is computed against `obs` as filtered by batch only.
adata.obs['In_final_obj'] = adata.obs.index.isin(obs.index)
adata.obs['In_final_obj'].value_counts()

True     8917
False    2348
Name: In_final_obj, dtype: int64

In [18]:
adata.obs['In_final_obj'] = adata.obs['In_final_obj'].astype('category')

In [19]:
obs = obs[obs['modality'].isin(['expression','paired'])]

In [20]:
adata.obs['Deepscore'] = obs['Imputed_Deepscore_external']

In [21]:
adata.obs['Deepscore']

AAACCTGGTTTGTTGG-1-0-scRNA5p    NaN
AAAGATGGTCTAGGTT-1-0-scRNA5p    NaN
AACTCTTTCAGTACGT-1-0-scRNA5p    NaN
ACCTTTAGTCTCTCTG-1-0-scRNA5p    NaN
ACGGGCTCATGGTTGT-1-0-scRNA5p    NaN
                               ... 
TTTCCTCTCGGAAACG-1-3-scRNA5p     PT
TTTGCGCAGTGGCACA-1-3-scRNA5p     PT
TTTGGTTCAATCCGAT-1-3-scRNA5p     PT
TTTGGTTTCAAAGTAG-1-3-scRNA5p     PT
TTTGTCAGTCACAAGG-1-3-scRNA5p     PT
Name: Deepscore, Length: 11265, dtype: object

In [22]:
adata = adata[adata.obs['In_final_obj'].astype(bool)].copy()

## Filter cells with few detected genes, following the same approach as for the scRNA-seq 3' data

In [24]:
# Mark cells for removal: fewer than 500 detected genes, unless labelled 'LEUK'.
# NOTE(review): the exemption reads 'Deepscore_external', not the 'Deepscore' column
# added earlier in this notebook — confirm that this is intended.
too_few_genes = adata.obs['n_genes_by_counts'] < 500
is_leuk = adata.obs['Deepscore_external'].isin(['LEUK'])
filter_mask = too_few_genes & ~is_leuk
np.sum(filter_mask)

1104

In [25]:
# Drop the marked cells and show the resulting object.
keep_cells = ~filter_mask
adata = adata[keep_cells].copy()
adata

AnnData object with n_obs × n_vars = 7813 × 38224
    obs: 'sample', 'batch', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'Deepscore_external', 'Doublet_Detection_doublets_scRNA', 'Doublet_Detection_doublets_score_scRNA', 'Doublet_Detection_doublets_scRNA5p', 'Doublet_Detection_doublets_score_scRNA5p', 'AMULET_doublets', 'AMULET_pval', 'AMULET_qval', 'Doublet_Detection_doublets_snRNA', 'Doublet_Detection_doublets_score_snRNA', 'In_final_obj', 'Deepscore'
    uns: 'batch_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [26]:
adata.obs['sample'].value_counts()

lib_36    2383
lib_10    2376
lib_38    2163
lib_09     891
Name: sample, dtype: int64

In [27]:
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor = 'seurat_v3', n_bins=1)

In [30]:
adata.write(f'objects/{mod}_raw.h5ad', compression='gzip')