In [1]:
import numpy as np
import scanpy as sc
import scipy
from scipy.sparse import csr_matrix
import scanpy.external as sce
import pandas as pd
import matplotlib.pyplot as plt
from composition_stats import clr
import anndata as ad
from muon import prot as pt

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
from anndata import AnnData

# Maps each hashtag antibody name (as it appears in the HashSolo `Classification`
# column) to the thaw/sample label it tagged. Every antibody in this panel follows
# the same naming pattern, with catalogue number C0250 + hashtag number, so the
# keys are generated from the hashtag number. Insertion order matches the
# hand-written table this replaces.
hashtag_to_thaw = {
    f"TotalSeq-C{250 + hashtag_number:04d} anti-human Hashtag {hashtag_number} Antibody": thaw_label
    for hashtag_number, thaw_label in [
        (1, "M-A3-Unsti"),
        (5, "U-A3-Unsti"),
        (9, "Z-A3-Unsti"),
        (3, "M-A3-6th"),
        (4, "M-A3-9th"),
        (7, "U-A3-7th"),
        (10, "Z-A3-1st"),
        (2, "M-A3-3rd"),
        (6, "U-A3-4th"),
        (8, "U-A3-10th"),
        (12, "Z-A3-5th"),
        (13, "Z-A3-9th"),
    ]
}

def over_input(dataframe, hashtag_names):
    """Replace ``dataframe.obs`` with the integer counts of the hashtag features.

    HashSolo reads the hashtag counts from ``.obs`` columns, so the columns of
    ``dataframe.X`` named in ``hashtag_names`` are copied there, one obs column
    per hashtag, cast to ``int64``.

    Only the requested columns are densified. The previous version converted
    the whole cells x features matrix to dense twice, and one of those copies
    fed a CLR call whose result was thrown away.

    Parameters
    ----------
    dataframe
        AnnData-like object exposing ``X`` (dense or scipy-sparse),
        ``var_names`` (a pandas Index) and ``obs``.
    hashtag_names
        Iterable of feature names to move into ``.obs`` (list, Index, ...).

    Returns
    -------
    The same ``dataframe`` object. Its ``.obs`` now contains ONLY the hashtag
    count columns; any columns that were in ``.obs`` before are dropped.

    Raises
    ------
    KeyError
        If any requested name is not present in ``dataframe.var_names``.
    """
    hashtag_names = list(hashtag_names)

    # get_indexer_for also copes with a non-unique var index; -1 marks a miss.
    col_idx = dataframe.var_names.get_indexer_for(hashtag_names)
    if (col_idx < 0).any():
        missing = [name for name in hashtag_names if name not in dataframe.var_names]
        raise KeyError(f"hashtag features not found in var_names: {missing}")

    # Slice first, densify second: only (n_cells x n_hashtags) values are materialised.
    counts = dataframe.X[:, col_idx]
    if scipy.sparse.issparse(counts):
        counts = counts.toarray()
    counts = np.asarray(counts).astype(float).astype(np.int64)

    dataframe.obs = pd.DataFrame(
        counts,
        index=dataframe.obs.index,
        columns=dataframe.var_names[col_idx],
    )
    return dataframe

# Load the three sequencing libraries up front; order defines the library numbering used later.
library_paths = (
    'h5ads/lib_1.h5ad',
    'h5ads/lib_2.h5ad',
    'h5ads/lib_3.h5ad',
)
file_list = [sc.read_h5ad(library_path) for library_path in library_paths]

# Per-library labels sized for the unfiltered libraries. Nothing below reads this
# list; batch labels are assigned per library inside the loop instead.
labels = 28291*["lib1"] +  29401* ["lib2"]+ 29546*["lib3"]

new_l = []
# Number libraries from 1 so the batch label matches the lib_<n>.h5ad file it came from
# (a zero-based index would label lib_1.h5ad as "lib_0").
for library_number, adata in enumerate(file_list, start=1):
    # Prep for HashSolo: move the hashtag counts into .obs.
    # Assumes the last 12 features of each library are the hashtag antibodies.
    adata = over_input(adata, adata.var_names[-12:]) # this is always slow
    # HashSolo classification (singlet hashtag / Doublet / Negative) for every cell
    sce.pp.hashsolo(adata, list(adata.obs.columns))

    classification = adata.obs["Classification"]
    # Counting by comparison yields 0 rather than a KeyError when a library has no doublets.
    doublet_rate = (classification == "Doublet").sum() / len(adata.obs.index)
    print(doublet_rate)
    print(classification.value_counts())

    # Remove doublets and negatives. Copy so the obs columns added below are written
    # to a real AnnData rather than into a view of `adata`.
    singlets = adata[~classification.isin(['Doublet', 'Negative'])].copy()
    # apply thaw map
    singlets.obs["Thaw"] = singlets.obs['Classification'].map(hashtag_to_thaw)
    singlets.obs["batch"] = f"lib_{library_number}"
    new_l.append(singlets)

for demultiplexed in new_l:
    print(demultiplexed)

# NOTE: cell barcodes can repeat across libraries, so obs names of the concatenated
# object are not guaranteed unique until obs_names_make_unique() is called.
pbmc_concat = ad.concat(new_l, merge="same")
pbmc_concat

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
0.32833763387649784
Classification
Doublet                                          9289
TotalSeq-C0251 anti-human Hashtag 1 Antibody     2578
TotalSeq-C0255 anti-human Hashtag 5 Antibody     2476
TotalSeq-C0259 anti-human Hashtag 9 Antibody     2411
TotalSeq-C0253 anti-human Hashtag 3 Antibody     1554
TotalSeq-C0257 anti-human Hashtag 7 Antibody     1546
TotalSeq-C0254 anti-human Hashtag 4 Antibody     1538
TotalSeq-C0260 anti-human Hashtag 10 Antibody    1476
TotalSeq-C0252 anti-human Hashtag 2 Antibody     1350
TotalSeq-C0256 anti-human Hashtag 6 Antibody     1335
TotalSeq-C0258 anti-human Hashtag 8 Antibody      963
TotalSeq-C0262 anti-human Hashtag 12 Antibody     946
TotalSeq-C0263 anti-human Hashtag 13 Antibody     724
Negative                                          105
Name: count, dtype: int64


  adata.obs["Thaw"] = adata.obs['Classification'].map(hashtag_to_thaw)


Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
0.3350906431754022
Classification
Doublet                                          9852
TotalSeq-C0251 anti-human Hashtag 1 Antibody     2651
TotalSeq-C0255 anti-human Hashtag 5 Antibody     2591
TotalSeq-C0259 anti-human Hashtag 9 Antibody     2425
TotalSeq-C0257 anti-human Hashtag 7 Antibody     1665
TotalSeq-C0253 anti-human Hashtag 3 Antibody     1605
TotalSeq-C0254 anti-human Hashtag 4 Antibody     1568
TotalSeq-C0260 anti-human Hashtag 10 Antibody    1450
TotalSeq-C0256 anti-human Hashtag 6 Antibody     1415
TotalSeq-C0252 anti-human Hashtag 2 Antibody     1350
TotalSeq-C0258 anti-human Hashtag 8 Antibody     1067
TotalSeq-C0262 anti-human Hashtag 12 Antibody     937
TotalSeq-C0263 anti-human Hashtag 13 Antibody     701
Negative                                          124
Name: count, dtype: int64


  adata.obs["Thaw"] = adata.obs['Classification'].map(hashtag_to_thaw)


Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2
0.3259324443241048
Classification
Doublet                                          9630
TotalSeq-C0255 anti-human Hashtag 5 Antibody     2714
TotalSeq-C0251 anti-human Hashtag 1 Antibody     2655
TotalSeq-C0259 anti-human Hashtag 9 Antibody     2418
TotalSeq-C0257 anti-human Hashtag 7 Antibody     1690
TotalSeq-C0260 anti-human Hashtag 10 Antibody    1598
TotalSeq-C0254 anti-human Hashtag 4 Antibody     1571
TotalSeq-C0253 anti-human Hashtag 3 Antibody     1561
TotalSeq-C0256 anti-human Hashtag 6 Antibody     1380
TotalSeq-C0252 anti-human Hashtag 2 Antibody     1379
TotalSeq-C0262 anti-human Hashtag 12 Antibody    1029
TotalSeq-C0258 anti-human Hashtag 8 Antibody     1017
TotalSeq-C0263 anti-human Hashtag 13 Antibody     767
Negative                                          137
Name: count, dtype: int64


  adata.obs["Thaw"] = adata.obs['Classification'].map(hashtag_to_thaw)


AnnData object with n_obs × n_vars = 18897 × 36750
    obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag 2 Antibody', 'TotalSeq-C0253 anti-human Hashtag 3 Antibody', 'TotalSeq-C0254 anti-human Hashtag 4 Antibody', 'TotalSeq-C0255 anti-human Hashtag 5 Antibody', 'TotalSeq-C0256 anti-human Hashtag 6 Antibody', 'TotalSeq-C0257 anti-human Hashtag 7 Antibody', 'TotalSeq-C0258 anti-human Hashtag 8 Antibody', 'TotalSeq-C0259 anti-human Hashtag 9 Antibody', 'TotalSeq-C0260 anti-human Hashtag 10 Antibody', 'TotalSeq-C0262 anti-human Hashtag 12 Antibody', 'TotalSeq-C0263 anti-human Hashtag 13 Antibody', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'Thaw', 'batch'
    var: 'gene_ids', 'feature_types'
AnnData object with n_obs × n_vars = 19425 × 36750
    obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 58101 × 36750
    obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag 2 Antibody', 'TotalSeq-C0253 anti-human Hashtag 3 Antibody', 'TotalSeq-C0254 anti-human Hashtag 4 Antibody', 'TotalSeq-C0255 anti-human Hashtag 5 Antibody', 'TotalSeq-C0256 anti-human Hashtag 6 Antibody', 'TotalSeq-C0257 anti-human Hashtag 7 Antibody', 'TotalSeq-C0258 anti-human Hashtag 8 Antibody', 'TotalSeq-C0259 anti-human Hashtag 9 Antibody', 'TotalSeq-C0260 anti-human Hashtag 10 Antibody', 'TotalSeq-C0262 anti-human Hashtag 12 Antibody', 'TotalSeq-C0263 anti-human Hashtag 13 Antibody', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'Thaw', 'batch'
    var: 'gene_ids', 'feature_types'

In [20]:
# Show the summary (shape, obs and var columns) of the concatenated object.
pbmc_concat

AnnData object with n_obs × n_vars = 58101 × 36750
    obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag 2 Antibody', 'TotalSeq-C0253 anti-human Hashtag 3 Antibody', 'TotalSeq-C0254 anti-human Hashtag 4 Antibody', 'TotalSeq-C0255 anti-human Hashtag 5 Antibody', 'TotalSeq-C0256 anti-human Hashtag 6 Antibody', 'TotalSeq-C0257 anti-human Hashtag 7 Antibody', 'TotalSeq-C0258 anti-human Hashtag 8 Antibody', 'TotalSeq-C0259 anti-human Hashtag 9 Antibody', 'TotalSeq-C0260 anti-human Hashtag 10 Antibody', 'TotalSeq-C0262 anti-human Hashtag 12 Antibody', 'TotalSeq-C0263 anti-human Hashtag 13 Antibody', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'Thaw', 'batch'
    var: 'gene_ids', 'feature_types'

In [21]:
# Split the concatenated object by feature position: the trailing 149 features go to
# `protein`, everything before them to `rna` (presumably the antibody panel is stored
# after the genes -- confirm against var['feature_types']).
# .copy() detaches each modality from pbmc_concat so later edits do not touch it.
n_protein_features = 149
feature_names = pbmc_concat.var_names
rna = pbmc_concat[:, feature_names[:-n_protein_features]].copy()
protein = pbmc_concat[:, feature_names[-n_protein_features:]].copy()

In [22]:
# Show both modality summaries to check their shapes after the split.
rna, protein

(AnnData object with n_obs × n_vars = 58101 × 36601
     obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag 2 Antibody', 'TotalSeq-C0253 anti-human Hashtag 3 Antibody', 'TotalSeq-C0254 anti-human Hashtag 4 Antibody', 'TotalSeq-C0255 anti-human Hashtag 5 Antibody', 'TotalSeq-C0256 anti-human Hashtag 6 Antibody', 'TotalSeq-C0257 anti-human Hashtag 7 Antibody', 'TotalSeq-C0258 anti-human Hashtag 8 Antibody', 'TotalSeq-C0259 anti-human Hashtag 9 Antibody', 'TotalSeq-C0260 anti-human Hashtag 10 Antibody', 'TotalSeq-C0262 anti-human Hashtag 12 Antibody', 'TotalSeq-C0263 anti-human Hashtag 13 Antibody', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'Thaw', 'batch'
     var: 'gene_ids', 'feature_types',
 AnnData object with n_obs × n_vars = 58101 × 149
     obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Has

In [24]:
def _drop_empty_features(adata):
    """Make cell names unique and drop features with zero total counts (both in place)."""
    # Barcodes can repeat across libraries after concatenation.
    adata.obs_names_make_unique()
    sc.pp.filter_genes(adata, min_counts=1)


def _preprocess_protein(adata):
    """Normalise the protein modality in place with CLR and return it.

    CLR is the normalisation described in the CITE-seq paper. muon's
    implementation already ends in a log1p, so no additional log transform is
    applied here; a second log1p would compress the protein signal twice.
    """
    _drop_empty_features(adata)
    pt.pp.clr(adata)
    return adata


def _preprocess_rna(adata, max_pct_mt=20, min_pct_ribo=5):
    """QC-filter and normalise the RNA modality; returns a NEW filtered AnnData.

    Parameters
    ----------
    adata
        RNA counts. Feature filtering, the mt/ribo/hb flags, QC metrics and the
        "counts" layer are added to this object in place.
    max_pct_mt
        Cells are kept only if ``pct_counts_mt`` is strictly below this
        percentage (0-100 scale).
    min_pct_ribo
        Cells are kept only if ``pct_counts_ribo`` is strictly above this
        percentage (0-100 scale, so 5 means 5 %).

    Returns
    -------
    A copy containing only the cells passing both filters, with X normalised
    to 1e4 counts per cell and log1p-transformed. Raw counts stay available in
    ``layers["counts"]``.
    """
    _drop_empty_features(adata)

    # Flag gene families by symbol prefix for the QC metrics.
    adata.var["mt"] = adata.var_names.str.startswith("MT-")              # mitochondrial
    adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))   # ribosomal
    adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")          # hemoglobin
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"],
                               percent_top=None, log1p=False, inplace=True)
    # Keep the raw counts before any normalisation touches X.
    adata.layers["counts"] = adata.X.copy()

    # Apply both cell filters with a single mask and take a real copy, so that the
    # in-place normalisation below does not operate on a view of a view.
    keep_cell = (
        (adata.obs["pct_counts_mt"] < max_pct_mt)
        & (adata.obs["pct_counts_ribo"] > min_pct_ribo)
    ).to_numpy()
    filtered = adata[keep_cell, :].copy()

    sc.pp.normalize_total(filtered, target_sum=1e4)
    sc.pp.log1p(filtered)
    return filtered


def _embed_and_integrate(adata, batch_key="batch"):
    """Run PCA, then Harmony batch correction on the PCA embedding (both in place)."""
    sc.pp.pca(adata)
    # The corrected embedding is stored separately from X_pca (obsm["X_pca_harmony"] by default).
    sce.pp.harmony_integrate(adata, batch_key)


# Index 0 is RNA and index 1 is protein; later cells rely on this order.
rna_prot = [_preprocess_rna(rna), _preprocess_protein(protein)]
for modality in rna_prot:
    _embed_and_integrate(modality)
# Neighbour graphs (needed for the WNN step) are not computed in this cell.

  view_to_actual(adata)
2023-06-22 15:58:50,153 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-06-22 15:59:40,440 - harmonypy - INFO - sklearn.KMeans initialization complete.
2023-06-22 15:59:40,723 - harmonypy - INFO - Iteration 1 of 10
2023-06-22 15:59:57,104 - harmonypy - INFO - Iteration 2 of 10
2023-06-22 16:00:15,884 - harmonypy - INFO - Iteration 3 of 10
2023-06-22 16:00:34,822 - harmonypy - INFO - Iteration 4 of 10
2023-06-22 16:00:54,339 - harmonypy - INFO - Iteration 5 of 10
2023-06-22 16:01:12,105 - harmonypy - INFO - Iteration 6 of 10
2023-06-22 16:01:29,098 - harmonypy - INFO - Iteration 7 of 10
2023-06-22 16:01:34,859 - harmonypy - INFO - Converged after 7 iterations
  warn("adata.X is sparse but not in CSC format. Converting to CSC.")
2023-06-22 16:01:45,116 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-06-22 16:02:14,998 - harmonypy - INFO - sklearn.KMeans initialization complete.
2023-06-22 16:02:15,331 - har

In [25]:
# Show both processed modalities to compare their shapes after QC.
rna_prot[0], rna_prot[1]

(AnnData object with n_obs × n_vars = 47024 × 28608
     obs: 'TotalSeq-C0251 anti-human Hashtag 1 Antibody', 'TotalSeq-C0252 anti-human Hashtag 2 Antibody', 'TotalSeq-C0253 anti-human Hashtag 3 Antibody', 'TotalSeq-C0254 anti-human Hashtag 4 Antibody', 'TotalSeq-C0255 anti-human Hashtag 5 Antibody', 'TotalSeq-C0256 anti-human Hashtag 6 Antibody', 'TotalSeq-C0257 anti-human Hashtag 7 Antibody', 'TotalSeq-C0258 anti-human Hashtag 8 Antibody', 'TotalSeq-C0259 anti-human Hashtag 9 Antibody', 'TotalSeq-C0260 anti-human Hashtag 10 Antibody', 'TotalSeq-C0262 anti-human Hashtag 12 Antibody', 'TotalSeq-C0263 anti-human Hashtag 13 Antibody', 'most_likely_hypothesis', 'cluster_feature', 'negative_hypothesis_probability', 'singlet_hypothesis_probability', 'doublet_hypothesis_probability', 'Classification', 'Thaw', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb'
     var: 'gene_ids', 'featu

In [33]:
# Restrict the protein modality to the cells that survived RNA QC, in RNA cell order.
# Copy so the neighbour graph is written to a real AnnData rather than into a view.
subset_protein = rna_prot[1][rna_prot[0].obs.index, :].copy()
# Build the graph on the Harmony-corrected embedding. Without use_rep, neighbors()
# falls back to the uncorrected X_pca and the batch correction above is ignored.
sc.pp.neighbors(subset_protein, use_rep="X_pca_harmony")

In [34]:
# Persist the QC'd RNA modality and the cell-matched protein modality.
for processed_modality, output_path in (
    (rna_prot[0], 'h5ads/correct_rna.h5ad'),
    (subset_protein, 'h5ads/correct_protein.h5ad'),
):
    processed_modality.write_h5ad(output_path)