In [2]:
import numpy as np
import scanpy as sc
import scipy
from scipy.sparse import csr_matrix
import scanpy.external as sce
import pandas as pd
import matplotlib.pyplot as plt
from composition_stats import clr
import anndata as ad
from muon import prot as pt

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [7]:
from anndata import AnnData

hashtag_to_thaw = {
    "TotalSeq-C0251 anti-human Hashtag 1 Antibody" : "M-A3-Unsti",
    "TotalSeq-C0255 anti-human Hashtag 5 Antibody" : "U-A3-Unsti", 
    "TotalSeq-C0259 anti-human Hashtag 9 Antibody" : "Z-A3-Unsti",  
    "TotalSeq-C0253 anti-human Hashtag 3 Antibody" : "M-A3-6th",    
    "TotalSeq-C0254 anti-human Hashtag 4 Antibody" : "M-A3-9th",  
    "TotalSeq-C0257 anti-human Hashtag 7 Antibody" : "U-A3-7th",
    "TotalSeq-C0260 anti-human Hashtag 10 Antibody": "Z-A3-1st",
    "TotalSeq-C0252 anti-human Hashtag 2 Antibody" : "M-A3-3rd",
    "TotalSeq-C0256 anti-human Hashtag 6 Antibody" : "U-A3-4th",
    "TotalSeq-C0258 anti-human Hashtag 8 Antibody" : "U-A3-10th",
    "TotalSeq-C0262 anti-human Hashtag 12 Antibody": "Z-A3-5th",
    "TotalSeq-C0263 anti-human Hashtag 13 Antibody": "Z-A3-9th"
}

def over_input(dataframe, hashtag_names): #Hashtag names must be a list
    df_hash = pd.DataFrame(columns=dataframe.var_names, index=dataframe.obs.index)
    res = np.array(dataframe.X.todense()) # this step is very wasteful
    # it makes no difference, but hashsolo says to do all QC before running it, move things up 
    clr(np.array(dataframe.X.todense()) + 1)#clr transform the hto, doesn't this transform everything? Yes!
    df_hash.iloc[:,:] = res
    dataframe.obs = df_hash.loc[:, hashtag_names]
    for i in hashtag_names: 
        dataframe.obs[i]=dataframe.obs[i].astype(float).astype(np.int64)
    return dataframe

file_list = [
     sc.read_h5ad('h5ads/lib_1.h5ad'), 
     sc.read_h5ad('h5ads/lib_2.h5ad'), 
     sc.read_h5ad('h5ads/lib_3.h5ad')
]
[print(file.var.feature_types[-12:]) for file in file_list]
# figure out how to get the TCRs separate from the normal RNA expression

raise Error

labels = 28291*["lib1"] +  29401* ["lib2"]+ 29546*["lib3"]
new_l = []
for index in range(len(file_list)):
    adata = file_list[index]
    # prep for hashsolo -- only with protein
    adata = over_input(adata, adata.var_names[-12:]) # this is always slow
    #get hashsolo classsifications for each sample
    sce.pp.hashsolo(adata, list(adata.obs.columns)) 
    doublet_rate = adata.obs.Classification.value_counts()["Doublet"] / len(adata.obs.index)
    print(doublet_rate)
    print(adata.obs.Classification.value_counts())
    #remove doublets and negatives -- remove doublets later??
    #adata = adata[~adata.obs['Classification'].isin(['Doublet', 'Negative'])]
    #apply thaw map
    adata.obs["Thaw"] = adata.obs['Classification'].map(hashtag_to_thaw)
    # add batch labels
    #adata.obs["batch"] = "lib"+str(index+1)
    new_l.append(adata)
    
#[print(file) for file in new_l]
pbmc_concat = ad.concat(new_l, merge="same")
pbmc_concat.obs["batch"] = labels
rna = pbmc_concat[:, pbmc_concat.var[:-149].index].copy() # make sure they are no longer connected
protein = pbmc_concat[:, pbmc_concat.var[-149:].index].copy()
rna.write_h5ad('h5ads/full_rna_after_qc.h5ad')
protein.write_h5ad('h5ads/full_protein_after_qc.h5ad')

TotalSeq-C0251 anti-human Hashtag 1 Antibody     Antibody Capture
TotalSeq-C0252 anti-human Hashtag 2 Antibody     Antibody Capture
TotalSeq-C0253 anti-human Hashtag 3 Antibody     Antibody Capture
TotalSeq-C0254 anti-human Hashtag 4 Antibody     Antibody Capture
TotalSeq-C0255 anti-human Hashtag 5 Antibody     Antibody Capture
TotalSeq-C0256 anti-human Hashtag 6 Antibody     Antibody Capture
TotalSeq-C0257 anti-human Hashtag 7 Antibody     Antibody Capture
TotalSeq-C0258 anti-human Hashtag 8 Antibody     Antibody Capture
TotalSeq-C0259 anti-human Hashtag 9 Antibody     Antibody Capture
TotalSeq-C0260 anti-human Hashtag 10 Antibody    Antibody Capture
TotalSeq-C0262 anti-human Hashtag 12 Antibody    Antibody Capture
TotalSeq-C0263 anti-human Hashtag 13 Antibody    Antibody Capture
Name: feature_types, dtype: category
Categories (2, object): ['Antibody Capture', 'Gene Expression']
TotalSeq-C0251 anti-human Hashtag 1 Antibody     Antibody Capture
TotalSeq-C0252 anti-human Hashtag 2 Antib

NameError: name 'Error' is not defined

In [10]:
rna_prot = [sc.read_h5ad("h5ads/full_rna_after_qc.h5ad"),  
            sc.read_h5ad("h5ads/full_protein_after_qc.h5ad")]

# looks good! just update the qc stuff and try again

for i in range(len(rna_prot)):
    # add batch labels
    #rna_prot[i].obs["batch"] = labels
    rna_prot[i].obs_names_make_unique() # this is probably not an inplace operation
    # elim all 0 zero count features
    sc.pp.filter_genes(rna_prot[i], min_counts=1)
    # protein qc
    if i == 1:
        # normalization described in cite seq paper
        pt.pp.clr(rna_prot[i])
    # rna qc
    else:
        # mitochondrial genes
        sc.pp.filter_genes(rna_prot[i], min_counts=1)
        rna_prot[i].var['mt'] = rna_prot[i].var_names.str.startswith('MT-') 
        # ribosomal genes
        rna_prot[i].var['ribo'] = rna_prot[i].var_names.str.startswith(("RPS","RPL"))
        # hemoglobin genes.
        rna_prot[i].var['hb'] = rna_prot[i].var_names.str.contains(("^HB[^(P)]"))

        sc.pp.calculate_qc_metrics(rna_prot[i], qc_vars=['mt','ribo','hb'], percent_top=None, log1p=False, inplace=True)
        # this part depends highly on library prep used 
    #     sc.pp.filter_cells(adata, min_genes=200)
    #     sc.pp.filter_genes(adata, min_cells=3)
        # depends highly on the distribution of data, only thing vaguely reliable between notebooks I've seen
        # filter for percent mito
        rna_prot[i] = rna_prot[i][rna_prot[i].obs['pct_counts_mt'] < 20, :]
        # filter for percent ribo > 0.05
        rna_prot[i] = rna_prot[i][rna_prot[i].obs['pct_counts_ribo'] > 5, :]
        # still need to apply normalization
        sc.pp.normalize_total(rna_prot[i], target_sum=1e4)

    # either modality need last normalization step
    sc.pp.log1p(rna_prot[i])
    # basic dim reduction for each modality
    sc.pp.pca(rna_prot[i])
    # batch correction
    sce.pp.harmony_integrate(rna_prot[i], 'batch') #
    # needs to be done for wnn step anyway
    sc.pp.neighbors(rna_prot[i]) 

# rna_prot[0].write_h5ad('h5ads/rna_doublet_last.h5ad')
# rna_prot[1].write_h5ad('h5ads/protein_doublet_last.h5ad')
rna_prot[0].write_h5ad('h5ads/true_rna_after_qc.h5ad')
rna_prot[1].write_h5ad('h5ads/true_protein_after_qc.h5ad')

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  view_to_actual(adata)
2023-06-22 13:15:39,243 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-06-22 13:16:13,489 - harmonypy - INFO - sklearn.KMeans initialization complete.
2023-06-22 13:16:13,878 - harmonypy - INFO - Iteration 1 of 10
2023-06-22 13:16:44,709 - harmonypy - INFO - Iteration 2 of 10
2023-06-22 13:17:15,005 - harmonypy - INFO - Iteration 3 of 10
2023-06-22 13:17:45,426 - harmonypy - INFO - Iteration 4 of 10
2023-06-22 13:18:14,902 - harmonypy - INFO - Iteration 5 of 10
2023-06-22 13:18:43,786 - harmonypy - INFO - Iteration 6 of 10
2023-06-22 13:19:10,611 - harmonypy - INFO - Iteration 7 of 10
2023-06-22 13:19:21,111 - harmonypy - INFO - Converged after 7 iterations
  warn("adata.X is sparse but not in CSC format. Converting to CSC.")
2023-06-22 13:19:36,327 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2023-06-22 13:20:20,027 - harmonypy - IN