# Whitelist construction

This notebook constructs whitelists for datasets based on existing annotations.

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import sparse
import matplotlib.pyplot as plt
import time
import os.path


## Allen

Single-cell

In [3]:
# Allen single-cell metadata: a per-barcode cluster assignment table and a
# per-cluster annotation table. The first line of the membership file is
# skipped (presumably its header row) and explicit column names are supplied.
allen_membership = pd.read_csv(
    '~/count_data/allen_metadata/sc/cluster.membership.csv',
    names=['barcode', 'cluster_id'],
    skiprows=1,
)
allen_annot = pd.read_csv(
    '~/count_data/allen_metadata/sc/cluster.annotation.csv'
)


The first 78 clusters in the annotation table are of acceptable quality; the remaining clusters are labeled "Low Quality" and are excluded.

In [18]:
# Collect the ids of every cluster whose class label is anything other than
# 'Low Quality', then report how many clusters survive the filter.
is_ok_quality = allen_annot['class_label'] != 'Low Quality'
ok_clusters = allen_annot.loc[is_ok_quality, 'cluster_id']
print(ok_clusters.shape[0])

78


In [21]:
# Split the composite barcode string into its two parts:
# the leading 16 characters are stored as the cell barcode and the trailing
# 3 characters as the sample label.
barcodes = allen_membership['barcode']
allen_membership['cell_barcode'] = barcodes.str.slice(stop=16)
allen_membership['sample'] = barcodes.str.slice(start=-3)

# Store cluster ids as a categorical column.
allen_membership['cluster_id'] = allen_membership['cluster_id'].astype('category')


In [35]:
# Write one barcode whitelist per Allen single-cell sample, keeping only the
# barcodes assigned to a cluster listed in `ok_clusters`.

# np.savetxt does not create missing directories, so make sure the output
# folder exists before the first write (no-op when it is already there).
os.makedirs('./whitelists', exist_ok=True)

# Cluster membership in `ok_clusters` does not depend on the sample, so the
# mask is computed once instead of on every loop iteration.
in_ok_cluster = allen_membership['cluster_id'].isin(ok_clusters)

for sample in ['H12','G12','A01','F01','B01','C01','D01','E01','F08','G08','A08','B08']:
    bcs = allen_membership.loc[(allen_membership['sample'] == sample) & in_ok_cluster, 'cell_barcode']
    print(f'Sample: {sample}. High-quality barcodes in annotations: {len(bcs)}.')
    # '%16s' writes each barcode right-aligned in a 16-character field, one per line.
    np.savetxt(f'./whitelists/{sample}_wl.txt', np.asarray(bcs, dtype=str), fmt='%16s')


Sample: H12. High-quality barcodes in annotations: 5405.
Sample: G12. High-quality barcodes in annotations: 5256.
Sample: A01. High-quality barcodes in annotations: 4520.
Sample: F01. High-quality barcodes in annotations: 4011.
Sample: B01. High-quality barcodes in annotations: 7416.
Sample: C01. High-quality barcodes in annotations: 7624.
Sample: D01. High-quality barcodes in annotations: 4966.
Sample: E01. High-quality barcodes in annotations: 5271.
Sample: F08. High-quality barcodes in annotations: 6821.
Sample: G08. High-quality barcodes in annotations: 7560.
Sample: A08. High-quality barcodes in annotations: 5915.
Sample: B08. High-quality barcodes in annotations: 6418.


Single-nucleus

In [2]:
# Allen single-nucleus metadata: a per-barcode cluster assignment table and a
# per-cluster annotation table. The first line of the membership file is
# skipped (presumably its header row) and explicit column names are supplied.
allen_membership = pd.read_csv(
    '~/count_data/allen_metadata/sn/cluster.membership.csv',
    names=['barcode', 'cluster_id'],
    skiprows=1,
)
allen_annot = pd.read_csv(
    '~/count_data/allen_metadata/sn/cluster.annotation.csv'
)

# Ids of every cluster whose class label is anything other than 'Low Quality'.
is_ok_quality = allen_annot['class_label'] != 'Low Quality'
ok_clusters = allen_annot.loc[is_ok_quality, 'cluster_id']
print(ok_clusters.shape[0])

# Leading 16 characters -> cell barcode; trailing 3 characters -> sample label.
barcodes = allen_membership['barcode']
allen_membership['cell_barcode'] = barcodes.str.slice(stop=16)
allen_membership['sample'] = barcodes.str.slice(start=-3)

# Store cluster ids as a categorical column.
allen_membership['cluster_id'] = allen_membership['cluster_id'].astype('category')


67


In [3]:
# Write the barcode whitelist for the Allen single-nucleus sample, keeping only
# the barcodes assigned to a cluster listed in `ok_clusters`.

# np.savetxt does not create missing directories, so make sure the output
# folder exists before the first write (no-op when it is already there).
os.makedirs('./whitelists', exist_ok=True)

# Cluster membership in `ok_clusters` does not depend on the sample, so the
# mask is computed once, outside the loop.
in_ok_cluster = allen_membership['cluster_id'].isin(ok_clusters)

for sample in ['A02']:
    bcs = allen_membership.loc[(allen_membership['sample'] == sample) & in_ok_cluster, 'cell_barcode']
    print(f'Sample: {sample}. High-quality barcodes in annotations: {len(bcs)}.')
    # '%16s' writes each barcode right-aligned in a 16-character field, one per line.
    np.savetxt(f'./whitelists/{sample}_wl.txt', np.asarray(bcs, dtype=str), fmt='%16s')


Sample: A02. High-quality barcodes in annotations: 8266.


## Andrews

In [20]:
# Andrews liver dataset: load the tab-separated metadata table.
meta_str = '/home/ggorin/datasets/liver_andrews/GSE185477_Final_Metadata.txt'
meta = pd.read_csv(
    meta_str,
    delimiter='\t',
)


In [26]:
# Write one whitelist per Andrews C72 library, using every barcode listed for
# that sample in the metadata table.

# np.savetxt does not create missing directories, so make sure the output
# folder exists before the first write (no-op when it is already there).
os.makedirs('./whitelists', exist_ok=True)

# Each entry: (sample id in the metadata, tag used in the output file name,
# label used in the printed summary).
for sample_id, tag, label in [('C72_RESEQ', 'sc', 'single-cell'),
                              ('C72_TST', 'sn', 'single-nucleus')]:
    bcs = np.asarray(meta.loc[meta['sample'] == sample_id, 'cell_barcode'], dtype=str)
    # '%16s' writes each barcode right-aligned in a 16-character field, one per line.
    np.savetxt(f'./whitelists/liver_c72_{tag}_wl.txt', bcs, fmt='%16s')
    print(f'Sample: C72 {label}. High-quality barcodes in annotations: {len(bcs)}.')


Sample: C72 single-cell. High-quality barcodes in annotations: 11219.
Sample: C72 single-nucleus. High-quality barcodes in annotations: 9054.


## 10x

For the 10x datasets, simply use the barcodes from the Cell Ranger filtered output.

In [27]:
# 10x 5k mouse brain cells: build the whitelist from the barcodes in the
# filtered feature-barcode matrix directory.
meta_str = '/home/ggorin/count_data/brain_5k_v3_cr/outs/filtered_feature_bc_matrix/barcodes.tsv'
meta = pd.read_csv(meta_str,sep='\t',header=None,names=['barcode'])

# Drop the last two characters of every barcode
# (presumably the '-1' suffix appended by Cell Ranger -- TODO confirm).
bcs = np.asarray(meta['barcode'].str[:-2],dtype=str)

# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('./whitelists', exist_ok=True)
# '%16s' writes each barcode right-aligned in a 16-character field, one per line.
np.savetxt('./whitelists/brain_5k_v3_wl.txt', bcs,fmt='%16s')
print(f'Sample: 10x 5k mouse brain cells. High-quality barcodes in annotations: {len(bcs)}.')


Sample: 10x 5k mouse brain cells. High-quality barcodes in annotations: 5483.


In [28]:
# 10x 5k mouse brain nuclei: build the whitelist from the barcodes in the
# filtered feature-barcode matrix directory.
meta_str = '/home/ggorin/count_data/brain_nuc_5k_v3_cr/outs/filtered_feature_bc_matrix/barcodes.tsv'
meta = pd.read_csv(meta_str,sep='\t',header=None,names=['barcode'])

# Drop the last two characters of every barcode
# (presumably the '-1' suffix appended by Cell Ranger -- TODO confirm).
bcs = np.asarray(meta['barcode'].str[:-2],dtype=str)

# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('./whitelists', exist_ok=True)
# '%16s' writes each barcode right-aligned in a 16-character field, one per line.
np.savetxt('./whitelists/brain_nuc_5k_v3_wl.txt', bcs,fmt='%16s')
print(f'Sample: 10x 5k mouse brain nuclei. High-quality barcodes in annotations: {len(bcs)}.')


Sample: 10x 5k mouse brain nuclei. High-quality barcodes in annotations: 5899.
