## Notebook to inspect the output from a couple of Cellbender runs on a sample of pools

- see how many of the predicted empty cells (ambient RNA) actually passed Cellranger filtering and are uncertain in cell-type assignment

- Cellbender was run in the cloud using Google Life Sciences batch submission with the Cellbender WDL

In [None]:
# shell out to `date` so the notebook output records when this run happened
!date

#### import libraries

In [None]:
from scanpy import read_h5ad
from pandas import read_csv
from os.path import exists
from pandas import DataFrame, concat
from seaborn import barplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

#### set notebook variables

In [None]:
# naming
# cohort label; used as the filename prefix of the anndata and sample info files below
cohort = 'aging'

# directories for initial setup
# every input location is derived from this single working directory
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
quants_dir = f'{wrk_dir}/demux'  # anndata file and per-pool `.best` tables are read from here
cellbend_dir = f'{wrk_dir}/cellbender'  # per-pool `*_out_cell_barcodes.csv` files are read from here
info_dir = f'{wrk_dir}/sample_info'

# in files
anndata_file = f'{quants_dir}/{cohort}.pegasus.leiden_085.subclustered.h5ad'
info_file = f'{info_dir}/{cohort}.pool_patient_sample_info.csv'

# out files
# (none defined in this notebook)

# constants
DEBUG = True  # when True, cells also display/print intermediate tables
lane_range = range(1, 9)  # lane numbers 1 through 8; combined with each pool to build pool names

### load data

### load the sample info data

In [None]:
# read the pool/patient/sample sheet and report its dimensions
info_df = read_csv(info_file)
print(f'shape of info {info_df.shape}')
if DEBUG:
    # first rows of the sheet, then the number of rows listed per pool
    display(info_df.head(), info_df['pool_name'].value_counts())

##### setup the pool names for file finding

In [None]:
# one expected name per (pool, lane) pair, pool-major so lanes of a pool stay adjacent
pool_names = [f'Aging_{pool}_SCRN_{lane}'
              for pool in info_df.pool_name.unique()
              for lane in lane_range]
print(f'{len(pool_names)} named pools expected')
if DEBUG:
    print(pool_names)

#### load Cellbender results

In [None]:
# collect the Cellbender cell-barcode table of every pool that has an
# output file on disk; pools without one are skipped
cellbender_barcodes = {}
for pool in pool_names:
    barcode_path = f'{cellbend_dir}/{pool}_out_cell_barcodes.csv'
    if not exists(barcode_path):
        continue
    # the file is read without a header row, so the barcodes sit in column 0
    pool_barcodes = read_csv(barcode_path, header=None)
    print(pool, len(pool_barcodes))
    cellbender_barcodes[pool] = pool_barcodes

#### load the demux results to get 10X and demux filtered results
Here we can load the demux `.best` files directly, since demux is run on the 10X filtered barcodes.

In [None]:
# read the tab-separated `.best` table for each pool that has Cellbender output
demux_barcodes = {}
for pool in cellbender_barcodes:
    best_table = read_csv(f'{quants_dir}/{pool}.best', sep='\t')
    print(pool, best_table.shape)
    demux_barcodes[pool] = best_table

### determine number of possible empty cells not filtered by 10X
and those that didn't drop out during genotype demultiplexing

In [None]:
def _retained_fraction(retained, total):
    """Return len(retained) / len(total) rounded to 3 places.

    Returns NaN when `total` is empty, so a pool without any barcodes of a
    given kind yields an undefined fraction instead of a ZeroDivisionError.
    """
    if not total:
        return float('nan')
    return round(len(retained) / len(total), 3)


tenx_empties = {}
tenx_fracs = {}
demux_empties = {}
demux_fracs = {}
for pool in cellbender_barcodes:
    print(pool)
    # barcodes Cellbender reported as cells (column 0 of the headerless csv)
    cb_cells = set(cellbender_barcodes[pool][0].values)
    demux_data = demux_barcodes[pool]
    demux_assigned = demux_data.loc[demux_data['DROPLET.TYPE'] == 'SNG']
    # every barcode in the .best table vs. only those typed as singlets
    tenx_cells = set(demux_data.BARCODE.values)
    demux_cells = set(demux_assigned.BARCODE.values)
    # "empties" = barcodes kept by the filter that Cellbender did not report as cells
    tenx_empties[pool] = tenx_cells - cb_cells
    demux_empties[pool] = demux_cells - cb_cells
    tenx_fracs[pool] = _retained_fraction(tenx_empties[pool], tenx_cells)
    demux_fracs[pool] = _retained_fraction(demux_empties[pool], demux_cells)
    if DEBUG:
        print(f'10x empties retained = {len(tenx_empties[pool])}')
        print(f'10x empties retained fraction = {tenx_fracs[pool]}')
        print(f'demuxlet empties retained = {len(demux_empties[pool])}')
        print(f'demuxlet empties retained fraction = {demux_fracs[pool]}')

### visualize the retained empties

In [None]:
with rc_context({'figure.figsize': (9, 9)}):
    # one long-format table: a row per (pool, filter stage) with its retained fraction
    fracs_by_stage = {'10X filtered': tenx_fracs, 'demuxlet SNG': demux_fracs}
    df = concat([
        DataFrame.from_dict(fracs, orient='index', columns=['frac']).assign(type=stage)
        for stage, fracs in fracs_by_stage.items()
    ])
    df['percent'] = df['frac'] * 100
    # pools along x, one bar per filter stage
    ax = barplot(x=df.index, y='percent', hue='type', data=df, palette='Purples')
    ax.grid(axis='y')
    plt.xticks(rotation=90)
    plt.tight_layout()
    ax.set_ylabel('percent empty')
    ax.set_title('Retained ambient RNA post 10X and demuxlet filtering')
    plt.show()

### did the large portion of empties filtered by demuxlet go to the AMB or DBL droplet types?

In [None]:
def _empty_fraction(barcodes, cell_barcodes):
    """Fraction (3 decimals) of `barcodes` that are absent from `cell_barcodes`.

    Returns NaN when `barcodes` is empty, e.g. a pool in which no droplet
    received a given type, instead of raising ZeroDivisionError.
    """
    if not barcodes:
        return float('nan')
    return round(len(barcodes - cell_barcodes) / len(barcodes), 3)


sng_fracs = {}
amb_fracs = {}
dbl_fracs = {}
# droplet type label -> the dict that collects its per-pool fraction
fracs_by_type = {'SNG': sng_fracs, 'AMB': amb_fracs, 'DBL': dbl_fracs}
for pool in cellbender_barcodes:
    print(pool)
    # barcodes Cellbender reported as cells (column 0 of the headerless csv)
    cb_cells = set(cellbender_barcodes[pool][0].values)
    demux_data = demux_barcodes[pool]
    for droplet_type, fracs in fracs_by_type.items():
        typed_barcodes = set(
            demux_data.loc[demux_data['DROPLET.TYPE'] == droplet_type].BARCODE.values)
        fracs[pool] = _empty_fraction(typed_barcodes, cb_cells)
    if DEBUG:
        print(f'singlet empties retained fraction = {sng_fracs[pool]}')
        print(f'ambiguous empties retained fraction = {amb_fracs[pool]}')
        print(f'doublet empties retained fraction = {dbl_fracs[pool]}')

#### visualize where they went

In [None]:
with rc_context({'figure.figsize': (9, 9)}):
    # one long-format table: a row per (pool, droplet type) with its empty fraction
    fracs_by_type = {'SNG': sng_fracs, 'AMB': amb_fracs, 'DBL': dbl_fracs}
    df = concat([
        DataFrame.from_dict(fracs, orient='index', columns=['frac']).assign(type=label)
        for label, fracs in fracs_by_type.items()
    ])
    df['percent'] = df['frac'] * 100
    # pools along x, one bar per droplet type
    ax = barplot(x=df.index, y='percent', hue='type', data=df, palette='Purples')
    ax.grid(axis='y')
    plt.xticks(rotation=90)
    plt.tight_layout()
    ax.set_ylabel('percent empty')
    ax.set_title('demuxlet assignment of empty cells')
    plt.show()

### load the fully processed anndata file
determine for the relatively small portion of possible empty cells retained how many are an undetermined cell-type

In [None]:
%%time
# the cell magic above must stay on the first line; it times the whole cell
# read the processed AnnData object from `anndata_file` and print its summary
adata = read_h5ad(anndata_file)
print(adata)
if DEBUG:
    # first rows of the per-cell annotation table
    display(adata.obs.head())

### get the uncertain cells

In [None]:
# annotation labels that flag a cell as uncertain. Iterate the unique values
# directly (works for categorical and plain object columns alike) and skip
# non-string entries such as NaN, on which the `in` test would raise TypeError.
found_uncertain = [label for label in adata.obs['new_anno'].unique()
                   if isinstance(label, str) and 'uncertain' in label]
print(found_uncertain)
# keep only the cells that carry one of those labels
adata_uncertain = adata[adata.obs['new_anno'].isin(found_uncertain), :]
print(adata_uncertain)

#### how many of the retained Cellbender possible empties are also uncertain cell-type

In [None]:
# per pool: fraction of the retained empties (found in `adata`) whose
# annotation is one of the uncertain labels; 0.0 when there is nothing to count
pool_fracs_uncertain = {}
for pool in cellbender_barcodes:
    this_percent = 0.0
    these_empties = demux_empties[pool]
    if these_empties:
        # NOTE(review): this matches barcodes against every cell in `adata`,
        # not only this pool's cells; if the obs index is not pool-specific,
        # identical barcodes from other pools would be counted too — confirm.
        temp = adata[adata.obs.index.isin(these_empties)]
        n_found = temp.obs.shape[0]
        # none of the empties may be present in the anndata object; avoid 0/0
        if n_found:
            n_uncertain = int(temp.obs.new_anno.isin(found_uncertain).sum())
            this_percent = round(n_uncertain / n_found, 3)
        if DEBUG:
            display(temp.obs.new_anno.value_counts())
    pool_fracs_uncertain[pool] = this_percent
    print(pool, this_percent)

In [None]:
# mean percent of retained empties that are uncertain, averaged only over the
# pools that actually have retained empties (pools without any were stored as
# 0.0 and must not dilute the mean); the count is derived from the data rather
# than hard-coded, and NaN is shown when no pool has retained empties
pools_with_empties = [pool for pool in pool_fracs_uncertain if demux_empties[pool]]
(round(sum(pool_fracs_uncertain[pool] for pool in pools_with_empties)
       / len(pools_with_empties) * 100, 3)
 if pools_with_empties else float('nan'))

### visualize how many of the empties were assigned to an uncertain cell-type

In [None]:
with rc_context({'figure.figsize': (9, 9)}):
    # per-pool fraction -> percent, ordered from lowest to highest
    df = (
        DataFrame.from_dict(pool_fracs_uncertain, orient='index', columns=['frac'])
        .assign(percent=lambda frame: frame['frac'] * 100)
        .sort_values('percent')
    )
    ax = barplot(x=df.index, y='percent', data=df, palette='Purples')
    ax.grid(axis='y')
    plt.xticks(rotation=90)
    plt.tight_layout()
    ax.set_ylabel('percent empty')
    ax.set_title('Percentage of empties that are uncertain cell-type')
    plt.show()

In [None]:
# per pool: number of retained empties annotated with an uncertain label,
# plus a running total of all retained empties across pools
pool_counts_uncertain = {}
total_count = 0
for pool in cellbender_barcodes:
    pool_empties = demux_empties[pool]
    total_count += len(pool_empties)
    n_uncertain = 0
    if pool_empties:
        # annotation rows of the cells whose barcode is among this pool's empties
        empties_obs = adata.obs.loc[adata.obs.index.isin(pool_empties)]
        n_uncertain = int(empties_obs.new_anno.isin(found_uncertain).sum())
    pool_counts_uncertain[pool] = n_uncertain
    print(pool, n_uncertain)
print(f'{total_count} cells are ambient RNA cells')

In [None]:
with rc_context({'figure.figsize': (9, 9)}):
    # per-pool counts, ordered from fewest to most
    df = (
        DataFrame.from_dict(pool_counts_uncertain, orient='index', columns=['count'])
        .sort_values('count')
    )
    ax = barplot(x=df.index, y='count', data=df, palette='Purples')
    ax.grid(axis='y')
    plt.xticks(rotation=90)
    plt.tight_layout()
    ax.set_ylabel('empty count')
    ax.set_title('Numbers of empties that are uncertain cell-type')
    plt.show()

### how many uncertain cells remain if all Cellbender empties are removed

In [None]:
# pool every per-pool set of retained empties into a single set
all_demux_empties = set().union(*(demux_empties[pool] for pool in cellbender_barcodes))

# cells annotated as uncertain, and the subset whose barcode is a retained empty
uncertain_obs = adata.obs.loc[adata.obs.new_anno.isin(found_uncertain)]
empty_uncertain_obs = uncertain_obs.loc[uncertain_obs.index.isin(all_demux_empties)]
empty_uncerain_obs = empty_uncertain_obs  # original (misspelled) name kept as an alias

# scale to percent BEFORE rounding: rounding first and multiplying by 100
# afterwards prints float artifacts such as 7.000000000000001; also guard the
# case of no uncertain cells, where the ratio is undefined
if uncertain_obs.shape[0]:
    empty_percent = round(empty_uncertain_obs.shape[0] / uncertain_obs.shape[0] * 100, 1)
else:
    empty_percent = float('nan')
print(f'{empty_percent} % of uncertain cells are possibly empty')

In [None]:
# shell out to `date` again so the output shows when the notebook finished
!date