## Notebook to inspect the output from the Cellbender runs of pooled GEX and non-pooled ARC samples

- see how many of the predicted empty cells (ambient RNA) actually passed Cellranger filtering and are uncertain in cell-type assignment

- Cellbender was run in the cloud using Google Life Sciences batch submission with the Cellbender WDL

In [None]:
# print the current date and time (shell `date`) at the top of the notebook
!date

#### import libraries

In [None]:
from pandas import read_csv
from os.path import exists
from pandas import DataFrame, concat
from seaborn import barplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

#### set notebook variables

In [None]:
# naming
project = 'aging_phase2'

# directories for initial setup
# none of the directories carries a trailing slash; every path built from
# them supplies its own '/' separator
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
demux_dir = f'{wrk_dir}/demux'
cellbend_dir = f'{wrk_dir}/cellbender'
info_dir = f'{wrk_dir}/sample_info'
src_data_dir = f'{wrk_dir}/src_data'

# in files
info_file = f'{info_dir}/{project}.sample_info.csv'

# out files

# constants
# DEBUG: when True the cells below display extra tables and per-sample counts
DEBUG = False
# lane numbers 1 through 8, used to build the per-lane pooled sample names
lane_range = range(1, 9)

### load data

### load the sample info data

In [None]:
# read the sample info table and report its dimensions
info_df = read_csv(info_file)
print(f'shape of info {info_df.shape}')
if DEBUG:
    # first rows, pool membership counts, and the sample ids without a GEX pool
    no_pool_rows = info_df.loc[info_df['gex_pool'].isna()]
    display(info_df.head())
    display(info_df['gex_pool'].value_counts())
    display(no_pool_rows['sample_id'].unique())

##### setup the pool names for file finding

In [None]:
# pooled GEX samples: one expected Cellbender result name per pool and lane
pooled_ids = info_df.gex_pool.dropna().unique()
cellbender_names = [f'sample_ec_GEX_P{int(pool)}_{lane}'
                    for pool in pooled_ids
                    for lane in lane_range]

# samples without a GEX pool: one expected '<sample>_arc' name per distinct
# sample id; unique() keeps a sample that occupies several info rows from
# being listed (and later processed) more than once
unpooled_info_df = info_df.loc[info_df.gex_pool.isna()]
cellbender_names.extend(sample_id.replace('Aging', 'Ag') + '_arc'
                        for sample_id in unpooled_info_df.sample_id.unique())

print(f'{len(cellbender_names)} named Cellbender results expected')
if DEBUG:
    display(cellbender_names)

#### load Cellbender results

In [None]:
# load the per-sample Cellbender cell barcode lists, keyed by sample name;
# the files have no header row, so the barcodes end up in column 0
cellbender_barcodes = {}
missing_results = []
for sample in cellbender_names:
    this_file = f'{cellbend_dir}/{sample}_out_cell_barcodes.csv'
    if not exists(this_file):
        # remember the gap instead of dropping the sample silently
        missing_results.append(sample)
        continue
    cb_barcodes = read_csv(this_file, header=None)
    print(sample, cb_barcodes.shape[0])
    cellbender_barcodes[sample] = cb_barcodes

# make it visible how many of the expected results were actually found
print(f'{len(cellbender_barcodes)} of {len(cellbender_names)} expected '
      f'Cellbender results loaded')
if missing_results:
    print(f'missing Cellbender results: {missing_results}')

#### load the demux results to get 10X and demux filtered results
here we can load the demux `.best` files directly, since demux is run on the 10X filtered barcodes

In [None]:
# load the demuxlet .best table for every sample that has Cellbender results;
# samples without a .best file are simply left out of the dictionary
demux_barcodes = {}
for cb_name in cellbender_barcodes:
    # the demux files are named without the 'sample_ec_' prefix
    new_name = cb_name.replace('sample_ec_', '')
    best_path = f'{demux_dir}/{new_name}.best'
    if not exists(best_path):
        continue
    best_table = read_csv(best_path, sep='\t')
    print(new_name, best_table.shape)
    demux_barcodes[new_name] = best_table

### determine number of possible empty cells not filtered by 10X
and those that didn't drop out during genotype demultiplexing

In [None]:
# per sample: barcodes present in the demuxlet table (all rows, and SNG rows
# only) that are absent from the Cellbender cell barcode list, plus the
# fraction of each set that they represent
tenx_empties = {}
tenx_fracs = {}
demux_empties = {}
demux_fracs = {}
for sample in demux_barcodes:
    print(sample)
    # demux names were derived by stripping 'sample_ec_'; fall back to the
    # bare name for Cellbender keys that never carried that prefix
    cb_table = cellbender_barcodes.get(f'sample_ec_{sample}')
    if cb_table is None:
        cb_table = cellbender_barcodes[sample]
    cb_cells = set(cb_table[0].values)
    demux_data = demux_barcodes[sample]
    demux_assigned = demux_data.loc[demux_data['DROPLET.TYPE'] == 'SNG']
    tenx_cells = set(demux_data.BARCODE.values)
    demux_cells = set(demux_assigned.BARCODE.values)
    tenx_empties[sample] = tenx_cells - cb_cells
    demux_empties[sample] = demux_cells - cb_cells
    # an empty denominator yields NaN instead of a ZeroDivisionError, so one
    # sample without barcodes or without singlets cannot abort the whole loop
    tenx_fracs[sample] = (round(len(tenx_empties[sample])/len(tenx_cells), 3)
                          if tenx_cells else float('nan'))
    demux_fracs[sample] = (round(len(demux_empties[sample])/len(demux_cells), 3)
                           if demux_cells else float('nan'))
    if DEBUG:
        print(f'10x empties retained = {len(tenx_empties[sample])}')
        print(f'10x empties retained fraction = {tenx_fracs[sample]}')
        print(f'demuxlet empties retained = {len(demux_empties[sample])}')
        print(f'demuxlet empties retained fraction = {demux_fracs[sample]}')

#### add in the non-pooled ARC samples

In [None]:
# add the 10X-filtered vs Cellbender comparison for samples that have an ARC
# output directory; results go into the existing tenx_empties / tenx_fracs
for sample in cellbender_names:
    arc_name = sample.replace('Aging', 'Ag')
    arc_dir = f'{src_data_dir}/arc/{arc_name}'
    if not exists(arc_dir):
        continue
    if sample not in cellbender_barcodes:
        # the names iterated here are the *expected* results; without this
        # guard a missing Cellbender file ends in "'NoneType' is not subscriptable"
        print(f'{sample} has ARC output but no Cellbender barcodes, skipping')
        continue
    arc_barcodes = read_csv((f'{arc_dir}/outs/'
                             f'filtered_feature_bc_matrix/barcodes.tsv.gz'),
                            header=None)
    arc_barcodes.columns = ['barcode']
    print(sample, arc_barcodes.shape)
    cb_cells = set(cellbender_barcodes[sample][0])
    tenx_cells = set(arc_barcodes.barcode)
    tenx_empties[sample] = tenx_cells - cb_cells
    # NaN rather than ZeroDivisionError if the filtered barcode list is empty
    tenx_fracs[sample] = (round(len(tenx_empties[sample])/len(tenx_cells), 3)
                          if tenx_cells else float('nan'))
    if DEBUG:
        print(f'10x empties retained = {len(tenx_empties[sample])}')
        print(f'10x empties retained fraction = {tenx_fracs[sample]}')

### visualize the retained empties

In [None]:
# long-format table: one row per sample and filtering stage
tenx_frac_df = DataFrame.from_dict(tenx_fracs, orient='index', columns=['frac'])
tenx_frac_df['type'] = '10X filtered'
demux_frac_df = DataFrame.from_dict(demux_fracs, orient='index', columns=['frac'])
demux_frac_df['type'] = 'demuxlet SNG'
retained_df = concat([tenx_frac_df, demux_frac_df])
retained_df['percent'] = retained_df.frac * 100
# mean retained percentage per filtering stage
display(retained_df.groupby('type').percent.mean())

with rc_context({'figure.figsize': (9, 9)}):
    barplot(x=retained_df.index, y='percent', hue='type', data=retained_df,
            palette='Purples')
    plt.grid(axis='y')
    plt.xticks(rotation=90)
    plt.ylabel('percent empty')
    plt.title('Retained ambient RNA post 10X and demuxlet filtering')
    # tight_layout only fits what is already on the figure, so it runs after
    # the rotated tick labels, y-label and title have been set
    plt.tight_layout()
    plt.show()

### did the large portion of empties filtered by demuxlet go to the AMB or DBL droplet types?

In [None]:
# per sample and demuxlet droplet type: the fraction of that type's barcodes
# that are absent from the Cellbender cell barcode list
sng_fracs = {}
amb_fracs = {}
dbl_fracs = {}
fracs_by_type = {'SNG': sng_fracs, 'AMB': amb_fracs, 'DBL': dbl_fracs}
for sample in demux_barcodes:
    print(sample)
    # demux names were derived by stripping 'sample_ec_'; fall back to the
    # bare name for Cellbender keys that never carried that prefix
    cb_table = cellbender_barcodes.get(f'sample_ec_{sample}')
    if cb_table is None:
        cb_table = cellbender_barcodes[sample]
    cb_cells = set(cb_table[0].values)
    demux_data = demux_barcodes[sample]
    for droplet_type, type_fracs in fracs_by_type.items():
        is_type = demux_data['DROPLET.TYPE'] == droplet_type
        type_cells = set(demux_data.loc[is_type].BARCODE.values)
        # a sample with no droplets of this type gets NaN instead of raising
        # ZeroDivisionError and aborting the loop
        type_fracs[sample] = (round(len(type_cells - cb_cells)/len(type_cells), 3)
                              if type_cells else float('nan'))
    if DEBUG:
        print(f'singlet empties retained fraction = {sng_fracs[sample]}')
        print(f'ambiguous empties retained fraction = {amb_fracs[sample]}')
        print(f'doublet empties retained fraction = {dbl_fracs[sample]}')

#### visualize where they went

In [None]:
# long-format table: one row per sample and demuxlet droplet type
sng_frac_df = DataFrame.from_dict(sng_fracs, orient='index', columns=['frac'])
sng_frac_df['type'] = 'SNG'
amb_frac_df = DataFrame.from_dict(amb_fracs, orient='index', columns=['frac'])
amb_frac_df['type'] = 'AMB'
dbl_frac_df = DataFrame.from_dict(dbl_fracs, orient='index', columns=['frac'])
dbl_frac_df['type'] = 'DBL'
assigned_df = concat([sng_frac_df, amb_frac_df, dbl_frac_df])
assigned_df['percent'] = assigned_df.frac * 100
# mean percentage per droplet type
display(assigned_df.groupby('type').percent.mean())

with rc_context({'figure.figsize': (9, 9)}):
    barplot(x=assigned_df.index, y='percent', hue='type', data=assigned_df,
            palette='Purples')
    plt.grid(axis='y')
    plt.xticks(rotation=90)
    plt.ylabel('percent empty')
    plt.title('demuxlet assignment of empty cells')
    # tight_layout only fits what is already on the figure, so it runs after
    # the rotated tick labels, y-label and title have been set
    plt.tight_layout()
    plt.show()

In [None]:
# print the current date and time (shell `date`) at the end of the notebook
!date