## Notebook to run cellranger-arc aggr over the per-sample cellranger-arc count data

- [Aggregating Multiple GEM Wells with cellranger-arc aggr](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/aggr)
- NISC ran cellranger-arc v2

In [1]:
# print the current date and time via the shell `date` command
!date

Fri Jul 28 17:18:46 EDT 2023


#### import libraries

In [2]:
from pandas import DataFrame
from os.path import exists
from multiprocessing import cpu_count

#### set notebook variables

In [3]:
# naming
proj_name = 'aging_phase2'

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
public_dir = f'{wrk_dir}/public'
src_dir = f'{wrk_dir}/src_data/arc'

# in files
reference_path = f'{public_dir}/refdata-cellranger-arc-GRCh38-2020-A-2.0.0'
# NOTE: the spelling of this variable name is kept as-is because the
# command-formatting cell references it
celranger_arc_path = '~/cellranger-arc-2.0.1/cellranger-arc'
# optional input: when this BED file exists it is passed to aggr via --peaks
peaks_bed = f'{wrk_dir}/src_data/{proj_name}_consensus_atac_peaks.bed'

# out files
aggr_setup_file = f'{src_dir}/{proj_name}.cellranger_arc_aggr.csv'

# variables
samples = ['Ag119', 'Ag120', 'Ag121']
# Ag122 was dropped: there was a problem with this sample, it had 20000 cells
# and the tsne was just a single blob for both GEX and ATAC
# DEBUG toggles the preview display of the aggr sample sheet
DEBUG = True

# use half of the available cores, but never fewer than one so that
# --localcores is not 0 on a single-core host
max_cpu = max(1, cpu_count() // 2)
# memory limit in GB, passed to --localmem
max_mem_gb = 512

### create and save the aggr input sample file

In [4]:
# Collect, per sample, the cellranger-arc count outputs that go into the aggr
# sample sheet. A sample is added only when its outs directory and all three
# referenced files exist, so the sheet never points at missing inputs.
library_ids = []
fragments = []
metrics = []
molecules = []
for sample in samples:
    sample_dir = f'{src_dir}/{sample}_arc/outs'
    if not exists(sample_dir):
        print(f'did not find arc out for {sample}')
        continue

    fragments_file = f'{sample_dir}/atac_fragments.tsv.gz'
    metrics_file = f'{sample_dir}/per_barcode_metrics.csv'
    molecules_file = f'{sample_dir}/gex_molecule_info.h5'

    # the outs directory can exist while individual files are absent
    missing_files = [path for path in (fragments_file, metrics_file, molecules_file)
                     if not exists(path)]
    if missing_files:
        print(f'incomplete arc out for {sample}, missing: {", ".join(missing_files)}')
        continue

    library_ids.append(f'{sample}')
    fragments.append(fragments_file)
    metrics.append(metrics_file)
    molecules.append(molecules_file)

In [5]:
# assemble the aggr sample sheet: one row per library, one column per input
aggr_columns = {
    'library_id': library_ids,
    'atac_fragments': fragments,
    'per_barcode_metrics': metrics,
    'gex_molecule_info': molecules,
}
template_df = DataFrame(data=aggr_columns)

# report the table dimensions, and preview the first rows when debugging
print(template_df.shape)
if DEBUG:
    display(template_df.head())

# write the sample sheet, without the index column, to the aggr setup file
template_df.to_csv(aggr_setup_file, index=False)

(3, 4)


Unnamed: 0,library_id,atac_fragments,per_barcode_metrics,gex_molecule_info
0,Ag119,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
1,Ag120,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
2,Ag121,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...


### format the command to run the cellranger-arc aggr job

In [9]:
# Build the cellranger-arc aggr command from a single argument list so the
# shared flags are written once. Only the run id and the optional --peaks
# flag depend on whether the peaks BED file is present.
use_peaks = exists(peaks_bed)
aggr_id = f'{proj_name}_consensus_aggr' if use_peaks else f'{proj_name}_arc_aggr'

aggr_args = [
    f'{celranger_arc_path} aggr',
    f'--id={aggr_id}',
    f'--csv={aggr_setup_file}',
    f'--reference {reference_path}',
]
if use_peaks:
    # pass the peaks BED file to aggr
    aggr_args.append(f'--peaks={peaks_bed}')
aggr_args.extend([
    '--normalize=none',
    '--nosecondary',
    f'--description={proj_name}',
    f'--localcores={max_cpu}',
    f'--localmem={max_mem_gb}',
])

# the command changes into src_dir first; arguments are joined by single
# spaces into one line
# NOTE(review): with `;` the aggr command still runs if the cd fails
this_cmd = f'cd {src_dir}; ' + ' '.join(aggr_args)

print(this_cmd)

cd /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/arc; ~/cellranger-arc-2.0.1/cellranger-arc aggr --id=aging_phase2_consensus_aggr --csv=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/arc/aging_phase2.cellranger_arc_aggr.csv --reference /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 --peaks=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/aging_phase2_consensus_atac_peaks.bed --normalize=none --nosecondary --description=aging_phase2 --localcores=32 --localmem=512


In [None]:
# print the current date and time via the shell `date` command
!date