## Notebook to run cellranger-atac aggr over the per-sample ATAC cellranger-arc count data

- [Aggregating Multiple GEM Wells with cellranger-atac aggr](https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/using/aggr)
- NISC ran cellranger-atac v2.0.0

In [1]:
# shell out to `date` to print the current date/time at the top of the notebook
!date

Fri Jul 28 17:22:30 EDT 2023


#### import libraries

In [2]:
from pandas import DataFrame
from os.path import exists
from multiprocessing import cpu_count

#### set notebook variables

In [3]:
# naming
proj_name = 'aging_phase2'

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
public_dir = f'{wrk_dir}/public'
src_dir = f'{wrk_dir}/src_data/atac'

# in files
reference_path = f'{public_dir}/refdata-cellranger-arc-GRCh38-2020-A-2.0.0'
# NOTE: the variable name is spelled 'celranger' (single 'l'); it is kept as-is
# because the command-formatting cell refers to it by this exact name
celranger_atac_path = '~/cellranger-atac-2.1.0/cellranger-atac'
# optional consensus peak set; the command cell only adds --peaks if this file exists
peaks_bed = f'{wrk_dir}/src_data/{proj_name}_consensus_atac_peaks.bed'

# out files
# sample sheet (library_id, fragments, cells) handed to `aggr --csv`
aggr_setup_file = f'{src_dir}/{proj_name}.cellranger_atac_aggr.csv'

# variables
# NOTE(review): `pools` is not referenced by any visible cell (the pool numbers
# are listed inline where the sample sheet is built) - confirm whether still needed
pools = []
DEBUG = True

# use half of the available cores, but never fewer than one:
# int(cpu_count()/2) evaluates to 0 on a single-core host, which would be
# passed straight through as --localcores=0
max_cpu = max(1, cpu_count() // 2)
max_mem_gb = 512

### create and save the aggr input sample file

In [4]:
# Walk every pool/well combination and keep the ones whose per-sample
# `outs` directory is present; report the ones that are missing.
library_ids, fragments, cells = [], [], []
for pool in (1, 2, 3, 6, 7):
    for well in range(1, 9):
        pool_id = f'ATAC_P{pool}_{well}'
        sample_dir = f'{src_dir}/sample_ec_{pool_id}/outs'
        if not exists(sample_dir):
            print(f'did not find ATAC out for {pool_id}')
            continue
        # one entry per library: its id, fragments file and per-barcode csv
        library_ids.append(pool_id)
        fragments.append(f'{sample_dir}/fragments.tsv.gz')
        cells.append(f'{sample_dir}/singlecell.csv')

did not find ATAC out for ATAC_P1_7
did not find ATAC out for ATAC_P1_8
did not find ATAC out for ATAC_P2_7
did not find ATAC out for ATAC_P2_8
did not find ATAC out for ATAC_P7_5
did not find ATAC out for ATAC_P7_6
did not find ATAC out for ATAC_P7_7
did not find ATAC out for ATAC_P7_8


In [5]:
# assemble the aggr sample sheet: one row per library that was found
aggr_columns = {
    'library_id': library_ids,
    'fragments': fragments,
    'cells': cells,
}
template_df = DataFrame(data=aggr_columns)

print(template_df.shape)
if DEBUG:
    display(template_df.head())

# save the sample sheet (no index column) to the aggr setup file
template_df.to_csv(aggr_setup_file, index=False)

(32, 3)


Unnamed: 0,library_id,fragments,cells
0,ATAC_P1_1,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
1,ATAC_P1_2,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
2,ATAC_P1_3,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
3,ATAC_P1_4,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...
4,ATAC_P1_5,/labshare/raph/datasets/adrd_neuro/brain_aging...,/labshare/raph/datasets/adrd_neuro/brain_aging...


### format the command to run the cellranger-atac aggr job

In [6]:
# The two variants of the run differ only in the run id and in whether a
# consensus peak set is supplied, so decide those first and build the rest once.
if exists(peaks_bed):
    # consensus peaks are available: aggregate against them under a distinct id
    run_id = f'{proj_name}_consensus_aggr'
    peaks_args = [f'--peaks={peaks_bed}']
else:
    # no consensus peak file: omit --peaks
    run_id = f'{proj_name}_atac_aggr'
    peaks_args = []

aggr_args = [
    celranger_atac_path, 'aggr',
    f'--id={run_id}',
    f'--csv={aggr_setup_file}',
    '--reference', reference_path,
    *peaks_args,
    '--normalize=none',
    '--nosecondary',
    f'--description={proj_name}',
    f'--localcores={max_cpu}',
    f'--localmem={max_mem_gb}',
]

# '&&' instead of ';' so the aggr job is never launched from the wrong
# directory when the cd fails (the run is created relative to the cwd)
this_cmd = f'cd {src_dir} && ' + ' '.join(aggr_args)

print(this_cmd)

cd /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/atac; ~/cellranger-atac-2.1.0/cellranger-atac aggr --id=aging_phase2_consensus_aggr --csv=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/atac/aging_phase2.cellranger_atac_aggr.csv --reference /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 --peaks=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/src_data/aging_phase2_consensus_atac_peaks.bed --normalize=none --nosecondary --description=aging_phase2 --localcores=32 --localmem=512


In [7]:
# shell out to `date` to print the current date/time at the end of the notebook
!date

Fri Jul 28 17:22:31 EDT 2023
