# Assembling

This notebook is used to assemble a processable h5ad object for subsequent notebooks.

## Fill in input data, output and settings

In [None]:
# =========================== STRATEGY ================================
# Switches for the assembly strategy (options 1-4).
from_h5ad = True                # option 1
assembling_10_velocity = False  # option 2
assembling_10_public = False    # option 3
convert_seurat = False          # option 4

# ========================== INPUT DATA ===============================

# Option 1: path(s) to existing .h5ad file(s)
h5ad_files = ["test_data/scatac_pbmc.h5ad"]
# (str) if multiple h5ad files are merged, this identifies the file an entry originates from
merge_column = "Sample_new"
# (list of str) columns in which the peak location is stored, e.g. ['chr', 'start', 'end']
coordinate_cols = None
# (bool) whether the adata.var index should be formatted to match chr:start-stop
set_index = True
# (str) column the index should be generated from; None otherwise (SnapATAC: name)
index_from = None

# QC columns that already exist (set manually)
existing_qc_columns = []

# ======================= QC COLUMN CALCULATION =======================
# Number of available threads
n_threads = 10

# --- Steps 2 & 3: shared input settings ---
# False if the fragments file should be used instead of a BAM file
use_bam = False
# Tag in which the barcode is stored (only relevant for .bam)
barcode_tag = "CB"

# If step 2, step 3 or both are executed, provide either a BAM file or a
# fragments file (prefer the fragments file whenever it is available).
bam_file = None  # BAM file of the corresponding alignment
fragments_file = "test_data/scatac_pbmc_fragments.tsv"

# --- Step 2: metrics related to fragment length ---
# True if the mean insert size should be calculated
calc_fragment_length_metrics = True

# --- Step 3: promoter enrichment ---
# True if the promoter enrichment should be calculated
calc_promotor_enrichment = False
# Set promoters_gtf to use promoters other than the fixed ones;
# if promoters_gtf is None, specify the species instead.
promoters_gtf = None
species = "human"

----------------

## Import modules

In [None]:
# sctoolbox modules 
import sctoolbox.calc_overlap_pct as overlap
import sctoolbox.tools.fld_scoring as fld
from sctoolbox.qc_filter import *
from sctoolbox.atac_utils import *
import sctoolbox.utils as utils

# Configure the run from config.yaml, using the entry stored under key "01".
utils.settings_from_config(
    "config.yaml",
    key="01",
)

## 1. Read in data

### Option 1: Read from .h5ad

In [None]:
# Option 1: build adata from the .h5ad file(s) listed in the settings cell.
if from_h5ad:
    adata = utils.assemble_from_h5ad(
        h5ad_files,
        merge_column=merge_column,        # identifier of the originating file
        coordinate_cols=coordinate_cols,  # columns holding the peak location
        set_index=set_index,              # format the adata.var index
        index_from=index_from,            # optional source column for the index
    )

## Inspect adata

In [None]:
# Show a summary of the assembled adata object.
display(adata)

In [None]:
# Preview the first rows of adata.var to check the index and columns.
adata.var.head()

------------

## Calculate QC columns related to fragments

### Check barcode tag

In [None]:
# Only relevant for BAM input: run the barcode tag check with the configured
# bam_file and barcode_tag.
# NOTE(review): check_barcode_tag is presumably provided by one of the star
# imports (sctoolbox.atac_utils / sctoolbox.qc_filter) - confirm.
if use_bam:
    check_barcode_tag(adata, bam_file, barcode_tag)

### 2. Calculate mean insert size and fragment count if missing

In [None]:
if calc_fragment_length_metrics:

    # Add the fragment-length metrics to adata.
    # - The input source follows `use_bam`, so only the selected file
    #   (BAM or fragments) is handed to the function.
    # - The barcode tag and the thread count are taken from the settings
    #   cell instead of being hard-coded here.
    adata = fld.add_fld_metrics(adata,
                                bam=bam_file if use_bam else None,
                                fragments=None if use_bam else fragments_file,
                                barcode_col=None,
                                barcode_tag=barcode_tag,
                                regions=None,
                                peaks_thr_conv=1,
                                wavelength=150,
                                sigma=0.4,
                                plot=True,
                                save_plots=False,
                                plot_sample=0,
                                n_threads=n_threads)

In [None]:
# Display adata.obs to inspect its columns after the fragment-length step.
adata.obs

### 3. Promoter enrichment

In [None]:
if calc_promotor_enrichment:

    # Pick the input source according to `use_bam`; only the selected file
    # is passed on.
    if use_bam:
        overlap_input = {'bam_file': bam_file}
    else:
        overlap_input = {'fragments_file': fragments_file}

    # NOTE(review): `species` from the settings cell is not passed anywhere,
    # so with promoters_gtf = None the call receives regions_file=None -
    # confirm how the built-in promoters are supposed to be selected.
    # The barcode tag and thread count come from the settings cell.
    overlap.pct_fragments_overlap(adata,
                                  regions_file=promoters_gtf,
                                  cb_col=None,
                                  cb_tag=barcode_tag,
                                  regions_name='promoters',
                                  nproc=n_threads,
                                  sort_bam=False,
                                  sort_regions=False,
                                  keep_fragments=False,
                                  **overlap_input)

    if calc_fragment_length_metrics:
        # Remove the 'n_total_fragments' column again (presumably because the
        # fragment-length step already provides a fragment count - confirm).
        adata.obs.pop('n_total_fragments')
    else:
        # Keep the total fragment count under the name 'genome_counts'.
        adata.obs.rename(columns={'n_total_fragments': 'genome_counts'}, inplace=True)
        # `qc_columns` is not created anywhere earlier in this notebook;
        # create it on demand so this branch does not fail with a NameError.
        if 'qc_columns' not in globals():
            qc_columns = {}
        qc_columns['genome_counts'] = 'genome_counts'
        

-------------

## Save adata to .h5ad

In [None]:
# Overview of the adata object before it is written to disk
display(adata)

In [None]:
# Write the assembled adata object to the output .h5ad file.
adata_output = 'anndata_1.h5ad'  # name of the output file
utils.save_h5ad(adata, adata_output)