# Assembling

This notebook is used to assemble a processable h5ad object for subsequent notebooks.
<hr style="border:2px solid black">

In [None]:
from sctoolbox.utils import bgcolor

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# =========================== DEFINING STRATEGY ===========================
# Flags selecting how the input data is provided (options 1-4).
from_h5ad = True                # option 1
assembling_10_velocity = False  # option 2
assembling_10_public = False    # option 3
convert_seurat = False          # option 4

# ============================== INPUT DATA ===============================

# Option 1: path to an existing .h5ad file
h5ad_file = 'test_data/scatac_pbmc.h5ad'
# (list:str) columns where the peak location data is stored, e.g. ['chr', 'start', 'end']
coordinate_cols = ['chr', 'start', 'end']
# (boolean) whether the adata.var index should be formatted so that it matches chr:start-stop
set_index = True
# (str) column the .var index should be generated from; None if no column is used (SnapATAC: name)
index_from = None

# Manually set existing QC columns
existing_qc_columns = []

# ============================ CALC QC COLUMNS ============================
# Number of available threads
n_threads = 10

# Steps 1 & 2: tag in which the barcode to use is found (only for .bam input)
barcode_tag = 'CB'

# If step 1, step 2 or both are executed, specify the file holding the fragment
# information: either a BAM file or a fragments BED file (recommended).
fragments_file = 'test_data/scatac_pbmc_fragments.tsv'

# Step 1: metrics related to fragment length
# Set to True if the mean insert size should be calculated.
calc_fragment_length_metrics = True

# Step 2: promoter enrichment
# Set to True if the promoter enrichment should be calculated.
calc_promotor_enrichment = False
# Set promoters_gtf if promoters other than the fixed ones should be used;
# if promoters_gtf is None, please specify the species.
promoters_gtf = None
species = "human"

<hr style="border:2px solid black">

----------------

## Import modules

In [None]:
# sctoolbox and peakqc modules
# NOTE(review): check_barcode_tag, used further below, is not imported by name —
# presumably it is provided by one of the star imports; confirm before replacing them.
import sctoolbox.calc_overlap_pct as overlap
import peakqc.fld_scoring as fld
from sctoolbox.qc_filter import *
from sctoolbox.atac_utils import *
import sctoolbox.utils as utils

# Apply the settings found under key "01" in config.yaml
# (presumably the section belonging to this notebook — TODO confirm).
utils.settings_from_config("config.yaml", key="01")

<hr style="border:2px solid black">

## 1. Read in data

### Option 1: Read from .h5ad

In [None]:
if from_h5ad:
    # Option 1: read the existing .h5ad file given in the input cell.
    adata = utils.load_h5ad(h5ad_file)

    # Hand the loaded object to the ATAC preparation helper together with the
    # .var coordinate/index settings from the input cell.
    adata = utils.prepare_atac_anndata(
        adata,
        coordinate_cols=coordinate_cols,
        set_index=set_index,
        index_from=index_from,
    )

### Inspect adata

In [None]:
# Show the adata object loaded above.
display(adata)

In [None]:
# Preview the first rows of adata.var.
adata.var.head()

<hr style="border:2px solid black">

------------

## 2. Calculate QC columns related to fragments

### Check barcode tag

In [None]:
# Decide by file extension whether the fragment information comes from a BAM file.
# The check is case-insensitive and requires the ".bam" extension, so that a
# fragments file whose name merely ends in the letters "bam" is not misclassified.
use_bam = fragments_file.lower().endswith(".bam")
if use_bam:
    # The barcode tag setting only applies to .bam input (see input cell),
    # so the tag check is skipped for fragments files.
    check_barcode_tag(adata, fragments_file, barcode_tag)

### 2.1 Calculate mean insert size and fragment count if missing

In [None]:
# Step 1: add the fragment length metrics to adata (only if enabled in the input cell).
if calc_fragment_length_metrics:

    fld.add_fld_metrics(adata=adata,
                        fragments=fragments_file,
                        barcode_col=None,
                        # use the tag configured in the input cell instead of a hard-coded 'CB'
                        barcode_tag=barcode_tag,
                        chunk_size_bam=1000000,
                        regions=None,
                        peaks_thr=10,
                        wavelength=150,
                        sigma=0.4,
                        plot=False,
                        save_density=None,
                        save_overview=None,
                        sample=0)

In [None]:
# Inspect adata.obs after the fragment length step.
adata.obs

### 2.2 Promoter enrichment

In [None]:
# Step 2: compute the fragment overlap with promoter regions (only if enabled in the input cell).
if calc_promotor_enrichment:

    # NOTE(review): promoters_gtf is passed on as-is and `species` is not used here;
    # confirm how the overlap function behaves when promoters_gtf is None.

    # The same input variable holds either a BAM or a fragments file; hand it to
    # the matching keyword instead of duplicating the whole call per input type.
    input_kwarg = "bam_file" if use_bam else "fragments_file"
    overlap.pct_fragments_overlap(adata,
                                  regions_file=promoters_gtf,
                                  cb_col=None,
                                  cb_tag=barcode_tag,  # configured tag instead of a hard-coded 'CB'
                                  regions_name='promoters',
                                  nproc=n_threads,  # thread count from the input cell
                                  sort_bam=False,
                                  sort_regions=False,
                                  keep_fragments=False,
                                  **{input_kwarg: fragments_file})

    if calc_fragment_length_metrics:
        # Drop the total fragment count column when step 1 was run
        # (presumably it already provides a fragment count — TODO confirm).
        adata.obs.pop('n_total_fragments')
    else:
        # Keep the total fragment count under the name 'genome_counts'.
        adata.obs.rename(columns={'n_total_fragments': 'genome_counts'}, inplace=True)
        # The previous code wrote to `qc_columns`, which is never defined in this
        # notebook and raised a NameError. Record the column in the QC column list
        # from the input cell instead.
        # NOTE(review): confirm this list is the intended place to register it.
        if 'genome_counts' not in existing_qc_columns:
            existing_qc_columns.append('genome_counts')
        

-------------

## Save adata to .h5ad

In [None]:
# Overview of the adata object before it is saved
display(adata)

In [None]:
# Save the data
# File name handed to utils.save_h5ad for the output .h5ad file.
adata_output = "anndata_1.h5ad"
utils.save_h5ad(adata, adata_output)