# Assembling

This notebook is used to assemble a processable h5ad object for subsequent notebooks.

## Fill in input data, output and settings

In [13]:
####################### TEST NAME ###################################
# Name of the sample/experiment; used as the run name of the path handling object
# and recorded in the infoprocess when saving.
test = 'Aorta'

################### DEFINING STRATEGY ###############################
# Exactly one of the following options must be True.
from_h5ad = True  # option 1
assembling_10_velocity = False  # option 2
assembling_10_public = False  # option 3
convert_seurat = False  # option 4

# Fail early on an ambiguous or empty selection instead of hitting an undefined
# `adata` further down the notebook.
_strategy_flags = {
    'from_h5ad': from_h5ad,
    'assembling_10_velocity': assembling_10_velocity,
    'assembling_10_public': assembling_10_public,
    'convert_seurat': convert_seurat,
}
_selected_strategies = [name for name, enabled in _strategy_flags.items() if enabled]
if len(_selected_strategies) != 1:
    raise ValueError(
        "Exactly one assembling strategy must be set to True, got: "
        + (", ".join(_selected_strategies) or "none")
    )

###################### INPUT DATA ###################################
# Root directory of all input files; change this once instead of editing every path.
data_dir = '/home/rstudio/data'

# For option 1: the path to an existing .h5ad file
h5ad_path = f'{data_dir}/anndata/cropped_146.h5ad'

#################### Calc QC Columns ###############################
# Number of threads for the QC column calculation.
n_threads = 8

# Fragments file used for the mean fragment length calculation.
fragments_file = f'{data_dir}/bamfiles/fragments_cropped_146.bed'
# BAM file used for the fragments-in-promoters calculation.
bam_file = f'{data_dir}/bamfiles/sorted_cropped_146.bam'
# GTF file with the promoter regions.
# Alternative location of the same file name on the shared storage:
# '/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.promoters2000.gtf'
promoters_gtf = f'{data_dir}/homo_sapiens.104.promoters2000.gtf'

##################### OUTPUT DATA ###################################
# Processing/output directory handed to the path handling object.
output_dir = '/home/rstudio/processed_data'

## Import modules

In [14]:
# sctoolbox modules 
import sctoolbox.atac_tree as sub_tree
import sctoolbox.creators as cr
import sctoolbox.fragment_length as fragments
import sctoolbox.calc_overlap_pct as overlap
# import episcanpy
import episcanpy as epi

## Set up path handling object

In [15]:
# Create the path handling object; its `assembled_anndata` attribute is read
# later in this notebook to obtain the output file location.
tree = sub_tree.ATAC_tree()
# Set the processing/output directory from the settings cell.
tree.processing_dir = output_dir
# Set the run to the sample/experiment name from the settings cell.
tree.run = test



## Read in data

### Option 1: Read from .h5ad

In [16]:
if from_h5ad:
    # Option 1: load the existing AnnData object from the configured .h5ad file.
    adata = epi.read_h5ad(h5ad_path)

    # Record the input file and the chosen strategy in the infoprocess,
    # in the same order as before.
    for info_key, info_value in (
        ("Input_for_assembling", h5ad_path),
        ("Strategy", "Read from h5ad"),
    ):
        cr.build_infor(adata, info_key, info_value)



## Inspect adata

In [17]:
# Show a summary of the loaded AnnData object (dimensions, obs/var/uns keys).
display(adata)

AnnData object with n_obs × n_vars = 373 × 3830
    obs: 'barcode', 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample'
    var: 'name'
    uns: 'infoprocess', 'color_set'

In [18]:
# Show the feature (var) annotation table.
display(adata.var)

Unnamed: 0,name
0,b'chr1':9993-10421
1,b'chr1':29089-29368
2,b'chr1':180548-180799
3,b'chr1':199689-199928
4,b'chr1':629307-629902
...,...
3825,b'chrY':318869-319190
3826,b'chrY':386933-387197
3827,b'chrY':1453156-1453412
3828,b'chrY':1591522-1591761


In [19]:
# Show the per-barcode (obs) annotation table as loaded from the input file.
display(adata.obs)

Unnamed: 0,barcode,TN,UM,PP,UQ,CM,file,sample
1,AAACTACCAGAAACCCGAGATA,33.0,18.0,18.0,15.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB
2,AAACTACCAGAAACCTAAGTGG,52.0,36.0,35.0,32.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB
3,AAACTACCAGAAACGGATCAGT,27.0,19.0,19.0,19.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB
4,AAACTACCAGAAACGTCCCGTT,6223.0,5231.0,5213.0,3779.0,18.0,/home/rstudio/data/snap/cropped_146.snap,MB
5,AAACTACCAGAAACTAGCCCTA,41.0,29.0,29.0,26.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB
...,...,...,...,...,...,...,...,...
369,AAACTACCAGCTTGGCAACAGC,26.0,20.0,20.0,20.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB
370,AAACTACCAGCTTTAACTGCGC,29.0,20.0,20.0,19.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB
371,AAACTACCAGCTTTAGCCCTAT,32.0,24.0,24.0,24.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB
372,AAACTACCAGCTTTAGCTGACT,10437.0,9231.0,9206.0,6209.0,17.0,/home/rstudio/data/snap/cropped_146.snap,MB


## Check for QC related columns

### 1. Check if mean fragment length column exists


In [20]:
# Check whether adata already provides a mean fragment length column.
# The result decides further down whether the column has to be calculated.
mfl_column = fragments.check_mfl(adata)
mfl_column

False

### 2. Check if percentage of reads in promoters column exists

In [21]:
# Check whether adata already provides a fragments-in-promoters column.
# The result decides further down whether the column has to be calculated.
pct_rip_column = overlap.check_pct_fragments_in_promoters(adata)
pct_rip_column

False

## Calculate missing columns

### 1. Calculate mean fragment length if missing

In [22]:
# Only calculate the mean fragment length when the earlier check did not find the column.
if not mfl_column:
    # Calculated from the fragments file; the returned object replaces adata.
    # NOTE(review): n_threads presumably sets the number of parallel workers — confirm against sctoolbox.
    adata = fragments.add_mfl_fragment(fragments_file, adata, n_threads)
# Display obs to inspect the result.
adata.obs

Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample,mean_fragment_length
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAACTACCAGAAACCCGAGATA,33.0,18.0,18.0,15.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB,76.833333
AAACTACCAGAAACCTAAGTGG,52.0,36.0,35.0,32.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB,155.064516
AAACTACCAGAAACGGATCAGT,27.0,19.0,19.0,19.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB,134.562500
AAACTACCAGAAACGTCCCGTT,6223.0,5231.0,5213.0,3779.0,18.0,/home/rstudio/data/snap/cropped_146.snap,MB,140.355033
AAACTACCAGAAACTAGCCCTA,41.0,29.0,29.0,26.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB,105.727273
...,...,...,...,...,...,...,...,...
AAACTACCAGCTTGGCAACAGC,26.0,20.0,20.0,20.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB,103.733333
AAACTACCAGCTTTAACTGCGC,29.0,20.0,20.0,19.0,0.0,/home/rstudio/data/snap/cropped_146.snap,MB,107.692308
AAACTACCAGCTTTAGCCCTAT,32.0,24.0,24.0,24.0,1.0,/home/rstudio/data/snap/cropped_146.snap,MB,140.850000
AAACTACCAGCTTTAGCTGACT,10437.0,9231.0,9206.0,6209.0,17.0,/home/rstudio/data/snap/cropped_146.snap,MB,125.825601


### 2. Promoter enrichment

In [10]:
# Use the barcodes as the obs index. Guarded so that the cell can be re-run, or
# run after a step that already moved 'barcode' into the index, without raising
# a KeyError for the missing column.
if 'barcode' in adata.obs.columns:
    adata.obs = adata.obs.set_index('barcode')

In [None]:
# Only calculate the fragments-in-promoters column when the earlier check did not find it.
if not pct_rip_column:
    # Use the thread count from the settings cell instead of a hard-coded single
    # process, consistent with the mean fragment length calculation.
    # NOTE(review): the BAM file is converted to a fragments file first, which is slow;
    # a fragments file is already configured in the settings — check whether the
    # function accepts it directly.
    overlap.pct_fragments_in_promoters(adata, promoters_gtf, bam_file=bam_file, cb_col=None, nproc=n_threads)
# Display obs to inspect the result.
adata.obs

Converting BAM to fragments file! This may take a while...


## Inspect adata.obs

In [None]:
# Final look at the obs table before saving.
adata.obs

## Save adata to .h5ad

In [None]:
# Look up the output location from the path handling object and display it for checking.
adata_output = tree.assembled_anndata
adata_output

In [None]:
#Saving the data
cr.build_infor(adata, "Test_number", test)
cr.build_infor(adata, "Anndata_path", output_dir)

adata_output = tree.assembled_anndata
adata.write(filename=adata_output)