# Assembling

This notebook is used to assemble a processable h5ad object for subsequent notebooks.

## Fill in input data, output and settings

In [16]:
####################### TEST NAME ###################################
# Name of this sample/run: assigned to tree.run and stored as "Test_number" in the infoprocess table.
test = 'Esophagus'
################### DEFINING STRATEGY ###############################
# Input strategy switches. Of these, only from_h5ad is read by the cells visible in this notebook.
from_h5ad = True #option 1
assembling_10_velocity = False #option 2
assembling_10_public = False #option 3
convert_seurat = False #option 4

###################### INPUT DATA ###################################

# For option 1: the path to an existing .h5ad file
h5ad_path = '/home/rstudio/data/anndata/Esophagus.h5ad'
# Manually set existing QC columns: give the name of the adata.obs column that
# already holds the metric, or None if no such column exists yet.
# Insert-size and promoter metrics left as None are calculated further below.
mean_insertsize = None
insertsize_count = None
n_total_fragments = None
n_fragments_in_promoters = None
pct_fragments_in_promoters = None
blacklist_overlaps = None
# total_number_of_fragments
TN = 'TN'
# uniquely_mapped_fragments
UM = 'UM'
# properly_paired_fragments
PP = 'PP'
# uniq_fragments
UQ = 'UQ'
# chrM_fragments
CM = 'CM'

#################### Calc QC Columns ###############################
# Set the available number of threads
n_threads = 8

# Inputs for the calculated QC columns. In the cells below the fragments file is
# passed to the insert-size calculation (a BAM file can be used there instead)
# and the BAM file is passed to the promoter-overlap calculation.
fragments_file = '/home/rstudio/data/bamfiles/fragments_Esophagus.bed'
bam_file = '/home/rstudio/data/bamfiles/sorted_Esophagus.bam'

# Specify promoters_gtf if promoters other than the fixed ones should be used;
# if promoters_gtf is None, please specify species instead.
#promoters_gtf =  '/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.promoters2000.gtf'
promoters_gtf = '/home/rstudio/data/homo_sapiens.104.promoters2000.gtf'
species = None

##################### OUTPUT DATA ###################################
# Root output directory; assigned to tree.processing_dir and stored as "Anndata_path" in the infoprocess table.
output_dir = '/home/rstudio/processed_data'

## Import modules

In [17]:
# sctoolbox modules 
import sctoolbox.atac_tree as sub_tree
import sctoolbox.creators as cr
import sctoolbox.fragment_length as fragments
import sctoolbox.atac as atac
import sctoolbox.calc_overlap_pct as overlap
from sctoolbox.qc_filter import *
from sctoolbox.atac_utils import *
# import episcanpy
import episcanpy as epi

## Set up path-handling object

In [18]:
# make an instance of the class
# Create the path-handling object; later cells read output locations from it
# (e.g. tree.assembled_anndata in the save step).
tree = sub_tree.ATAC_tree()
# Root directory under which all outputs of this run are placed.
tree.processing_dir = output_dir
# Sample/run name. The cell output lists the per-run subdirectories
# (assembling, qc, norm_correction, ...) reported as "NEWLY SETUP" under <output_dir>/<test>.
tree.run = test

/home/rstudio/processed_data/Esophagus/assembling: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/qc: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/norm_correction: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/clustering: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/annotation: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/complete_report: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/qc/plots: NEWLY SETUP
/home/rstudio/processed_data/Esophagus/assembling/anndata: NEWLY SETUP


## Read in data

### Option 1: Read from .h5ad

In [19]:
# Map each QC metric to the name of the adata.obs column that already holds it.
# A value of None means the column is not available yet; the insert-size and
# promoter metrics are then calculated in the "Calculate missing columns" section.
qc_columns = {
    "mean_insertsize": mean_insertsize,
    # This entry was missing although insertsize_count is configured in the
    # settings cell and looked up by the insert-size step (KeyError otherwise).
    "insertsize_count": insertsize_count,
    "n_total_fragments": n_total_fragments,
    "n_fragments_in_promoters": n_fragments_in_promoters,
    "pct_fragments_in_promoters": pct_fragments_in_promoters,
    "blacklist_overlaps": blacklist_overlaps,
    "TN": TN,
    "UM": UM,
    "PP": PP,
    "UQ": UQ,
    "CM": CM,
}

In [20]:
if from_h5ad:

    # Load the existing AnnData object from disk.
    adata = epi.read_h5ad(h5ad_path)

    # Record where the data came from and which strategy was used.
    cr.build_infor(adata, "Input_for_assembling", h5ad_path)
    cr.build_infor(adata, "Strategy", "Read from h5ad")

    print('add existing adata.obs columns to infoprocess:')
    print()
    # Register every configured QC column that is actually present in adata.obs.
    for metric, column in qc_columns.items():
        if column is None:
            # Not configured; it may be calculated later in the notebook.
            continue
        print(metric + ':' + column)
        if column not in adata.obs.columns:
            print('column:  ' + column + ' is not in adata.obs')
            continue
        build_legend(adata, metric, column)

    # Make sure the barcodes are the index of adata.obs.
    if adata.obs.index.name == "barcode":
        print('barcodes are already the index')
    else:
        print('setting adata.obs.index = adata.obs[barcode]')
        adata.obs = adata.obs.set_index("barcode")
    

add existing adata.obs columns to infoprocess:

TN:TN
UM:UM
PP:PP
UQ:UQ
CM:CM
setting adata.obs.index = adata.obs[barcode]




## Inspect adata

In [21]:
# Show the AnnData summary (dimensions and the obs/var/uns keys) after loading.
display(adata)

AnnData object with n_obs × n_vars = 73652 × 150138
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample'
    var: 'name'
    uns: 'infoprocess', 'color_set', 'legend'

In [22]:
# Show the feature table; the output lists a single 'name' column of region strings.
# NOTE(review): the names in the output carry a b'...' bytes-repr around the chromosome
# (e.g. b'chr1':10008-10690) - presumably this needs cleaning before downstream use; confirm.
display(adata.var)

Unnamed: 0,name
0,b'chr1':10008-10690
1,b'chr1':11179-11381
2,b'chr1':28734-29439
3,b'chr1':29495-29682
4,b'chr1':34673-35029
...,...
150133,b'chrY':57203305-57203488
150134,b'chrY':57206427-57206551
150135,b'chrY':57215321-57215411
150136,b'chrY':57215521-57215837


In [23]:
# Show the per-barcode metadata table (barcodes are the index after the loading step).
display(adata.obs)

Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACTACCAGAAACCCGAGATA,33.0,18.0,18.0,15.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB
AAACTACCAGAAACCTAAGTGG,52.0,36.0,35.0,32.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB
AAACTACCAGAAACGGATCAGT,27.0,19.0,19.0,19.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB
AAACTACCAGAAACGTCCCGTT,6223.0,5232.0,5214.0,3780.0,18.0,/home/rstudio/data/snap/Esophagus.snap,MB
AAACTACCAGAAACTAGCCCTA,41.0,29.0,29.0,26.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB
...,...,...,...,...,...,...,...
TTCCATCCTCTTTCGCGTGTAA,9.0,6.0,6.0,6.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB
TTCCATCCTCTTTCTGCAGACT,10.0,6.0,6.0,6.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB
TTCCATCCTCTTTCTGGCGCAG,17.0,5.0,5.0,5.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB
TTCCATCCTCTTTGCCGGAAGG,8.0,7.0,7.0,7.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB


## Calculate missing columns

### 1. Calculate mean insert size and insert-size count if missing

In [24]:
# (Re)calculate the insert-size metrics when either column was not provided.
# .get() is used so that a key missing from qc_columns is treated like None
# instead of raising a KeyError when 'mean_insertsize' is already set.
if qc_columns.get('mean_insertsize') is None or qc_columns.get('insertsize_count') is None:
    atac.add_insertsize(adata, fragments=fragments_file) # or use bam=bam_file instead of fragments

    # The call reports adding both columns to adata.obs; register them under
    # their own names in the column dictionary and in the infoprocess table.
    for insertsize_col in ('mean_insertsize', 'insertsize_count'):
        qc_columns[insertsize_col] = insertsize_col
    for insertsize_col in ('mean_insertsize', 'insertsize_count'):
        build_legend(adata, insertsize_col, insertsize_col)

adata.obs

Counting fragment lengths from fragments file...
Done reading file - elapsed time: 0:03:49
Converting counts to dataframe...
Done getting insertsizes from fragments!
Added insertsize information to adata.obs[["insertsize_count", "mean_insertsize"]] and adata.uns["insertsize_distribution"].


Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample,insertsize_count,mean_insertsize
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACTACCAGAAACCCGAGATA,33.0,18.0,18.0,15.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,12.0,85.18
AAACTACCAGAAACCTAAGTGG,52.0,36.0,35.0,32.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,30.0,158.53
AAACTACCAGAAACGGATCAGT,27.0,19.0,19.0,19.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,16.0,162.50
AAACTACCAGAAACGTCCCGTT,6223.0,5232.0,5214.0,3780.0,18.0,/home/rstudio/data/snap/Esophagus.snap,MB,4923.0,137.92
AAACTACCAGAAACTAGCCCTA,41.0,29.0,29.0,26.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,26.0,126.11
...,...,...,...,...,...,...,...,...,...
TTCCATCCTCTTTCGCGTGTAA,9.0,6.0,6.0,6.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,6.0,117.17
TTCCATCCTCTTTCTGCAGACT,10.0,6.0,6.0,6.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,2.0,92.50
TTCCATCCTCTTTCTGGCGCAG,17.0,5.0,5.0,5.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,3.0,77.00
TTCCATCCTCTTTGCCGGAAGG,8.0,7.0,7.0,7.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,2.0,154.50


In [25]:
# Optional check: uncomment to inspect the 'legend' entry of adata.uns
# (presumably the mapping maintained by build_legend - confirm).
#adata.uns['legend']

### 2. Promoter enrichment

In [27]:
# Calculate the promoter-overlap metrics only when no existing column was configured.
if qc_columns['n_fragments_in_promoters'] is None:
    # Use the thread count from the settings cell rather than a hard-coded single process.
    overlap.pct_fragments_in_promoters(
        adata,
        gtf_file=promoters_gtf,
        species=species,
        bam_file=bam_file,
        cb_col=None,
        nproc=n_threads,
    )

    # The three resulting adata.obs columns (n_total_fragments, n_fragments_in_promoters,
    # pct_fragments_in_promoters) are registered under their own names in the
    # column dictionary and in the infoprocess table.
    promoter_cols = ('n_total_fragments', 'n_fragments_in_promoters', 'pct_fragments_in_promoters')
    for promoter_col in promoter_cols:
        qc_columns[promoter_col] = promoter_col
    for promoter_col in promoter_cols:
        build_legend(adata, promoter_col, promoter_col)
    

## Inspect adata.obs

In [28]:
adata.obs

Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample,insertsize_count,mean_insertsize,n_total_fragments,n_fragments_in_promoters,pct_fragments_in_promoters
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACTACCAGAAACCCGAGATA,33.0,18.0,18.0,15.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,12.0,85.18,12.0,4.0,0.333333
AAACTACCAGAAACCTAAGTGG,52.0,36.0,35.0,32.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,30.0,158.53,26.0,5.0,0.192308
AAACTACCAGAAACGGATCAGT,27.0,19.0,19.0,19.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,16.0,162.50,12.0,5.0,0.416667
AAACTACCAGAAACGTCCCGTT,6223.0,5232.0,5214.0,3780.0,18.0,/home/rstudio/data/snap/Esophagus.snap,MB,4923.0,137.92,5140.0,1256.0,0.244358
AAACTACCAGAAACTAGCCCTA,41.0,29.0,29.0,26.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,26.0,126.11,24.0,6.0,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...
TTCCATCCTCTTTCGCGTGTAA,9.0,6.0,6.0,6.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,6.0,117.17,4.0,1.0,0.250000
TTCCATCCTCTTTCTGCAGACT,10.0,6.0,6.0,6.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,2.0,92.50,2.0,0.0,0.000000
TTCCATCCTCTTTCTGGCGCAG,17.0,5.0,5.0,5.0,1.0,/home/rstudio/data/snap/Esophagus.snap,MB,3.0,77.00,7.0,4.0,0.571429
TTCCATCCTCTTTGCCGGAAGG,8.0,7.0,7.0,7.0,0.0,/home/rstudio/data/snap/Esophagus.snap,MB,2.0,154.50,2.0,1.0,0.500000


## Save adata to .h5ad

In [29]:
# Path of the assembled .h5ad file as provided by the path-handling object;
# displayed here so the target location can be checked before saving.
adata_output = tree.assembled_anndata
adata_output

'/home/rstudio/processed_data/Esophagus/assembling/anndata/Esophagus.h5ad'

In [30]:
import numpy as np
import pandas as pd

In [36]:
# Swap rows and columns of the insert-size distribution table.
# DataFrame.swapaxes is deprecated in recent pandas; .T is the equivalent transpose.
# NOTE: this cell is not idempotent - running it a second time flips the table back.
adata.uns['insertsize_distribution'] = adata.uns['insertsize_distribution'].T

In [32]:
# Replace the row index of the insert-size table with a default 0..n-1 range (old labels are dropped).
# NOTE(review): the execution counts show this cell (In [32]) was run BEFORE the axis swap
# in the cell above (In [36]). On "Run All" the order is reversed, so the labels of the
# other axis would be dropped - confirm the intended order before relying on this cell.
adata.uns['insertsize_distribution'] = adata.uns['insertsize_distribution'].reset_index(drop=True)

In [38]:
# Replace missing counts with 0 and store the result. DataFrame.fillna returns a new
# frame, so without the assignment the table kept in adata.uns would still contain NaN
# even though the displayed output looks filled.
adata.uns['insertsize_distribution'] = adata.uns['insertsize_distribution'].fillna(0)
adata.uns['insertsize_distribution']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
AAACTACCAGAAACCCGAGATA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACTACCAGAAACCTAAGTGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACTACCAGAAACGGATCAGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACTACCAGAAACGTCCCGTT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACTACCAGAAACTAGCCCTA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCCATCCTCTTTCGCGTGTAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TTCCATCCTCTTTCTGCAGACT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TTCCATCCTCTTTCTGGCGCAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TTCCATCCTCTTTGCCGGAAGG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# FIX TO SAVE TO ADATA
# Cast the column labels of the insert-size table to strings.
# NOTE(review): presumably needed because non-string column labels cannot be
# written to .h5ad - confirm against the anndata writer.
adata.uns['insertsize_distribution'].columns = adata.uns['insertsize_distribution'].columns.astype(str)

In [35]:
# Deep-copy the whole AnnData object and display the .uns contents of the copy.
# NOTE(review): adata_2 is not used by any later cell shown here and duplicates the
# full object in memory; this looks like a leftover check - confirm and remove.
import copy 
adata_2 = copy.deepcopy(adata)
adata_2.uns

OverloadedDict, wrapping:
	OrderedDict([('infoprocess', {'Input_for_assembling': '/home/rstudio/data/anndata/Esophagus.h5ad', 'Strategy': 'Read from h5ad', 'Test_number': 'Esophagus', 'Anndata_path': '/home/rstudio/processed_data'}), ('color_set', ['red', 'blue', 'green', 'pink', 'chartreuse', 'gray', 'yellow', 'brown', 'purple', 'orange', 'wheat', 'lightseagreen', 'cyan', 'khaki', 'cornflowerblue', 'olive', 'gainsboro', 'darkmagenta', 'slategray', 'ivory', 'darkorchid', 'papayawhip', 'paleturquoise', 'oldlace', 'orangered', 'lavenderblush', 'gold', 'seagreen', 'deepskyblue', 'lavender', 'peru', 'silver', 'midnightblue', 'antiquewhite', 'blanchedalmond', 'firebrick', 'greenyellow', 'thistle', 'powderblue', 'darkseagreen', 'darkolivegreen', 'moccasin', 'olivedrab', 'mediumseagreen', 'lightgray', 'darkgreen', 'tan', 'yellowgreen', 'peachpuff', 'cornsilk', 'darkblue', 'violet', 'cadetblue', 'palegoldenrod', 'darkturquoise', 'sienna', 'mediumorchid', 'springgreen', 'darkgoldenrod', 'magent

In [47]:
#Saving the data
cr.build_infor(adata, "Test_number", test)
cr.build_infor(adata, "Anndata_path", output_dir)

adata_output = tree.assembled_anndata
adata.write(filename=adata_output)