# Assembling

This notebook is used to assemble a processable h5ad object for subsequent notebooks.

## Fill in input data, output and settings

In [8]:
####################### TEST NAME ###################################
# Name of this run; later assigned to `tree.run` and stored via build_infor under "Test_number"
test = 'lung'
################### DEFINING STRATEGY ###############################
# Input strategy flags (one per supported source type).
# NOTE(review): the visible cells of this notebook only act on `from_h5ad` — confirm the other options are handled elsewhere.
from_h5ad = True #option 1
assembling_10_velocity = False #option 2
assembling_10_public = False #option 3
convert_seurat = False #option 4

###################### INPUT DATA ###################################

# For option 1: list of paths to existing .h5ad files
h5ad_files = ['/home/rstudio/data/anndata/ENC-1JKYN-179-SM-ACCPU_snATAC_upper_lobe_of_left_lung_Rep1.h5ad']
# Manually map existing QC columns: set each variable to the name of the adata.obs column
# that already holds the metric, or leave it as None so the metric is calculated further below.
n_features_by_counts = None
log1p_n_features_by_counts = None
total_counts = None
log1p_total_counts = None
mean_insertsize = None
insertsize_count = None
n_total_fragments = None
n_fragments_in_promoters = None
pct_fragments_in_promoters = None
blacklist_overlaps = None
# total_number_of_fragments
TN = 'TN'
# uniquely_mapped_fragments
UM = 'UM'
# properly_paired_fragments
PP = 'PP'
# uniq_fragments
UQ = 'UQ'
# chrM_fragments
CM = 'CM'

#################### Calc QC Columns ###############################
# Number of threads available for the QC calculations
n_threads = 8

# If the mean insert size should be calculated, specify either a BAM file or a fragments file.
# NOTE(review): the fragments file name refers to Esophagus while `test`, the h5ad and the BAM refer to lung,
# and the BAM sample ID (ENC-1K2DA-044-SM-A62E9) differs from the h5ad sample ID (ENC-1JKYN-179-SM-ACCPU)
# — confirm these files belong to the same sample as the h5ad input.
fragments_file = '/home/rstudio/data/bamfiles/fragments_Esophagus.bed'
bam_file = '/home/rstudio/data/bamfiles/ENC-1K2DA-044-SM-A62E9_snATAC_upper_lobe_of_left_lung_Rep1.bam'

# Set promoters_gtf to use a custom promoter annotation instead of the built-in ones.
# If promoters_gtf is None, specify `species` instead.
# Alternative (server) location of the same annotation:
#promoters_gtf =  '/mnt/flatfiles/organisms/new_organism/homo_sapiens/104/homo_sapiens.104.promoters2000.gtf'
promoters_gtf = '/home/rstudio/data/homo_sapiens.104.promoters2000.gtf'
species = None

##################### OUTPUT DATA ###################################
# Root directory for processed output; assigned to `tree.processing_dir` and stored via build_infor under "Anndata_path"
output_dir = '/home/rstudio/processed_data'

## Import modules

In [9]:
# sctoolbox modules 
import sctoolbox.atac_tree as sub_tree
import sctoolbox.creators as cr
import sctoolbox.fragment_length as fragments
import sctoolbox.atac as atac
import sctoolbox.calc_overlap_pct as overlap
import sctoolbox.analyser 
from sctoolbox.qc_filter import *
from sctoolbox.atac_utils import *
# import episcanpy
import episcanpy as epi

In [10]:
#adata = epi.read_h5ad(h5ad_files[0])

In [11]:
#adata.var

## Set up path handling object

In [12]:
# Create the path-handling object; it is later queried for the output file location (tree.assembled_anndata)
tree = sub_tree.ATAC_tree()
# Root directory under which processed output is stored
tree.processing_dir = output_dir
# Name of the current run/sample.
# NOTE(review): each assignment appears to trigger a directory check (the cell prints
# "all directories existing" twice) — keep the assignment order as is.
tree.run = test

all directories existing
all directories existing


## Read in data

### Option 1: Read from .h5ad

In [13]:
# Map every QC metric to the name of the adata.obs column that holds it.
# A value of None means the metric is not available yet and is calculated in a later cell.
# 'insertsize_count' is included so that later lookups of that key cannot fail with a KeyError.
qc_columns = {
    "n_features_by_counts": n_features_by_counts,
    "log1p_n_features_by_counts": log1p_n_features_by_counts,
    "total_counts": total_counts,
    "log1p_total_counts": log1p_total_counts,
    "mean_insertsize": mean_insertsize,
    "insertsize_count": insertsize_count,
    "n_total_fragments": n_total_fragments,
    "n_fragments_in_promoters": n_fragments_in_promoters,
    "pct_fragments_in_promoters": pct_fragments_in_promoters,
    "blacklist_overlaps": blacklist_overlaps,
    "TN": TN,
    "UM": UM,
    "PP": PP,
    "UQ": UQ,
    "CM": CM,
}

In [14]:
# Option 1: build the AnnData object from the configured .h5ad file(s),
# passing along the mapping of already existing QC columns.
if from_h5ad:
    adata = assemble_from_h5ad(
        h5ad_files=h5ad_files,
        atac=False,
        qc_columns=qc_columns,
    )

add existing adata.obs columns to infoprocess:

TN:TN
UM:UM
PP:PP
UQ:UQ
CM:CM
setting adata.obs.index = adata.obs[barcode]


## Inspect adata

In [15]:
# Show the summary of the assembled AnnData object (dimensions, obs columns, uns keys)
display(adata)

AnnData object with n_obs × n_vars = 73623 × 46782
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample'
    uns: 'infoprocess', 'color_set', 'legend'

In [16]:
# Show the feature annotation table (adata.var)
adata.var

0
1
2
3
4
...
46777
46778
46779
46780
46781


In [17]:
# Show the feature annotation table (adata.var).
# NOTE(review): this repeats the output of the preceding `adata.var` cell.
display(adata.var)

0
1
2
3
4
...
46777
46778
46779
46780
46781


In [18]:
# Show the per-barcode annotation table (adata.obs) with the QC columns read from the input file
display(adata.obs)

Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AACGAGAGCTAAACCCGAGATA,36.0,26.0,26.0,17.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
AACGAGAGCTAAACCTAAGTGG,25.0,14.0,14.0,11.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
AACGAGAGCTAAACGGATCAGT,33.0,25.0,25.0,14.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
AACGAGAGCTAAACGTCCCGTT,37.0,34.0,33.0,20.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
AACGAGAGCTAAACTAGCCCTA,58.0,50.0,50.0,34.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
...,...,...,...,...,...,...,...
TTTGGCGCTTTTTCGCGTGTAA,49.0,24.0,24.0,19.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
TTTGGCGCTTTTTCTGCAGACT,16.0,9.0,9.0,7.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
TTTGGCGCTTTTTCTGGCGCAG,1607.0,1438.0,1435.0,754.0,70.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1
TTTGGCGCTTTTTGCCGGAAGG,14418.0,12956.0,12934.0,6868.0,348.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1


## Calculate missing columns

### 1. calculate scanpy qc_metrics (n_features)

In [19]:
# Calculate the per-cell count metrics when no existing column was configured for them.
if qc_columns['n_features_by_counts'] is None or qc_columns['log1p_n_features_by_counts'] is None:
    # Use the fully qualified module path: `import sctoolbox.analyser` binds `sctoolbox`,
    # not a bare `analyser` name, so this does not depend on names leaked by star imports.
    adata = sctoolbox.analyser.calculate_qc_metrics(adata, var_type='features')

    count_metrics = (
        'n_features_by_counts',
        'log1p_n_features_by_counts',
        'total_counts',
        'log1p_total_counts',
    )

    # update column dictionary: the new columns carry the metric's own name
    for metric in count_metrics:
        qc_columns[metric] = metric

    # register the new columns via build_legend
    for metric in count_metrics:
        build_legend(adata, metric, metric)

adata.obs

Unnamed: 0_level_0,TN,UM,PP,UQ,CM,file,sample,n_features_by_counts,log1p_n_features_by_counts,total_counts,log1p_total_counts
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AACGAGAGCTAAACCCGAGATA,36.0,26.0,26.0,17.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,4,1.609438,4.0,1.609438
AACGAGAGCTAAACCTAAGTGG,25.0,14.0,14.0,11.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,2,1.098612,2.0,1.098612
AACGAGAGCTAAACGGATCAGT,33.0,25.0,25.0,14.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,0,0.000000,0.0,0.000000
AACGAGAGCTAAACGTCCCGTT,37.0,34.0,33.0,20.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,1,0.693147,1.0,0.693147
AACGAGAGCTAAACTAGCCCTA,58.0,50.0,50.0,34.0,1.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
TTTGGCGCTTTTTCGCGTGTAA,49.0,24.0,24.0,19.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,0,0.000000,0.0,0.000000
TTTGGCGCTTTTTCTGCAGACT,16.0,9.0,9.0,7.0,0.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,1,0.693147,1.0,0.693147
TTTGGCGCTTTTTCTGGCGCAG,1607.0,1438.0,1435.0,754.0,70.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,89,4.499810,130.0,4.875197
TTTGGCGCTTTTTGCCGGAAGG,14418.0,12956.0,12934.0,6868.0,348.0,/home/rstudio/data/anndata/ENC-1JKYN-179-SM-AC...,sample1,386,5.958425,541.0,6.295266


### 2. Calculate mean insert size and insert size count if missing

In [20]:
# Calculate insert size metrics when no existing column was configured for them.
# `.get()` is used because 'insertsize_count' may be absent from qc_columns;
# a plain subscript would raise KeyError as soon as 'mean_insertsize' is set.
if qc_columns.get('mean_insertsize') is None or qc_columns.get('insertsize_count') is None:
    # NOTE(review): the recorded run of this cell failed with a KeyError for the columns
    # 'insertsize_count'/'mean_insertsize' — presumably the fragments file does not match
    # the barcodes in adata.obs; verify `fragments_file` belongs to this sample.
    atac.add_insertsize(adata, fragments=fragments_file) # or use bam=bam_file instead of fragments

    insertsize_metrics = ('mean_insertsize', 'insertsize_count')

    # update column dictionary
    for metric in insertsize_metrics:
        qc_columns[metric] = metric

    # register the new columns via build_legend
    for metric in insertsize_metrics:
        build_legend(adata, metric, metric)

adata.obs

Counting fragment lengths from fragments file...
Done reading file - elapsed time: 0:01:39
Converting counts to dataframe...


KeyError: "None of [Index(['insertsize_count', 'mean_insertsize'], dtype='object')] are in the [columns]"

In [None]:
# Plot the insert size information for adata.
# NOTE(review): presumably relies on the results of atac.add_insertsize from the preceding cell — confirm.
atac.plot_insertsize(adata, barcodes=None)

### 3. Promoter enrichment

In [None]:
# Calculate the promoter overlap metrics when no existing column was configured for them.
if qc_columns['n_fragments_in_promoters'] is None:
    # Use the thread count from the settings cell instead of a hard-coded single process.
    overlap.pct_fragments_in_promoters(adata, gtf_file=promoters_gtf, species=species, bam_file=bam_file, cb_col=None, nproc=n_threads)

    promoter_metrics = (
        'n_total_fragments',
        'n_fragments_in_promoters',
        'pct_fragments_in_promoters',
    )

    # update column dictionary
    for metric in promoter_metrics:
        qc_columns[metric] = metric

    # register the new columns via build_legend
    for metric in promoter_metrics:
        build_legend(adata, metric, metric)
    

## Inspect adata.obs

In [None]:
# Show adata.obs including the QC columns added by the cells above
adata.obs

## Save adata to .h5ad

In [None]:
# Look up the output location from the path-handling object and display it for review
adata_output = tree.assembled_anndata
adata_output

In [None]:
#Saving the data
cr.build_infor(adata, "Test_number", test)
cr.build_infor(adata, "Anndata_path", output_dir)

adata_output = tree.assembled_anndata
adata.write(filename=adata_output)