In [1]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background of the selected input cells to PowderBlue.
# `select` lists the cells to colour (presumably cell indices/positions — confirm against sctoolbox).
bgcolor("PowderBlue", select=[3, 5, 8, 9, 10, 14])

# File name of this notebook; passed to the version check below.
nb_name = "05_tobias.ipynb"

# NOTE(review): `_compare_version` is a private sctoolbox helper; presumably it compares
# this notebook's version against the installed sctoolbox version — confirm.
_compare_version(nb_name)



In [2]:
import os
import glob
import sctoolbox.tools.bam as bam
import sctoolbox.tools.tobias as tobias

def prepare_tobias(adata,
                   cluster_column,
                   barcode_column,
                   path_out,
                   path_bam,
                   barcode_tag='CB',
                   path_TOBIAS_fasta=None,
                   path_TOBIAS_motifs=None,
                   path_TOBIAS_gtf=None,
                   path_TOBIAS_blacklist=None,
                   TOBIAS_organism='human',
                   TOBIAS_yml="TOBIAS_config.yml",
                   plot_comparison=True,
                   plot_correction=True,
                   plot_venn=True,
                   coverage=False,
                   wilson=False,
                   threads=4):
    """
    Split ATAC-seq bamfile by cluster and prepare TOBIAS run.

    Creates ``<path_out>/TOBIAS_run/input`` and ``<path_out>/TOBIAS_run/output``,
    splits the bam file into one bam per cluster (only if more than one cluster
    is present), and writes the TOBIAS config yml into the output directory.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    cluster_column : str
        Column in adata.obs that contains the cluster information.
    barcode_column : str
        Column in adata.obs that contains the barcode information.
    path_out : str
        Path to an existing directory in which the 'TOBIAS_run' directory is created.
    path_bam : str
        Path to the bam file that is split by cluster.
    barcode_tag : str, default 'CB'
        Read tag of the bam file that holds the cell barcode.
    path_TOBIAS_fasta : str, default None
        Path to the organism fasta file.
    path_TOBIAS_motifs : str, default None
        Path to the motifs file.
    path_TOBIAS_gtf : str, default None
        Path to the organisms gtf file (genes).
    path_TOBIAS_blacklist : str, default None
        Path to the blacklist file. If None, a mock blacklist file ('blacklist.bed')
        is written to the TOBIAS output directory and used instead.
    TOBIAS_organism : str, default 'human'
        Organism for TOBIAS.
    TOBIAS_yml : str, default 'TOBIAS_config.yml'
        Name of TOBIAS config yml. Cannot be named "config.yml" or it will be overwritten when running TOBIAS.
    plot_comparison : bool, default True
        TOBIAS flag for plotting comparison between condition.
    plot_correction : bool, default True
        TOBIAS flag for plotting correction.
    plot_venn : bool, default True
        TOBIAS flag for plotting venn diagrams.
    coverage : bool, default False
        TOBIAS flag for coverage calculation.
    wilson : bool, default False
        TOBIAS flag for wilson calculation.
    threads : int, default 4
        Number of threads to use. Splitting runs in parallel if threads > 1.

    Raises
    ------
    OSError
        If `path_out` does not exist.
    ValueError
        If `cluster_column` or `barcode_column` is not a column of adata.obs.
    """

    # Check if output directory exists
    if not os.path.exists(path_out):
        raise OSError(f"The file or directory \'{path_out}\' does not exist. Please give valid input.")

    # Validate the requested columns before touching the file system or the bam file,
    # so that invalid input does not leave half-created run directories behind.
    if cluster_column not in adata.obs.columns:
        raise ValueError(f"Column \'{cluster_column}\' not found in adata.obs. Please give valid input.")

    if barcode_column not in adata.obs.columns:
        raise ValueError(f"Column \'{barcode_column}\' not found in adata.obs. Please give valid input.")

    # Directory layout of the TOBIAS run
    path_TOBIAS = os.path.join(path_out, 'TOBIAS_run')
    # The trailing separator lets the path double as the output prefix for the split bams
    path_TOBIAS_in = os.path.join(path_TOBIAS, "input", "")
    path_TOBIAS_out = os.path.join(path_TOBIAS, "output")

    # Warn if a previous run directory is reused
    if os.path.exists(path_TOBIAS):
        print(f"WARNING: The directory \'{path_TOBIAS}\' already exists. Any files in this directory may be overwritten, which can cause inconsistencies.")

    # Create missing directories; exist_ok avoids the check-then-create race
    for directory in (path_TOBIAS, path_TOBIAS_in, path_TOBIAS_out):
        os.makedirs(directory, exist_ok=True)

    # check if the correct barcode tag is used
    bam.check_barcode_tag(adata, path_bam, barcode_tag)

    # Splitting is skipped when all cells belong to exactly one cluster.
    # NOTE(review): in that case only bam files already present in the input
    # directory are picked up below; `path_bam` itself is not used — confirm this is intended.
    split_bam = len(adata.obs[cluster_column].unique()) != 1

    # Split bam file by cluster
    if split_bam:
        bam.split_bam_clusters(adata,
                               bams=path_bam,
                               groupby=cluster_column,
                               barcode_col=barcode_column,
                               read_tag=barcode_tag,
                               output_prefix=path_TOBIAS_in,
                               reader_threads=threads,
                               writer_threads=threads,
                               parallel=threads > 1,
                               pysam_threads=threads)

    # Get absolute paths to bam files for the TOBIAS run.
    # glob returns files in arbitrary order; sort to keep the written config reproducible.
    path_TOBIAS_pseudobams = sorted(os.path.abspath(f)
                                    for f in glob.glob("".join([path_TOBIAS_in, "*.bam"])))

    # If there is no blacklist file given, create a mock blacklist file
    if path_TOBIAS_blacklist is None:
        path_TOBIAS_blacklist = os.path.join(path_TOBIAS_out, "blacklist.bed")
        # Context manager guarantees the handle is closed even if the write fails
        with open(path_TOBIAS_blacklist, "w") as blacklist_file:
            blacklist_file.write("chr1\t0\t1\n")

    print("Writing TOBIAS config yaml.")

    # Call function to write TOBIAS config yml
    tobias.write_TOBIAS_config(os.path.join(path_TOBIAS_out, TOBIAS_yml),
                               bams=path_TOBIAS_pseudobams,
                               fasta=path_TOBIAS_fasta,
                               gtf=path_TOBIAS_gtf,
                               motifs=path_TOBIAS_motifs,
                               blacklist=path_TOBIAS_blacklist,
                               organism=TOBIAS_organism,
                               output=path_TOBIAS_out,
                               plot_comparison=plot_comparison,
                               plot_correction=plot_correction,
                               plot_venn=plot_venn,
                               coverage=coverage,
                               wilson=wilson)

#  05 - TOBIAS
<hr style="border:2px solid black"> </hr>

## 1 - Description
This notebook provides an ATAC-seq based footprinting analysis to infer transcription factor binding sites. For this purpose, TOBIAS is embedded in the notebook via customized wrapper functions. TOBIAS provides a collection of command-line bioinformatics tools to perform footprinting analysis, including tools for:
- Correction of Tn5 insertion bias
- Calculation of footprint scores within regulatory regions
- Estimation of bound/unbound transcription factor binding sites
- Visualization of footprints within and across different conditions

### 1.1 Embedding
Embeddings are dimension reduction methods to transform high-dimensional data into lower-dimensional representations while preserving the inherent structure and relationships between individual cells.  
The sctoolbox supports the [Uniform Manifold Approximation and Projection (UMAP)](https://arxiv.org/abs/1802.03426) and the [t-distributed stochastic neighbor embedding (t-SNE)](https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf) methods for dimension reduction, with UMAP being set as the default value.
To learn more about the differences between those methods and to get more insight into the parameter selection, have a look [here for UMAP](https://pair-code.github.io/understanding-umap/) and [here for t-SNE](https://distill.pub/2016/misread-tsne/?_ga=2.135835192.888864733.1531353600-1779571267.1531353600).
### 1.2 Clustering
Single-cell clustering is used to group individual cells into clusters based on similarities in their gene expression. The clustering makes it possible to identify distinct cell types and to characterize cellular heterogeneity within a population.
The sctoolbox supports the [leiden](https://www.nature.com/articles/s41598-019-41695-z) and the [louvain](https://iopscience.iop.org/article/10.1088/1742-5468/2008/10/P10008) clustering methods, with the leiden clustering algorithm being newer and recommended to use.

In [3]:
import pandas as pd

from sctoolbox import settings
import sctoolbox.utils as utils
import sctoolbox.tools as tools
import sctoolbox.utils.decorator as deco

# Notebook-wide display configuration; kept below the imports so the import block stays contiguous
pd.set_option('display.max_columns', None)  # no limit to the number of columns shown

In [4]:
# In/output paths
settings.adata_input_dir = "../adatas/"
settings.adata_output_dir = "../adatas/"
settings.figure_dir = "../figures/tobias/"
# Must be an assignment: `settings.log_file: "..."` is only an annotation and sets nothing
settings.log_file = "../logs/tobias_analysis_log.txt"

# Input/Output
last_notebook_adata = "anndata_4.h5ad"  # anndata file loaded at the start of this notebook
output = "anndata_tobias.h5ad"
# NOTE(review): "scanpro" looks like a leftover from another notebook — confirm the intended suffix
plot_suffix = "scanpro"

Created directory: ../adatas/
Created directory: ../figures/proportion_analysis/


In [5]:
# Load the anndata object produced by the previous notebook
adata = utils.adata.load_h5ad(last_notebook_adata)

# Use the canonical pandas option names ("display.max_rows" / "display.max_columns").
# The dotted spellings only resolved through pandas' regex matching of option names,
# which breaks as soon as a pattern matches more than one option.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)

NameError: name 'last_notebook_adata' is not defined

In [None]:
# The path to the ATAC experiment BAM file
# Must contain all cells
path_bam = ""

# The path to the FASTA file for the organism
path_TOBIAS_fasta = ""

# The path to the (uncompressed) GTF file for the organism
path_TOBIAS_gtf = ""

# The path to the blacklist file to use in the TOBIAS run (optional)
# If 'None', a mock blacklist file will be generated for the run
path_TOBIAS_blacklist = None

# The path to the motif file for the organism (JASPAR or MEME)
path_TOBIAS_motifs = ""

# If the ATAC modality already has a column with read tags matching the tags used in the bam file,
# give the name of the column here
# Must match "ATAC:<name of column in anndata.obs table>"
bam_barcodes = None

# If bam_barcodes is None, give the name of the column that contains the raw ATAC barcodes
# Must match "ATAC:<name of column in anndata.obs table>"
raw_barcodes_ATAC = ""

# Name of the organism from which the data stems
# options = ["mouse", "human", "zebrafish"]
TOBIAS_organism = ""

# Give the name of the TOBIAS config yaml file in the format of "<name of file>.yml"
# It cannot be 'config.yml'
TOBIAS_yml = ""

# Give the path to the TOBIAS snakemake repo directory
path_TOBIAS_snakemake = ""

In [None]:
# Prepare the TOBIAS run with the values configured by the user instead of
# hard-coded defaults, so the configuration actually takes effect.
# NOTE(review): `cluster_column`, `barcode_column` and `path_out` are not defined in the
# cells visible here — they must be set before running this cell.
prepare_tobias(adata,
               cluster_column,
               barcode_column,
               path_out,
               path_bam,
               barcode_tag='CB',
               path_TOBIAS_fasta=path_TOBIAS_fasta,
               path_TOBIAS_motifs=path_TOBIAS_motifs,
               path_TOBIAS_gtf=path_TOBIAS_gtf,
               path_TOBIAS_blacklist=path_TOBIAS_blacklist,
               TOBIAS_organism=TOBIAS_organism,
               TOBIAS_yml=TOBIAS_yml,
               plot_comparison=True,
               plot_correction=True,
               plot_venn=True,
               coverage=False,
               wilson=False,
               threads=4)