# Processing scATAC Data

## Load Data

### Load packages

In [1]:
import os
import numpy as np
import scanpy as sc
import pandas as pd
import anndata as ad
import episcanpy.api as epi
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix

# modules from gff_analyser/WP2
import gff_analyser.gffBuilder as gff
import gff_analyser.atacFeatureGraphs as graph
from pybedtools import BedTool
# modules from sctoolbox
from sctoolbox import annotation
import sctoolbox.atac as atac
import sctoolbox.calc_overlap_pct as overlap
from sctoolbox import celltype_annotation

ModuleNotFoundError: No module named 'gff_analyser.atacFeatureGraphs'

### Specify required paths

In [None]:
# Input/output locations used throughout the notebook; adjust before running.
GTF_PATH = '/mnt/workspace_stud/stud3/homo_sapiens.104.mainChr.gtf'   # GTF file used for peak annotation
INPUT_PATH = '/mnt/workspace_stud/stud3/h5ad_objects/'  # directory where the h5ad object is stored
OUTPUT_PATH = '/mnt/workspace_stud/stud3/compare_presentation/liver/'  # directory where the processed adata file can be saved
FRAG_FILE = '/mnt/workspace_stud/catlas_ref/frag/liver_SM-A8WNZ_rep1_fragments.bed.gz' # path to the fragments file
# NOTE(review): the fragments file and output directory are named "liver" while
# the h5ad file is named "colon" - confirm they belong to the same sample.
H5AD = 'colon_final.h5ad'  # name of the h5ad file
enhancer_bed = '/mnt/workspace_stud/stud3/ENCFF212UAV.bed'  # passed as enhancer_bed to gff.generate_feature_files
blacklisted_bed = '/mnt/workspace_stud/stud3/ENCFF356LFX.bed'  # passed as blacklisted_bed to gff.generate_feature_files

### Sort the fragment file

In [None]:
# Sort the fragments file; the result is passed on as the `fragments` input
# when the feature files are generated further down.
FRAG_FILE_sorted = gff.sort_bed(
    frag_file=FRAG_FILE,
    OUTPUT_PATH=OUTPUT_PATH,
)

## Read and customize data

### Read Anndata object

In [None]:
# H5AD is only the file name; it lives in INPUT_PATH, so build the full path
# instead of relying on the current working directory.
adata = epi.read_h5ad(os.path.join(INPUT_PATH, H5AD))

### inspect adata.var

In [None]:
# Show the feature (peak) table, as announced by the heading;
# adata.obs is inspected in its own cell after the barcodes are renamed.
adata.var

## Save the h5ad now or later

In [None]:
# NOTE(review): the empty string is a placeholder - fill in the target file path before running.
adata.write_h5ad('') # Target save path

#### Adjust peak names
* If the feature names are not in the format chr_start_end, reformat them to avoid issues in downstream analysis.

In [None]:
# Bring every peak name into the form chr_start_end:
# unify ':' and '-' separators to '_', then keep the first field and the last
# two fields (anything in between is dropped).
names = []
for peak in adata.var_names:
    fields = peak.replace(':', '_').replace('-', '_').split('_')
    names.append('_'.join([fields[0], fields[-2], fields[-1]]))

adata.var_names = pd.Index(names)

In [None]:
# Keep only the part after the last '+' of each cell name,
# i.e. strip a sample prefix such as 'colon_sigmoid_SM-JF1O8_1+'.
names = [cell_name.split('+')[-1] for cell_name in adata.obs_names]

adata.obs_names = pd.Index(names, name='index')

### inspect adata.obs

In [None]:
# Show the per-cell table (after the obs names were rewritten above).
adata.obs

### Calculate mean insertsize

In [None]:
# Add insert size information to adata, read from the (unsorted) fragments file.
atac.add_insertsize(adata, fragments=FRAG_FILE)

In [None]:
# Plot the insert size information stored on adata
atac.plot_insertsize(adata)

## Generate Feature.gtf Files
##### Only if not already done before

In [None]:
# Build the per-feature files in OUTPUT_PATH from the GTF, the sorted
# fragments and the enhancer / blacklist BED files.
gff.generate_feature_files(
    adata=adata,
    gtf_file=[GTF_PATH],
    fragments=FRAG_FILE_sorted,
    enhancer_bed=enhancer_bed,
    blacklisted_bed=blacklisted_bed,
    threshold=10,
    promoter_distance=2000,
    tss_distance=100,
    out=OUTPUT_PATH,
)

### Promoter enrichment
* To speed up calculation, use fragments file instead of a bam file and set bam_file=None
* If cell barcodes are not in index, specify column name using parameter cb_col
* Specify species: [homo_sapiens, mus_musculus, danio_rerio,...]

In [None]:
# Compute the fragment overlap with the feature files in OUTPUT_PATH.
# Uses the sorted fragments file created by gff.sort_bed (FRAG_FILE_sorted);
# bam_file=None because the fragments file is used instead of a BAM file.
overlap.pct_fragments_in_features(
    adata,
    input_dir=OUTPUT_PATH,
    fragments_file=FRAG_FILE_sorted,
    bam_file=None,
    cb_col=None,
    species=None,
)

# Visualising the Features Data

#### Filter per Feature
* If too many graphs are displayed, filter by feature for a quick look. Graphs are saved even if they are not displayed well in the notebook.
* Fill the list with the features of interest
* Use celltype_of_interest to look only at the specific cell types listed inside the list

In [None]:
# Placeholders: fill in the features / cell types of interest.
filtered = ['']            # passed to the plotting helpers as `filtered` / `feature_filtered`
celltype_filtered = ['']   # passed to graph.scatter_plots as `celltype_filtered`

In [None]:
celltype_of_interest = ['']
# Enter the column name where cell types are saved
in_selection = adata.obs['CELLTYPE_COLUMN'].isin(celltype_of_interest)
# Subset of adata restricted to the selected cell types
adata_tmp = adata[in_selection]

#### Display the Violin Plot
* Group by cell type or other features for comparison


In [None]:
# Violin plots for the cell-type subset, grouped by the 'cell type' column
graph.violin_plots(
    adata_tmp,
    output=OUTPUT_PATH,
    group='cell type',
    filtered=filtered,
    multi_panel=False,
)

#### Scatter Matrix
* filter for single cell type possible

In [None]:
# Scatter matrix restricted to the selected features and cell types
graph.scatter_plots(
    adata,
    output=OUTPUT_PATH,
    feature_filtered=filtered,
    celltype_filtered=celltype_filtered,
    figsize=[10, 3],
)

#### Scatter all graphs
* If display_all is set to True, the list given to feature is used for the y-axis. Otherwise it is overwritten internally by the overall feature list.

In [None]:
# Scatter plots for the listed features
graph.simple_scatter(
    adata,
    output=OUTPUT_PATH,
    feature=['n_fragments_in_CDS', 'n_total_fragments'],
    display_all=False,
)

#### Multiplots. Violin and Scatter
* for a comparison of two features for the group of interest.

In [None]:
# Compare two features (violin and scatter) for the selected cell type
graph.multi_plot(
    adata,
    feature1='pct_fragments_in_enhancer',
    feature2='pct_fragments_in_promotor2000',
    group='cell type',
    out=OUTPUT_PATH,
    celltype_filtered=['Memory B Cell'],
    multi_panel=False,
)

## QC

### Remove chrM

In [None]:
# Collect every feature name that does not start with 'chrM' and subset to those
non_m = []
for feature_name in adata.var_names:
    if feature_name.startswith('chrM'):
        continue
    non_m.append(feature_name)
adata = adata[:, non_m]
display(adata)

### Optional: Remove chrX and chrY

In [None]:
# Collect every feature name that starts with neither 'chrY' nor 'chrX'
sex_chromosomes = ('chrY', 'chrX')
non_xy = [
    feature_name
    for feature_name in adata.var_names
    if not feature_name.startswith(sex_chromosomes)
]
adata = adata[:, non_xy]
display(adata)

### Remove cells without features or empty features

In [None]:
# drop cells that have no open feature at all
epi.pp.filter_cells(adata, min_features=1)
# drop features that are not open in any cell
epi.pp.filter_features(adata, min_cells=1)

# log10 of the number of features per cell, computed in one vectorized call
adata.obs['log_nb_features'] = np.log10(adata.obs['nb_features'].to_numpy())
adata

### Binarize matrix and save different layers

In [None]:
# Store the current state in .raw before X is binarized in the next cell.
adata.raw = adata

In [None]:
# Binarize adata (the return value is not used) ...
epi.pp.binarize(adata)
# ... and keep a copy of the resulting X in the 'binary' layer.
adata.layers['binary'] = adata.X.copy()

## Filter Data

### Promoter enrichment

In [None]:
# violin plot of the promoter column over all cells (no grouping)
sc.pl.violin(
    adata,
    keys=['pct_fragments_in_promotor2000'],
    groupby=None,
    rotation=90,
)

In [None]:
# keep only cells whose promoter column value is above 0.2
promoter_ok = adata.obs['pct_fragments_in_promotor2000'] > 0.2
adata = adata[promoter_ok]

### Visualize feature distribution (Histogram)

In [None]:
# show open features per cell, once on a linear and once on a log10 scale
# (the second call used to be an exact duplicate of the first; the post-filter
# summary cell further down uses the same linear/log10 pair)
min_features = 100

epi.pp.coverage_cells(adata, binary=True, log=False, bins=50,
               threshold=min_features)
epi.pp.coverage_cells(adata, binary=True, log=10, bins=50,
               threshold=min_features)

### Visualize feature distribution (Violin)

In [None]:
# one violin plot per column: raw and log10 feature counts
for obs_key in ('nb_features', 'log_nb_features'):
    epi.pl.violin(adata, [obs_key])

### Filter cells

In [None]:
# Filter cells by their number of features: lower bound 150, upper bound 10000.
# NOTE(review): the bounds are literals; the histogram cell above draws its
# threshold at min_features = 100 - confirm that 150 is the intended cutoff.
epi.pp.filter_cells(adata, min_features=150)
epi.pp.filter_cells(adata, max_features=10000)

### Visualize distribution of cells sharing a feature

In [None]:
# show numbers of cells sharing features, once on a linear and once on a log10 scale
# (the second call used to be an exact duplicate of the first; the post-filter
# summary cell further down uses the same linear/log10 pair)
min_cells = 10

epi.pp.coverage_features(adata, binary=True, log=False, bins=50,
               threshold=min_cells)
epi.pp.coverage_features(adata, binary=True, log=10, bins=50,
               threshold=min_cells)

### Filter features

In [None]:
# Filter features by the number of cells they appear in: lower bound 10, upper bound 200.
# NOTE(review): the bounds are literals and independent of the min_cells variable set above.
epi.pp.filter_features(adata, min_cells=10)
epi.pp.filter_features(adata, max_cells=200)

### Visualize feature distribution after filtering

In [None]:
# violin plots of the raw and log10 feature counts after filtering
for obs_key in ('nb_features', 'log_nb_features'):
    epi.pl.violin(adata, [obs_key])

In [None]:
# open features per cell: linear scale first, then log10
min_features = 100

for log_setting in (False, 10):
    epi.pp.coverage_cells(adata, binary=True, log=log_setting, bins=50,
                          threshold=min_features)

# number of cells sharing a feature: linear scale first, then log10
min_cells = 10

for log_setting in (False, 10):
    epi.pp.coverage_features(adata, binary=True, log=log_setting, bins=50,
                             threshold=min_cells)

# calculate variability score
epi.pp.cal_var(adata)

### Normalize remaining data

In [None]:
# Normalize adata (return value not used) and keep a copy of the
# resulting X in the 'normalised' layer before the log transform.
sc.pp.normalize_total(adata)
adata.layers['normalised'] = adata.X.copy()

# log-normalize
epi.pp.log1p(adata)

## Dimension reduction and clustering

### Calculate PCA and neighbors

In [None]:
# Compute the PCA first: the variance-ratio plot below reads the stored PCA results
sc.pp.pca(adata)

# Plot PCA variance ratio for selection of PCs
sc.pl.pca_variance_ratio(adata, n_pcs = 30)

# Build the neighborhood graph that the UMAP and leiden steps below rely on
sc.pp.neighbors(adata)

### Calculate UMAP

In [None]:
# Compute the UMAP embedding with min_dist=0.1 and spread=2.0
sc.tl.umap(adata, min_dist=0.1, spread=2.0)

In [None]:
# UMAP of all cells, colored by the number of features per cell
sc.pl.umap(
    adata,
    color='nb_features',
    legend_loc='right margin',
)

### Cluster with leiden algorithm and show UMAP

In [None]:
# Cluster the cells with leiden (resolution 0.2, edge weights not used)
sc.tl.leiden(adata, resolution=0.2, use_weights=False)

# Show the clusters on the UMAP
sc.pl.umap(adata, color=['leiden'])

## Peaks Annotation

### UROPA

In [None]:
# Single query: 'gene' features, distance [5000, 5000], anchored at the feature start
gene_query = {
    "feature": 'gene',
    "distance": [5000, 5000],
    "feature_anchor": "start",
}
custom_config = {
    "queries": [gene_query],
    "priority": True,
    "show_attributes": "all",
}

In [None]:
# Annotate the peaks in adata against the GTF using the custom config (in place)
annotation.annotate_adata(
    adata,
    gtf=GTF_PATH,
    config=custom_config,
    best=True,
    threads=3,
    coordinate_cols=None,
    temp_dir="",
    remove_temp=True,
    verbose=True,
    inplace=True,
)

#### filter unassigned peaks in uropa

In [None]:
# keep only the peaks that have a gene name assigned
assigned_peaks = adata.var[adata.var['gene_name'].notnull()]
# copy so the subset is a standalone object rather than a view of adata:
# the following cells edit its .var in place and set its .raw
uropa_adata = adata[:, assigned_peaks.index].copy()
uropa_adata

### Replace peaks with gene names
Make new feature names unique and write them into raw

In [None]:
# Move the current var index (the peak names) into a regular column,
# then index var by the 'gene_id' column instead. Both edits are in place.
# NOTE(review): if uropa_adata is still a view of adata at this point,
# in-place edits of .var may not behave as intended - confirm.
uropa_adata.var.reset_index(inplace=True)
uropa_adata.var.set_index('gene_id', inplace=True)

In [None]:
# Cast the var index to object dtype
uropa_adata.var.index = uropa_adata.var.index.astype('object')

In [None]:
# Show the current .raw of the subset (it is overwritten in the next cell)
uropa_adata.raw

In [None]:
# Make duplicated var names unique, using '_' as the join string
uropa_adata.var_names_make_unique(join="_")
# Store a copy of the renamed object in .raw
uropa_adata.raw = uropa_adata.copy()
uropa_adata.var

### Rank genes

In [None]:
# Rank features per leiden cluster, using X rather than .raw (use_raw=False)
sc.tl.rank_genes_groups(uropa_adata, groupby='leiden', use_raw=False)

# Plot the ranking
sc.pl.rank_genes_groups(uropa_adata)

In [None]:
# Matrix plot of the top 10 ranked genes per group, scaled per variable
sc.pl.rank_genes_groups_matrixplot(uropa_adata, standard_scale='var', n_genes=10)

## Celltype annotation

In [None]:
# Run SCSA on the gene-named object; the next cell plots the 'SCSA_pred_celltype' column
celltype_annotation.run_scsa(uropa_adata, species='human')

### Visualize with UMAP

In [None]:
# UMAP of the annotated cells, colored by the 'SCSA_pred_celltype' column
sc.pl.umap(
    uropa_adata,
    color='SCSA_pred_celltype',
    title='Predicted Celltypes',
    legend_loc='right margin',
)

#### Generate lists for different features

In [None]:
# Split the obs column names by prefix: 'pct_' columns and 'n_' columns
pct_features = [column for column in adata.obs if column.startswith('pct_')]
n_features = [column for column in adata.obs if column.startswith('n_')]

### Display Plots

#### Violin Plots

In [None]:
# Violin comparison of the two features across the 'cell type' groups.
# NOTE(review): the bare name compare_feature_to_celltypes is neither defined nor
# imported in this notebook; the other plotting helpers are called through the
# `graph` module, so it is assumed to live there - confirm.
graph.compare_feature_to_celltypes(
    adata,
    ["pct_fragments_in_enhancer", "pct_fragments_in_promotor2000"],
    "cell type",
    max_size=1,
    name='Ylabel name',
    rotation=0,
)

#### Scatter Plots

In [None]:
# Scatter plot of the enhancer column (x) against the promoter column (y)
sc.pl.scatter(adata, x='n_fragments_in_enhancer', y='n_fragments_in_promotor2000')


#### Dimension Reduction

In [None]:
# NOTE(review): features_of_interest is not defined anywhere in the visible notebook -
# define it (or substitute the intended features) before running this cell.
graph.compare_dimensionreductions(adata, [features_of_interest], 'cell type')