In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background color of the selected cells so they stand out.
# NOTE(review): `select` presumably lists the indices of the cells that need user
# input - confirm the indices still match after cells were added or removed.
bgcolor("PowderBlue", select=[2, 6, 9])

# File name of this notebook; handed to the version check below
nb_name = "01_assembling_anndata.ipynb"

# NOTE(review): presumably compares this notebook against the installed sctoolbox
# version - confirm. `_compare_version` is a private helper (leading underscore).
_compare_version(nb_name)

# 01 - Assembling or loading anndata object
<hr style="border:2px solid black"> </hr>

## 1 - Description
This notebook is dedicated to loading or creating an anndata object suitable for the subsequent analysis pipeline. The anndata object is prepared and finally stored as a `.h5ad` file. Depending on the available data files, there are multiple options to create the anndata object. To support all functionalities, especially the ATAC-related ones, the indices are prepared to hold the barcodes and feature coordinates.

### Available options:
#### 1. `.h5ad` file: 
Choose this option if you have one or more `.h5ad` file(s). The file could be provided by a preprocessing pipeline, a public dataset or a preceding analysis.

#### 2. Convert from R object:
This option should be used if the data was processed using R. This can either be a `.rds` or `.robj` file.
   
#### 3. `.mtx`, `barcodes.tsv`, [`regions.tsv`]:
Choose this option if you have the count matrix in `.mtx` format, a file containing the barcodes (`*_barcodes.tsv`) and an optional file containing the regions (`*_regions.tsv`). Use this option for cases with the aforementioned three files available e.g. from a public dataset.

___________

## 2 - Setup

In [None]:
# sctoolbox modules (`utils` is the alias used by all following cells)
import sctoolbox
import sctoolbox.utils as utils

# Load the sctoolbox settings from `config.yaml`.
# NOTE(review): key "01" presumably selects the section belonging to this notebook - confirm.
sctoolbox.settings.settings_from_config("config.yaml", key="01")

___

## 3 - Read in data
<hr style="border:2px solid black"> </hr>

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Set exactly ONE of the three paths below and leave the other two as "".

# Option 1: path to an existing .h5ad file.
# Multiple files can be given as a list or dict, e.g. {"rep_1": "1.h5ad", "rep_2": "2.h5ad"}.
path_h5ad = ""

# Option 2: path to an R object file (.rds or .robj).
path_rds = ""

# Option 3: directory that holds the .mtx, barcodes.tsv and (optionally) regions.tsv files.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
path_mtx = "/mnt/workspace_stud/stud4/stud4/atac_analysis/adatas/anndata_0"

----------------

In [None]:
# Exactly one input option may be selected. With several paths set, each loader
# cell below would run and silently overwrite `adata`; with none set, `adata`
# would never be defined. All three options are checked, including `path_mtx`.
# Truthiness is used because `path_h5ad` may also be a list or dict of files.
n_selected = sum(bool(path) for path in (path_h5ad, path_rds, path_mtx))

if n_selected != 1:
    # Remove the invalid settings so the following cells cannot run with them;
    # the input cell above has to be corrected and executed again first.
    del path_h5ad, path_mtx, path_rds
    raise ValueError("Please set only one of the above variables. Adjust the cell above and re-run.")

### 3.1 - Option 1: Read from h5ad

In [None]:
# Option 1: only executed when `path_h5ad` was filled in above (non-empty)
if path_h5ad:
    adata = utils.assemblers.from_h5ad(path_h5ad)

___

### 3.2 - Option 2: Convert from Seurat to anndata object

In [None]:
# Option 2: convert the R object to an anndata object.
# Only executed when `path_rds` was filled in above (non-empty).
if path_rds:
    adata = utils.assemblers.convertToAdata(file=path_rds)

____

### 3.3 - Option 3: .mtx, barcodes.tsv, [regions.tsv]

In [None]:
# File name patterns for the option 3 input files.
# Adjust them if any of your files follows a different naming scheme.

# file that contains the counts
mtx = "*matrix.mtx*"
# file that contains the barcode information
barcodes = "*barcodes.tsv*"
# optional file that contains the variable information
variables = "*regions.tsv*"

In [None]:
# Option 3: only executed when `path_mtx` was filled in above (non-empty).
# The file patterns come from the previous cell.
# NOTE(review): `var_error=False` presumably tolerates a missing variables file,
# since regions.tsv is optional - confirm against the sctoolbox documentation.
if path_mtx:
    adata = utils.assemblers.from_mtx(path_mtx, mtx=mtx, barcodes=barcodes, variables=variables, var_error=False)

____

## 4 - Prepare anndata
<hr style="border:2px solid black"> </hr>

Rename or remove observation (`.obs`) and variable (`.var`) columns as needed and format their indices. After this step the index of `.var` holds the feature coordinates and `.obs` the cell barcodes.

In [None]:
import pandas as pd

# Preview the observation and variable tables: truncate to 5 rows but show every
# column, so the column names needed for the next cell are all visible.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    for table in (adata.obs, adata.var):
        display(table)

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
## 1. Modify existing columns

# Names of the .obs columns to delete: the consecutively numbered columns "33" to "76".
drop_obs = [str(column_number) for column_number in range(33, 77)]

# Mapping of .obs column names to rename, given as "old_name": "New Name".
rename_obs = {
    "1": "TubeID",
    "2": "meta TubeID",
    "3": "meta pop export_psn",
    "4": "meta pop visit_1_date",
    "5": "meta pop visit_2_date",
    "6": "meta pop visit_3_date",
    "7": "meta pop visit_4_date",
    "8": "meta pop age",
    "9": "meta pop age_dec",
    "10": "meta pop sex",
    "11": "meta pop c19_severity",
    "12": "meta pop cohort",
    "13": "meta suep_hap export_psn",
    "14": "meta suep_hap baseline_date",
    "15": "meta suep_hap end_acute_date",
    "16": "meta suep_hap end_acute_visit_date",
    "17": "meta suep_hap m3_fu_date",
    "18": "meta suep_hap m6_fu_date",
    "19": "meta suep_hap m12_fu_date",
    "20": "meta suep_hap m24_fu_date",
    "21": "meta suep_hap age",
    "22": "meta suep_hap age_dec",
    "23": "meta suep_hap sex",
    "24": "meta suep_hap c19_severity",
    "25": "meta suep_hap cohort",
    "26": "meta export_psn",
    "27": "meta age",
    "28": "meta age_dec",
    "29": "meta sex",
    "30": "meta c19_severity",
    "31": "meta cohort",
    "32": "atac Tube ID",
    # columns "33" to "76" are removed via `drop_obs`
    "77": "atac export_psn",
    "78": "atac run_path",
    "79": "atac fastq1",
    "80": "atac fastq2",
    "81": "atac bam",
    "82": "atac bw",
    "83": "atac project",
    "84": "atac run",
    "85": "atac plate",
    "86": "atac organism",
    "87": "atac assembly",
    "88": "atac release",
    "89": "atac star parameters",
    "90": "atac reads processed",
    "91": "atac reads with exactly 1 alignment",
    "92": "atac reads with multiple alignments",
    "93": "atac reads with no alignment",
    "94": "atac reads aligned",
    "95": "atac % reads aligned",
    "96": "atac duplicate reads",
    "97": "atac % duplicate reads",
    "98": "atac mitochondrial reads",
    "99": "atac % mitochondrial reads",
    "100": "atac rrna subunit reads",
    "101": "atac % rrna subunit reads",
    "102": "atac pcr bottleneck coefficient",
    "103": "atac mean fragment size (only for paired-end)",
    "104": "atac reads used for further steps (after optional filters for multimap, duplicate, mitochondria, rrna)",
    "105": "atac # peaks raw",
    "106": "atac # peaks filtered",
    "107": "atac factor reads mapped (tpm)",
    "108": "atac factor reads mapped",
    "109": "atac factor reads in peaks",
    "110": "atac factor used for matrix",
    "111": "atac fraction of reads in peaks (frip)",
    "112": "atac usable sample mapped reads over 2000000 peaks > 1000",
    "113": "atac sample id original",
    "114": "atac plate id original",
}

# Names of the .var columns to delete (none for this dataset).
drop_var = []

# Mapping of .var column names to rename, given as "old_name": "New Name".
rename_var = {
    "1": "Peak chromosome",
    "2": "Peak Start",
    "3": "Peak stop",
    "4": "Gene chromosome",
    "5": "Gene start",
    "6": "Gene stop",
    "7": "Gene strand",
    "8": "Gene id",
    "9": "Gene name",
    "10": "Gene type",
    "11": "UniProt proteins",
    "12": "UniProt genes",
    "13": "UniProt accessions",
    "14": "UniProt names",
    "15": "UniProt Ensembl transcripts",
    "16": "UniProt Ensembl proteins",
    "17": "UniProt Ensembl gene ids",
    "18": "KEGG PATHWAY terms",
    "19": "KEGG PATHWAY ids",
    "20": "Gene Ontology terms",
    "21": "Gene Ontology ids",
    "22": "Distance of peakcenter to TSS",
    "23": "Fraction of peak overlapping genebody",
}

## 2. ATAC specific anndata properties

# The settings below control how the index and the coordinate columns are formatted.

# Column name(s) of adata.var that hold the peak location: either one column (str)
# or a list of three columns (['chr', 'start', 'end']).
# These are the names AFTER the renaming defined in `rename_var`.
coordinate_cols = ["Peak chromosome", "Peak Start", "Peak stop"]

# Whether the prefix should be removed when the index is formatted.
remove_var_index_prefix = True

# Name under which the original index is saved; with None it is overwritten.
keep_original_index = None

# Regular expression used to format the index.
coordinate_regex = r"chr[0-9XYM]+[\_\:\-]+[0-9]+[\_\:\-]+[0-9]+"

_________

### 4.1 - Rename and delete columns 

In [None]:
# Build the cleaned tables first. `drop` and `rename` each return a new frame, so
# `adata` stays untouched until both tables were created without an error.
# `errors="raise"` makes the rename fail if a listed column does not exist.
obs = adata.obs.drop(columns=drop_obs).rename(columns=rename_obs, errors="raise")
var = adata.var.drop(columns=drop_var).rename(columns=rename_var, errors="raise")

# Write the cleaned tables back to adata
adata.obs, adata.var = obs, var

___

### 4.2 - Format anndata indices
Set the cell barcode as `.obs` index and peak location as `.var` index.

In [None]:
# Format the anndata indices using the settings from the input cell of section 4.
# `path_h5ad` is the (possibly empty) option 1 path from section 3.
# NOTE(review): with option 2 or 3 `path_h5ad` is "" rather than None - confirm
# that `prepare_atac_anndata` treats an empty string as "no h5ad file".
adata = utils.assemblers.prepare_atac_anndata(adata,
                                              coordinate_cols=coordinate_cols,
                                              h5ad_path=path_h5ad,
                                              remove_var_index_prefix=remove_var_index_prefix,
                                              keep_original_index=keep_original_index,
                                              coordinate_regex=coordinate_regex)

________

## 5 - Saving the anndata object

In [None]:
# Show an overview of the prepared anndata object before it is saved
display(adata)

In [None]:
# Save the prepared anndata object as an .h5ad file.
# NOTE(review): only a file name is given; the output directory is presumably
# taken from the sctoolbox settings loaded in the setup cell - confirm.
adata_output = "anndata_1.h5ad"
utils.adata.save_h5ad(adata, adata_output)

In [None]:
# Close the sctoolbox log file as the final step of the notebook
sctoolbox.settings.close_logfile()