In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background color of selected input cells to "PowderBlue".
# NOTE(review): `select` looks like a list of positional cell indices -- presumably it has to be
# adjusted whenever cells are added, removed or reordered; confirm against bgcolor's documentation.
bgcolor("PowderBlue", select=[2, 6, 9])

# File name of this notebook; passed to the version check below.
nb_name = "01_assembling_anndata.ipynb"

# NOTE(review): presumably checks this notebook's version against the installed sctoolbox -- confirm.
_compare_version(nb_name)

# 01 - Assembling or loading anndata object
<hr style="border:2px solid black"> </hr>

## 1 - Description
This notebook is dedicated to loading or creating an anndata object suitable for the subsequent analysis pipeline. The anndata object is prepared and finally stored as a `.h5ad` file. Based on the available data files there are multiple options to create the anndata object. To support all downstream functionality, especially the ATAC-related parts, the indices are prepared so that they hold the cell barcodes and the feature coordinates.

### Available options:
#### 1. `.h5ad` file: 
Choose this option if you have one or more `.h5ad` file(s). The file could be provided by a preprocessing pipeline, a public dataset or a preceding analysis.

#### 2. Convert from R object:
This option should be used if the data was processed using R. This can either be a `.rds` or `.robj` file.
   
#### 3. .mtx, barcodes.tsv, [regions.tsv]
Choose this option if you have the count matrix in `.mtx` format, a file containing the barcodes (`*_barcodes.tsv`) and an optional file containing the regions (`*_regions.tsv`). Use this option for cases with the aforementioned three files available e.g. from a public dataset.

___________

## 2 - Setup

In [None]:
# sctoolbox modules
import sctoolbox
import sctoolbox.utils as utils

# Load the sctoolbox settings from "config.yaml", using the entry stored under key "01".
sctoolbox.settings.settings_from_config("config.yaml", key="01")

___

## 3 - Read in data
<hr style="border:2px solid black"> </hr>

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Choose exactly one option: fill in one of the three variables below and
# leave the other two as empty strings ("").

# For option 1: The path to an existing .h5ad file.
# A list or dict for multiple files e.g. {"rep_1": "1.h5ad", "rep_2": "2.h5ad"}.
path_h5ad = "test_data/scatac_pbmc.h5ad"

# For option 2: The path to the R-related (.rds, .robj) file.
path_rds = ""

# For option 3: Directory containing .mtx, barcodes.tsv and optionally regions.tsv.
path_mtx = ""

----------------

In [None]:
# Exactly one of the three input options has to be configured.
# Truthiness is used instead of a comparison against "" so that None, an empty
# list or an empty dict count as "not set" -- the same test the loading cells
# below apply (`if path_h5ad:`, `if path_rds:`, `if path_mtx:`). Comparing
# against "" treated e.g. `path_rds = None` as a selected option and raised
# even though only one input was actually given.
n_selected = sum(bool(path) for path in (path_h5ad, path_rds, path_mtx))

if n_selected != 1:
    # Remove the variables so the input cell above has to be re-run before continuing.
    del path_h5ad, path_mtx, path_rds
    raise ValueError("Please set only one of the above variables. Adjust the cell above and re-run.")

### 3.1 - Option 1: Read from h5ad

In [None]:
# Option 1: read the .h5ad file(s) given in path_h5ad; skipped when path_h5ad is empty.
if path_h5ad:
    adata = utils.assemblers.from_h5ad(path_h5ad)

___

### 3.2 - Option 2: Convert from Seurat to anndata object

In [None]:
# Option 2: convert the R-related file (.rds, .robj) given in path_rds into an
# anndata object; skipped when path_rds is empty.
if path_rds:
    adata = utils.assemblers.convertToAdata(file=path_rds)

____

### 3.3 - Option 3: .mtx, barcodes.tsv, [regions.tsv]

In [None]:
# File name patterns used to find the option 3 input files inside path_mtx.
# Adjust them in case any of the input files follows a different naming scheme.
# NOTE(review): the '*' wildcards suggest glob-style matching -- confirm against from_mtx.

mtx = '*matrix.mtx*'  # pattern for the file that contains counts
barcodes = '*barcodes.tsv*'  # pattern for the file that contains barcode information
variables = '*regions.tsv*'  # pattern for the optional file that contains variable information

In [None]:
# Option 3: assemble the anndata object from the files found in path_mtx,
# located via the file name patterns defined in the previous cell.
# Skipped when path_mtx is empty.
if path_mtx:
    adata = utils.assemblers.from_mtx(
        path_mtx,
        mtx=mtx,
        barcodes=barcodes,
        variables=variables,
        # NOTE(review): presumably tolerates a missing variables file, since
        # regions.tsv is optional -- confirm against from_mtx.
        var_error=False,
    )

____

## 4 - Prepare anndata
<hr style="border:2px solid black"> </hr>

Rename or remove observation (`.obs`) and variable (`.var`) columns as needed and format their indices. After this step the index of `.var` holds the feature coordinates and `.obs` the cell barcodes.

In [None]:
import pandas as pd

# Preview the observation (.obs) and variable (.var) tables of the loaded object.
# Inside this context at most 5 rows are rendered while no column is hidden;
# the pandas display options are restored once the block is left.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata.obs, adata.var)

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
## 1. Modify existing columns

# .obs column names that should be deleted
drop_obs = []

# .obs column names that should be changed. E.g. "old_name": "New Name"
rename_obs = {}

# .var column names that should be deleted
drop_var = []

# .var column names that should be changed. E.g. "old_name": "New Name"
rename_var = {}


## 2. ATAC specific anndata properties
# The following settings are used to format the index and coordinate columns.
# They are passed on to utils.assemblers.prepare_atac_anndata in section 4.2.

# Column name(s) of adata.var containing peak location data.
# Either a single column (str) or a list of three columns (['chr', 'start', 'end']).
coordinate_cols = ['chr', 'start', 'end']

# Whether the prefix should be removed when formatting the index.
remove_var_index_prefix = True

# Name under which the original index is saved; if None, the original index is overwritten.
keep_original_index = None

# Regex used to format the index. It matches "chr" followed by one or more of
# the characters 0-9, X, Y, M, then one or more separators (_ : -), a number,
# separators again and a second number, e.g. "chr1:100-200" or "chrX_5_10".
coordinate_regex = r"chr[0-9XYM]+[\_\:\-]+[0-9]+[\_\:\-]+[0-9]+"

_________

### 4.1 - Rename and delete columns 

In [None]:
# change obs
obs = adata.obs.copy()

obs.drop(columns=drop_obs, inplace=True)
obs.rename(columns=rename_obs, errors='raise', inplace=True)

# change var
var = adata.var.copy()

var.drop(columns=drop_var, inplace=True)
var.rename(columns=rename_var, errors='raise', inplace=True)

# apply changes to adata
adata.obs = obs
adata.var = var

___

### 4.2 - Format anndata indices
Set the cell barcode as `.obs` index and peak location as `.var` index.

In [None]:
# Options for formatting the .obs/.var indices, taken from the input cell of
# section 4 (plus the .h5ad path from section 3, which is empty for options 2 and 3).
index_format_kwargs = {
    "coordinate_cols": coordinate_cols,
    "h5ad_path": path_h5ad,
    "remove_var_index_prefix": remove_var_index_prefix,
    "keep_original_index": keep_original_index,
    "coordinate_regex": coordinate_regex,
}

# Replace adata with the object returned by prepare_atac_anndata.
adata = utils.assemblers.prepare_atac_anndata(adata, **index_format_kwargs)

________

## 5 - Saving the anndata object

In [None]:
# Overview of loaded adata
display(adata)

In [None]:
# Saving the data
adata_output = "anndata_1.h5ad"
utils.adata.save_h5ad(adata, adata_output)

In [None]:
# Final step of the notebook: close the sctoolbox log file.
# NOTE(review): presumably the log file configured via config.yaml in the setup cell -- confirm.
sctoolbox.settings.close_logfile()