In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Change the background color of selected input cells to "PowderBlue".
# NOTE(review): `select=[2, 5]` presumably picks the input cells by position -
# confirm against `sctoolbox.utils.jupyter.bgcolor` whenever cells are added or removed.
bgcolor("PowderBlue", select=[2, 5])

# File name of this notebook; passed to the version check below.
nb_name = "prepare_for_cellxgene.ipynb"

# NOTE(review): presumably checks this notebook against the installed sctoolbox
# version - confirm against `sctoolbox.utils.jupyter._compare_version`.
_compare_version(nb_name)

# Preparing adata for cellxgene / MaMPlan creation
<hr style="border:2px solid black"> </hr>

## 1 - Description
### 1.1 - Preparing for cellxgene
This Notebook prepares the anndata object for cellxgene.
This preparation includes:
 - Removing unnecessary data to keep the resulting h5ad file as small as possible
 - Renaming columns for a nicer presentation in cellxgene
 - Converting unsupported datatypes to supported datatypes
 - Additional fixes for bugs between scanpy, anndata and cellxgene
   
### 1.2 - MaMPlan creation
Additionally, a MaMPlan can be created which is needed to deploy the dataset to the BCU repository using mampok or the BCU repository overlay.  
A MaMPlan acts as the config file for each specific dataset. It holds a variety of different parameters needed by mampok and the BCU repository.  
To simplify the creation process, only the important parameters can be set. The other parameters receive an (often required) default value.

See the [MaMpok wiki](https://gitlab.gwdg.de/loosolab/software/mampok/-/wikis/Getting-Started/MaMPlan_keys) for more detailed information about each parameter.

#### 1.2.1 - Parameters
| Parameter | Description | Options | |
|:---:|:---|:---|:---|
| project_id | Project ID, e.g. 'ext123', 'dst123' | str | Required |
| datatype | Technique used, e.g. 'sc RNA-seq', 'sc ATAC-seq' | [Options](https://gitlab.gwdg.de/loosolab/software/metadata_whitelists/-/blob/main/whitelists/technique?ref_type=heads) or list of [Options](https://gitlab.gwdg.de/loosolab/software/metadata_whitelists/-/blob/main/whitelists/technique?ref_type=heads) | Required |
| tool | Select the cellxgene docker container. | 'cxg' for cellxgene, 'cxgv' for cellxgene VIP. Other [Options](https://gitlab.gwdg.de/loosolab/software/metadata_whitelists/-/blob/main/whitelists/abbrev/tool?ref_type=heads) | Required |
| cluster | Select the kubernetes cluster. (Deprecated clusters: 'GI', 'GWDG') | 'BN', 'BN_public' | Required |
| owner | Owner / Responsible person of the dataset. Set to public if public dataset.| LDAP user ID or public | Required |
| organization | Select organizations related to the project.<br> Every user in one of the organizations will be able to access the dataset via the BCU repository. | [Options](https://gitlab.gwdg.de/loosolab/software/metadata_whitelists/-/blob/main/whitelists/department?ref_type=heads) | Required |
| check_online | If True, validate certain parameters using an online database. | bool | Required |
| auth | Controls authorization requirement. Access the deployment through the BCU website (True) or via direct URL (False). Only set to False in special cases, e.g. when the link is mentioned in a publication. | bool | Required |
| label | Set label shown in the browser tab. | str | Optional |
| user | List of users that, in addition to the organization, get access to the dataset via the BCU repository. | List of LDAP user IDs | Optional |
| analyst | Analyst of the dataset. If None, analyst is set as current user. | List of LDAP user ID; LDAP user ID or None |  Optional |
| pubmedid | Pubmed ID of public datasets. | Pubmed user ID | Optional |
| citation | Citation of public dataset. Will be set automatically if None and if pubmed ID is given.| str or None for automated citation| Optional |
| cpu_limit | Set the limit of cpu cores that can be used by the deployment. | int | Optional |
| mem_limit | Set the limit (in GB) of memory that can be used by the deployment. | int | Optional |
| cpu_request | Set the requested amount of cpu cores that can be used by the deployment. | int | Optional |
| mem_request | Set the requested amount (in GB) of memory that can be used by the deployment. | int | Optional |
| email | Checking the pubmed ID might require a valid E-mail to be set. | str | Optional |

___

## 2 - Setup

In [None]:
from sctoolbox import settings
import sctoolbox.utils as utils
from packaging import version
import pandas as pd

___

## 3 - General Input

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# sctoolbox settings
# Input/output directories, log file and the h5ad file written by the previous notebook.
settings.adata_input_dir = "../adatas/"
settings.adata_output_dir = "../adatas/cellxgene/"
settings.log_file = "../logs/prepare_for_cellxgene_log.txt"
last_notebook_adata = "anndata_4.h5ad"

# MaMPlan options
# If True, certain parameters are validated using an online database.
check_online = True

## Project options
project_id = "Test-ID"  # e.g. 'ext123', 'dst123'
datatype = "sc RNA-seq"  # technique used, e.g. 'sc RNA-seq', 'sc ATAC-seq'
tool = "cxg"  # 'cxg' for cellxgene, 'cxgv' for cellxgene VIP
cluster = "BN"  # kubernetes cluster: 'BN' or 'BN_public'
organization = ["AG-nerds"]  # organizations whose users may access the dataset
label = None  # label shown in the browser tab (optional)
user = None  # additional LDAP user IDs with access (optional)
owner = "Test-owner"  # LDAP user ID or public
analyst = None  # None: the current user is used as analyst

## Options for public datasets
pubmedid = None
citation = None  # None: set automatically if a pubmed ID is given
email = None  # checking the pubmed ID might require a valid e-mail

## Options for computational resource management

### Limit
cpu_limit = None  # cpu cores
mem_limit = None  # GB
### Requested
cpu_request = None  # cpu cores
mem_request = None  # GB

auth = True  # Only set to False in special cases!

# NOTE(review): `mamplan_filename` is not referenced by any other visible cell of
# this notebook - confirm whether it should be used when saving the MaMPlan.
mamplan_filename = f"{project_id}_MaMPlan.yaml"

---------

## 4 - Load anndata

In [None]:
# Load the AnnData object written by the previous notebook.
# NOTE(review): the file name is presumably resolved relative to
# `settings.adata_input_dir` - confirm against `utils.adata.load_h5ad`.
adata = utils.adata.load_h5ad(last_notebook_adata)
display(adata)  # show the loaded object in the notebook output

___

## 5 - Prepare adata for cellxgene
<hr style="border:2px solid black"> </hr>

The cellxgene preparation removes all data from the anndata object that is not required for the cellxgene deployment.  
This saves memory on the cluster and decreases runtime.

In addition, every invalid or problematic datatype is checked for and cast to a fitting datatype if possible.

**Note: Keep in mind that the resulting adata object should not be used for further analysis.**

In [None]:
# Preview the AnnData object and its obs/var tables before the clean-up,
# truncated to 5 rows while showing all columns.
# The canonical pandas option names are used: abbreviated or pattern-style
# names are only resolved through regex matching and can become ambiguous
# when pandas adds new options.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Keep columns in adata.obs (Cell metadata)
# Set to None to keep all obs columns
# An empty list removes all columns
keep_obs = [
    "sample",
    "batch",
    "celltype",
    "pct_counts_is_mito",
    "pct_counts_is_ribo",
    "phase",
    "clustering",
    "SCSA_pred_celltype",
    "marker_pred_celltype",
]

# Rename columns in adata.obs
# Keys are the original column names, values are the labels shown in cellxgene
# Set to None to keep all original names
rename_obs = {
    "sample": "Sample",
    "batch": "Batch",
    "celltype": "Celltype",
    "pct_counts_is_mito": "Mitochondrial content (%)",
    "pct_counts_is_ribo": "Ribosomal content (%)",
    "phase": "Phase",
    "clustering": "Final Clustering",
    "SCSA_pred_celltype": "Predicted Celltype (SCSA)",
    "marker_pred_celltype": "Predicted Celltype (Marker)",
}

add_leiden = True  # Add leiden columns to keep_obs. Is only considered if keep_obs is not None

# Keep columns in adata.var (Gene metadata)
# An empty list removes all columns
keep_var = []
# Rename columns in adata.var (original name -> new name)
rename_var = {}

___

### 5.1 - Add leiden columns

In [None]:
# Optionally keep every obs column whose name starts with "leiden".
if add_leiden and keep_obs is not None:
    leiden_cols = [col for col in adata.obs.columns if col.startswith("leiden")]

    # Only append columns that are not listed yet, so re-running this cell
    # does not add duplicate entries to keep_obs.
    keep_obs += [col for col in leiden_cols if col not in keep_obs]

    # rename_obs may be None ("keep all original names"); in that case the
    # leiden columns keep their original names as well instead of raising
    # a TypeError on the dict merge.
    if rename_obs is not None:
        # e.g. "leiden_0.5" -> "Leiden 0.5"
        rename_obs.update({c: c.replace("_", " ").capitalize() for c in leiden_cols})

___

### 5.2 - Clean up adata

In [None]:
# Run the cellxgene preparation with the obs/var selection and renaming chosen above.
# NOTE(review): with inplace=True the adata object is presumably modified directly,
# which is why no return value is captured - confirm against
# `utils.adata.prepare_for_cellxgene`.
utils.adata.prepare_for_cellxgene(
    adata,
    keep_obs=keep_obs,
    keep_var=keep_var,
    rename_obs=rename_obs,
    rename_var=rename_var,
    inplace=True
)

In [None]:
# Preview the AnnData object and its obs/var tables after the clean-up,
# truncated to 5 rows while showing all columns.
# The canonical pandas option names are used: abbreviated or pattern-style
# names are only resolved through regex matching and can become ambiguous
# when pandas adds new options.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(adata)
    display(adata.obs)
    display(adata.var)

___

### 5.3 - Save adata

In [None]:
# Saving the data
adata_output = f"{project_id}_cellxgene.h5ad"
utils.adata.save_h5ad(adata, adata_output)

___

## 6 - Write MaMPlan

In [None]:
# Oldest mampok release accepted by this notebook.
min_mampok_version = "3.0.5"

# Import mampok; a missing installation is reported with an install hint while
# keeping the original import error as the cause.
try:
    import mampok
    import mampok.mamplan_creator as mc
except ModuleNotFoundError as err:
    raise ModuleNotFoundError("Please install the latest mampok version.") from err

# An outdated installation is reported with the same exception type as a
# missing one, but the message states the installed and the required version.
if version.parse(mampok.__version__) < version.parse(min_mampok_version):
    raise ModuleNotFoundError(
        "Please install the latest mampok version. "
        f"Found mampok {mampok.__version__}, but at least {min_mampok_version} is required."
    )

In [None]:
# Build the MaMPlan from the options set in the general input cell.
# If no analyst is given, the current user is used instead.
mamplan = mc.SimpleMamplan(
    exp_id=project_id,
    files=adata_output,
    tool=tool,
    analyst=analyst or utils.general.get_user(),
    datatype=datatype,
    cluster=cluster,
    label=label,
    organization=organization,
    user=user,
    owner=owner,
    pubmedid=pubmedid,
    citation=citation,
    cpu_limit=cpu_limit,
    mem_limit=mem_limit,
    cpu_request=cpu_request,
    mem_request=mem_request,
    check_online=check_online,
    auth=auth,
    email=email,
)

In [None]:
# Write the MaMPlan next to the cellxgene h5ad file, using the project ID as target name.
# NOTE(review): `mamplan_filename` from the general input cell is not used here -
# confirm against `SimpleMamplan.save` whether the argument is a directory, a
# prefix or a full file path.
mamplan.save(f"{settings.adata_output_dir}/{project_id}")