In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import anndata as ad
from scipy import sparse
from anndata import AnnData

## Load AnnData and reduce it to the bare minimum

In [2]:
# Load the merged dataset from its .h5ad file.
merged_h5ad_path = "/mnt/biocluster/projekte/Corona2020/ideas_leonie_pohl/tempDir/merged_data.h5ad"
adata = sc.read(merged_h5ad_path)

Needed obs columns:
assay_ontology_term_id    categorical with str categories. This MUST be an EFO term.    "EFO:0030080" ("EFO:0008722" for the Schiller dataset)
cell_type_ontology_term_id    categorical with str categories. This MUST be a CL term.    "CL_0000548"
development_stage_ontology_term_id     categorical with str categories. If unavailable, this MUST be "unknown".    "unknown"
disease_ontology_term_id    categorical with str categories. This MUST be a MONDO term or "PATO:0000461" for normal or healthy.    "MONDO:0002771"
donor_id    categorical with str categories. This MUST be free-text that identifies a unique individual that data were derived from.    =batch
is_primary_data    bool. This MUST be True if this is the canonical instance of this cellular observation and False if not.    True
organism_ontology_term_id    categorical with str categories. This MUST be a child of NCBITaxon:33208 for Metazoa.    NCBITaxon:10090
self_reported_ethnicity_ontology_term_id    For all non-human organisms this MUST be "na".    "na"
sex_ontology_term_id    categorical with str categories. This MUST be a child of PATO:0001894 for phenotypic sex or "unknown" if unavailable.    "unknown"
suspension_type    "cell" or "nucleus"    "cell"
tissue_ontology_term_id    categorical with str categories. This MUST be the UBERON or CL term    "UBERON:0002048"
assay    human-readable name assigned to the value of assay_ontology_term_id    "10x transcription profiling" or "Drop-seq"
cell_type    human readable name assigned to the cell_type_ontology_term_id    =coarse_harmonized_anno
development_stage    "unknown"
disease     human-readable name assigned to the value of disease_ontology_term_id    "pulmonary fibrosis"
organism    "Mus musculus"
self_reported_ethnicity    "na"
sex    "unknown"
tissue    "lung"

In [3]:
# Set X to the raw counts stored in the "raw_counts" layer.
# The target format asks for X as raw counts in sparse format, so a dense
# layer is converted to CSR. An already-sparse layer is copied so that X does
# not share memory with the layer (a later in-place edit of X would otherwise
# also change adata.layers["raw_counts"]).
raw_counts = adata.layers["raw_counts"]
if sparse.issparse(raw_counts):
    adata.X = raw_counts.copy()
else:
    adata.X = sparse.csr_matrix(raw_counts)

In [4]:
# obs columns that are removed before the CELLxGENE-style columns are added.
# Order is unchanged from the original single-line list; the grouping comments
# only reflect the column names.
unneededobs = [
    # annotation / doublet-related columns
    'author_annotation',
    'scDblFinder_score',
    'scDblFinder_class',
    'manual_celltype_annotation',
    'doublet_score',
    'predicted_doublet',
    'timepoint',
    'author_annotation_coarse',
    # count-based QC columns
    'n_genes_by_counts',
    'log1p_n_genes_by_counts',
    'total_counts',
    'log1p_total_counts',
    'pct_counts_in_top_20_genes',
    'total_counts_mt',
    'log1p_total_counts_mt',
    'pct_counts_mt',
    'total_counts_ribo',
    'log1p_total_counts_ribo',
    'pct_counts_ribo',
    'total_counts_hb',
    'log1p_total_counts_hb',
    'pct_counts_hb',
    # hierarchy label columns
    'low_hierarchy',
    'high_hierarchy',
    'low_hierarchy_fine',
    'high_hierarchy_fine',
    'low_hierarchy_coarse',
    'high_hierarchy_coarse',
]

In [5]:
# Remove every column listed in `unneededobs` from the cell metadata.
# Raises KeyError if any listed column is missing (e.g. when this cell is re-run).
adata.obs = adata.obs.drop(columns=unneededobs)

In [6]:
# Inspect the obs columns that remain after dropping `unneededobs`.
adata.obs

Unnamed: 0,batch,condition,dataset,fibrotic/control,harmonized_anno,coarse_harmonized_anno
AAACCTGAGGACATTA-1_xie,1_xie,untreated,xie,control,Myofibroblasts,Myofibroblasts
AAACCTGCAGTCGTGC-1_xie,1_xie,untreated,xie,control,Peribronchial fibroblasts,Fibroblasts
AAACCTGCATGAAGTA-1_xie,1_xie,untreated,xie,control,Alveolar macrophages,Alveolar macrophages
AAACCTGTCTCGCATC-1_xie,1_xie,untreated,xie,control,Peribronchial fibroblasts,Fibroblasts
AAACGGGTCCTAGAAC-1_xie,1_xie,untreated,xie,control,Peribronchial fibroblasts,Fibroblasts
...,...,...,...,...,...,...
29291_schiller,muc4657_schiller,untreated,schiller,control,Alveolar fibroblasts,Fibroblasts
29292_schiller,muc4657_schiller,untreated,schiller,control,AT1/2,Epithelial
29293_schiller,muc4657_schiller,untreated,schiller,control,Alveolar fibroblasts,Fibroblasts
29294_schiller,muc4657_schiller,untreated,schiller,control,AT1/2,Epithelial


## Set obs columns according to CELLxGENE schema

In [7]:
# assay_ontology_term_id: categorical with str categories; MUST be an EFO term.
# Rows of the 'schiller' dataset get "EFO:0008722"; all other rows get "EFO:0030080".
from_schiller = adata.obs['dataset'] == 'schiller'
for row_mask, efo_term in ((~from_schiller, "EFO:0030080"), (from_schiller, "EFO:0008722")):
    adata.obs.loc[row_mask, 'assay_ontology_term_id'] = efo_term

In [8]:
# assay: human-readable name for the value of assay_ontology_term_id.
# 'schiller' rows are labelled "Drop-seq"; everything else "10x transcription profiling".
schiller_rows = adata.obs['dataset'] == 'schiller'
adata.obs.loc[~schiller_rows, 'assay'] = "10x transcription profiling"
adata.obs.loc[schiller_rows, 'assay'] = "Drop-seq"

In [9]:
# cell_type_ontology_term_id: categorical with str categories; MUST be a CL term.
# The same CL term is assigned to every cell; the per-cell label is carried by
# `cell_type`, which is copied from coarse_harmonized_anno.
# The ID is written in the "PREFIX:ID" form ("CL:0000548") used by every other
# ontology term in this notebook; the previous "CL_0000548" (underscore) is not
# in that form.
adata.obs['cell_type_ontology_term_id'] = "CL:0000548"
# cell_type: human-readable name that accompanies cell_type_ontology_term_id.
adata.obs['cell_type'] = adata.obs["coarse_harmonized_anno"]

In [10]:
# development_stage_ontology_term_id: categorical with str categories;
# MUST be "unknown" when unavailable. The human-readable development_stage
# column receives the same placeholder.
for stage_column in ('development_stage_ontology_term_id', 'development_stage'):
    adata.obs[stage_column] = "unknown"

In [11]:
# disease_ontology_term_id: categorical with str categories; MUST be a MONDO
# term, or "PATO:0000461" for normal or healthy.
# disease: human-readable name for the value of disease_ontology_term_id.
# NOTE(review): every cell gets the fibrosis term, including rows whose
# 'fibrotic/control' value is 'control' — confirm whether those should be
# "PATO:0000461" instead.
disease_fields = {
    'disease_ontology_term_id': "MONDO:0002771",
    'disease': "pulmonary fibrosis",
}
for disease_column, disease_value in disease_fields.items():
    adata.obs[disease_column] = disease_value

In [12]:
# donor_id: categorical with str categories; free text identifying the unique
# individual the data were derived from. Here it is a copy of the 'batch' column.
adata.obs = adata.obs.assign(donor_id=adata.obs["batch"])

In [13]:
# is_primary_data: bool; True when this is the canonical instance of the
# cellular observation. All cells are marked True.
adata.obs = adata.obs.assign(is_primary_data=True)

In [14]:
# organism_ontology_term_id: categorical with str categories; MUST be a child
# of NCBITaxon:33208 (Metazoa). organism holds the matching readable name.
adata.obs = adata.obs.assign(
    organism_ontology_term_id="NCBITaxon:10090",
    organism="Mus musculus",
)

In [15]:
# self_reported_ethnicity_ontology_term_id / self_reported_ethnicity:
# both are set to "na" for every cell.
for ethnicity_column in ('self_reported_ethnicity_ontology_term_id', 'self_reported_ethnicity'):
    adata.obs[ethnicity_column] = "na"

In [16]:
# sex_ontology_term_id: categorical with str categories; a child of
# PATO:0001894 (phenotypic sex) or "unknown" if unavailable.
# Both the term and the readable `sex` column are "unknown" here.
adata.obs = adata.obs.assign(
    sex_ontology_term_id="unknown",
    sex="unknown",
)

In [17]:
# suspension_type: "cell" or "nucleus"; every observation here is "cell".
adata.obs = adata.obs.assign(suspension_type="cell")

In [18]:
# tissue_ontology_term_id: categorical with str categories; MUST be an UBERON
# or CL term. tissue holds the matching readable name.
tissue_fields = {
    'tissue_ontology_term_id': "UBERON:0002048",
    'tissue': "lung",
}
for tissue_column, tissue_value in tissue_fields.items():
    adata.obs[tissue_column] = tissue_value

## Make output look like this:
.X raw counts, sparse format

.uns['meta'] metadata from TSV file

.obs columns from CELLxGENE schema 3.0.0 and a subset of information in .uns['meta'] from EXTRA_COLUMNS from scripts/utils.py

.obs['dataset'], .uns['dataset'] name of task/dataset

.obs['organ'], .uns['organ'] organ

.obs['donor'] donor ID

.obs['sample'] sample ID (inferred from input TSV)

.obs['barcode'] cell barcodes as declared in index

.obs['author_annotation'] author annotation under the author_annotation column of the input TSV

.var gene information as specified in CELLxGENE schema 3.0.0  -> we didn't do this; possible source of errors

.obs.index unique cell identifiers e.g. dataset + numerical index

In [19]:
# Extra obs columns expected by the target output format, each derived from an
# existing column (or from the index for the barcode):
#   organ             <- tissue
#   donor, sample     <- donor_id
#   barcode           <- obs index
#   author_annotation <- coarse_harmonized_anno
adata.obs = adata.obs.assign(
    organ=lambda obs: obs["tissue"],
    donor=lambda obs: obs["donor_id"],
    sample=lambda obs: obs["donor_id"],
    barcode=lambda obs: obs.index,
    author_annotation=lambda obs: obs['coarse_harmonized_anno'],
)

In [22]:
# Possibly still TODO (note translated from German).
# NOTE(review): this cell previously contained the lines kept below. The
# indented bullet list is not valid Python, and 'counts' is not among the var
# columns listed in this cell's saved output, so the lines are preserved as a
# comment only — confirm what was meant to be added.
#   adata.var["counts"]
#   - counts
#       - normalize
#       - highly_variable_genes
# Show the AnnData summary (the saved output of this cell is such a summary).
adata

AnnData object with n_obs × n_vars = 93103 × 32317
    obs: 'batch', 'condition', 'dataset', 'fibrotic/control', 'harmonized_anno', 'coarse_harmonized_anno', 'assay_ontology_term_id', 'assay', 'cell_type_ontology_term_id', 'cell_type', 'development_stage_ontology_term_id', 'development_stage', 'disease_ontology_term_id', 'disease', 'donor_id', 'is_primary_data', 'organism_ontology_term_id', 'organism', 'self_reported_ethnicity_ontology_term_id', 'self_reported_ethnicity', 'sex_ontology_term_id', 'sex', 'suspension_type', 'tissue_ontology_term_id', 'tissue', 'organ', 'donor', 'sample', 'barcode', 'author_annotation'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'batch_colors', 'dataset_colors', 'high_hierarchy_colors', 'hvg', 'log1p', 'low_hierarchy_colors'

In [20]:
# Save the reformatted object to disk.
integration_ready_path = "/mnt/biocluster/projekte/Corona2020/ideas_leonie_pohl/tempDir/integration_ready_data.h5ad"
adata.write(integration_ready_path)

In [3]:
# Reload the file written above so the following cells work from the saved state.
integration_ready_path = "/mnt/biocluster/projekte/Corona2020/ideas_leonie_pohl/tempDir/integration_ready_data.h5ad"
adata = sc.read(integration_ready_path)

In [15]:
# Record preprocessing settings in .uns.
# NOTE(review): presumably "hvg" is a highly-variable-gene count and "scaled"
# flags scaled data — confirm against the consumer of this file.
preprocessing = dict(hvg=2007, scaled=True)
adata.uns['preprocessing'] = preprocessing

In [20]:
# Expose the existing 'log1p_norm' layer under the additional name 'normcounts'.
log1p_norm_layer = adata.layers['log1p_norm']
adata.layers['normcounts'] = log1p_norm_layer

In [23]:
# Store the dataset/task name in .uns['dataset'].
adata.uns.update({'dataset': "integration_ready_data"})

In [24]:
# Overwrite the saved file with the object that now carries the extra
# .uns entries and the 'normcounts' layer.
integration_ready_path = "/mnt/biocluster/projekte/Corona2020/ideas_leonie_pohl/tempDir/integration_ready_data.h5ad"
adata.write(integration_ready_path)