# TAPE Deconvolution

In [5]:
from benchmark_utils import (
    preprocess_scrna,
    create_uniform_pseudobulk_dataset,
    create_signature,
    add_cell_types_grouped,
)

from TAPE import Deconvolution
from TAPE.deconvolution import ScadenDeconvolution

import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc

In [6]:
from constants import (
    SIGNATURE_CHOICE,
    BENCHMARK_CELL_TYPE_GROUP,
    N_CELLS,
)

In [7]:
# Load the CTI single-cell dataset.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
adata = sc.read("/home/owkin/project/cti/cti_adata.h5ad")

# The return value is not captured, so `adata` is expected to be modified in
# place; the displayed object below has 3000 variables, matching keep_genes.
preprocess_scrna(adata, keep_genes=3000, log=False, batch_key="donor_id")

adata

AnnData object with n_obs × n_vars = 329762 × 3000
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_ontology_term_id_colors', 'default_embedding', 'schema_version', 'sex_ontology_term_id_colors', 'title', 'hvg'
    obsm: 'X_umap'
    layers: 'counts', 'relative_co

In [8]:
# %% add cell types groups and split train/test
# Annotate `adata` with the grouped cell types and get the train/test indices.
adata, train_test_index = add_cell_types_grouped(
    adata, BENCHMARK_CELL_TYPE_GROUP
)

# Slice the annotated object once per split, train first and test second.
adata_train, adata_test = (
    adata[train_test_index[split]] for split in ("Train index", "Test index")
)

adata_train.shape, adata_test.shape

((136970, 3000), (136970, 3000))

In [9]:
# Build the signature for the "crosstissue_granular_updated" signature type.
# NOTE(review): `signature` and `intersection` are reassigned in the next
# cell with SIGNATURE_CHOICE, so this result is only inspected here.
signature, intersection = create_signature(adata, signature_type="crosstissue_granular_updated")

signature.shape

(579, 9)

In [10]:
# Rebuild the signature using the signature type configured in constants.
signature, intersection = create_signature(adata, signature_type=SIGNATURE_CHOICE)

# NOTE(review): this repeats the grouping and train/test split already done in
# an earlier cell — confirm whether re-running add_cell_types_grouped is needed.
adata, train_test_index = add_cell_types_grouped(
    adata, BENCHMARK_CELL_TYPE_GROUP
)
adata_train, adata_test = (
    adata[train_test_index[split]] for split in ("Train index", "Test index")
)

In [11]:
adata

AnnData object with n_obs × n_vars = 329762 × 3000
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'cell_types_grouped'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_ontology_term_id_colors', 'default_embedding', 'schema_version', 'sex_ontology_term_id_colors', 'title', 'hvg'
    obsm: 'X_umap'
    layers: 

In [12]:
# Build the pseudobulk test set from the held-out cells: 3000 samples, each
# requested with N_CELLS cells. Two values come back; judging by the names
# they are the pseudobulk data and its proportions table — TODO confirm.
adata_pseudobulk_test, df_proportions_test = create_uniform_pseudobulk_dataset(
    adata_test, n_sample=3000, n_cells=N_CELLS
)

[32m2023-12-20 09:51:12.382[0m | [1mINFO    [0m | [36mbenchmark_utils.dataset_utils[0m:[36mcreate_uniform_pseudobulk_dataset[0m:[36m155[0m - [1mCreating uniform pseudobulk dataset...[0m


In [13]:
# Display the signature transposed: one row per cell type, one column per gene.
signature.transpose()

Genes,ENSG00000124731,ENSG00000166523,ENSG00000120594,ENSG00000167476,ENSG00000105246,ENSG00000117281,ENSG00000138061,ENSG00000171611,ENSG00000109861,ENSG00000117984,...,ENSG00000227507,ENSG00000072694,ENSG00000165168,ENSG00000172236,ENSG00000125844,ENSG00000095303,ENSG00000142634,ENSG00000196141,ENSG00000118513,ENSG00000166825
CD4T,0.005198,0.001013,0.002633,0.004433,0.000653,0.024394,0.000743,0.000248,0.318721,0.382632,...,4.452371,0.002588,0.009587,0.002048,0.103652,0.001598,0.383824,0.070324,0.018228,0.011387
CD8T,0.008085,0.001386,0.004323,0.006534,0.000627,0.529024,0.000825,0.00033,0.774511,0.662509,...,1.106788,0.013431,0.013761,0.003003,0.131604,0.001188,0.780682,0.052305,0.008283,0.041778
B,0.005222,0.000848,0.001919,0.009998,0.049277,0.001919,0.000536,0.000491,0.194519,0.23835,...,5.821818,0.453223,0.629441,0.000714,0.067622,0.045037,0.34476,0.012141,0.005803,0.002678
Plasma,0.038346,0.004762,0.018546,2.285965,0.033835,0.013283,0.002506,0.010276,1.105263,1.88797,...,0.130075,0.758145,0.311779,0.017794,6.521303,0.014286,0.947619,0.351128,0.037845,0.0401
DC,0.351897,0.118644,0.920904,0.011299,0.697337,0.002421,0.071832,0.72155,2.567393,2.186441,...,4.632768,1.125101,1.702179,0.023406,1.765133,0.213075,3.33979,0.736885,0.093624,0.583535
NK,0.005696,0.001492,0.015801,0.004137,0.002374,0.734436,0.001017,0.000136,1.202496,1.77933,...,0.310728,0.059677,0.011732,0.003594,0.368778,0.002984,2.67286,0.039604,0.001221,0.012003
Mono,0.992748,0.694824,1.25375,0.006263,0.027196,0.003791,1.016153,0.005357,1.223339,8.797264,...,0.13763,0.120158,5.75581,0.007994,0.685841,0.055382,3.139443,0.103099,0.008159,1.21996
Mast,0.010942,0.006687,0.131307,0.003647,0.005471,0.007903,0.004255,0.0,0.192097,3.369605,...,0.064438,0.003647,0.017021,39.819453,0.43769,0.736778,1.106991,0.40304,0.2,0.027964
Tregs,0.004447,0.000494,0.003788,0.009881,0.032115,0.003623,0.000329,0.000659,0.711133,0.502635,...,4.954381,0.003294,0.009881,0.002141,0.112813,0.000988,0.344697,0.200758,0.08251,0.005105


In [14]:
# Inspect the training subset produced by the train/test split.
adata_train

View of AnnData object with n_obs × n_vars = 136970 × 3000
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'cell_types_grouped'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_ontology_term_id_colors', 'default_embedding', 'schema_version', 'sex_ontology_term_id_colors', 'title', 'hvg'
    obsm: 'X_umap'
    

In [None]:
# `pseudobulk` was referenced below but never defined in this notebook, so the
# cell failed with NameError on a fresh kernel. Derive it from the pseudobulk
# test set created earlier.
# NOTE(review): assumes `adata_pseudobulk_test` is an AnnData whose `.X` holds
# the expression values TAPE should see (samples x genes) — confirm; if a
# specific layer is intended, pass `layer=...` to `to_df`.
pseudobulk = adata_pseudobulk_test.to_df()

# Run TAPE with the transposed signature (cell types x genes) as reference and
# the pseudobulk frame as the bulk input. In this configuration the call
# returns two values: a signature matrix and the predicted cell fractions.
SignatureMatrix, CellFractionPrediction = Deconvolution(
    signature.T,
    pseudobulk,
    sep='\t',
    scaler='mms',
    datatype='counts',
    genelenfile=None,
    mode='overall',
    adaptive=True,
    variance_threshold=0.98,
    save_model_name=None,
    batch_size=128,
    epochs=128,
    seed=1,  # fixed seed passed to TAPE
)

In [None]:
# First value returned by the TAPE Deconvolution call.
SignatureMatrix

In [None]:
# Second value returned by the TAPE Deconvolution call.
CellFractionPrediction

In [None]:
# The original cell stacked a `proportions` list that is never defined in this
# notebook (NameError on a fresh kernel). The pseudobulk creation cell already
# returns the proportions table for the test samples, so reuse it.
# NOTE(review): assumes `df_proportions_test` is a DataFrame with one row per
# pseudobulk sample, in the same order as the pseudobulk input — confirm.
df_proportions = df_proportions_test.copy()

In [None]:
# Column-wise mean of the proportions table (averaging over rows).
df_proportions.mean(axis="index")

In [None]:
# Compare `df_proportions` with the fractions predicted by TAPE.
# NOTE(review): compute_correlations is not imported in any cell shown in this
# notebook — confirm which module provides it and add the import.
corr_tape = compute_correlations(
    df_proportions,
    CellFractionPrediction,
)

In [None]:
# Mean of the "correlations" entry of the TAPE correlation results.
corr_tape["correlations"].mean()

In [None]:
# Scaden-style deconvolution from the TAPE package on the same pseudobulk.
# The reference is passed as `signature.T` (cell types x genes), the same
# orientation given to `Deconvolution` in this notebook; the original call
# passed the untransposed `signature`, which was inconsistent with it.
# NOTE(review): confirm ScadenDeconvolution expects the same reference layout
# as Deconvolution.
Pred = ScadenDeconvolution(
    signature.T,
    pseudobulk,
    sep='\t',
    batch_size=128,
    epochs=128,
)

In [None]:
# Labels present in both the pseudobulk columns and the signature index.
genes = np.intersect1d(pseudobulk.columns, signature.index)

# Keep only those columns, then flip so the shared labels become the rows.
input_data = pseudobulk.loc[:, genes].transpose()

# Shapes of everything involved, shown side by side as a sanity check.
tuple(frame.shape for frame in (SignatureMatrix, signature, pseudobulk, input_data))

In [None]:
# Run perform_nnls with the signature and the gene-restricted pseudobulk.
# `input_data.T` undoes the transpose applied when `input_data` was built, so
# the second argument is pseudobulk.loc[:, genes] again.
# NOTE(review): perform_nnls is not imported in any cell shown in this
# notebook, and `signature` is passed without being restricted to `genes` —
# confirm both before relying on this cell.
deconv_results = perform_nnls(signature, input_data.T)

In [None]:
# NOTE(review): this cell contains only commented-out scratch code that does
# not reference anything else in this notebook — consider removing it.
# import pandas as pd

# df = pd.read_csv("/home/owkin/Tabula_Sapiens_metadata.csv", index_col=0)
# df = df[df["method"]=="10X"]
# # (df["cell_ontology_class"].value_counts(normalize=True).to_frame() * 100).to_csv("TS_cell_freq.csv")
# columns_to_combine = ["organ_tissue", "cell_ontology_class"]
# combination_frequencies = df.groupby(columns_to_combine).size().reset_index(name='frequency')
# total_combinations = len(df)
# combination_frequencies['normalized_frequency'] = combination_frequencies['frequency'] / total_combinations
# combination_frequencies.to_csv("TS_cell_organ_combo_freq.csv")