In [1]:
import scanpy as sc
import pandas as pd

In [2]:
# Read the table of genes shared across datasets, then draw a reproducible
# (fixed seed) random sample of 3000 entries from its
# 'genes_shared_across_datasets' column as a plain Python list.
genes_intersection = pd.read_table(
    "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/make_big_dataset/genes_shared_across_datasets.txt"
)
sampled_genes = (
    genes_intersection['genes_shared_across_datasets']
    .sample(n=3000, random_state=42)
    .tolist()
)

In [3]:
# Load the full training AnnData object from disk.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
adata = sc.read_h5ad("/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/train_data/full_datasets/train_adata_baseline.h5ad")

In [26]:
# Keep only the variables (columns) whose name appears in the sampled gene list.
train_gene_mask = adata.var.index.isin(sampled_genes)
adata_train = adata[:, train_gene_mask]

In [60]:
# Load the dataset to predict on (Wang) and the two ground-truth datasets (dbdb, mSTZ).
# Plain string literals: none of the paths has placeholders, so no f-string is needed.
adata_to_predict = sc.read_h5ad("/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/data_to_predict/wang_to_predict.h5ad")
dbdb_ground_truth = sc.read_h5ad("/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/ground_truth/full_datasets/dbdb_ground_truth.h5ad")
mSTZ_ground_truth = sc.read_h5ad("/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/ground_truth/full_datasets/mSTZ_ground_truth.h5ad")

In [22]:
from itertools import combinations

def check_adatas_var_index(*adatas):
    """Check that all provided AnnData objects share the same variable index, in the same order.

    Parameters
    ----------
    *adatas
        Any number of objects exposing a ``.var`` DataFrame (e.g. AnnData).
        With zero or one object there is nothing to compare and the check passes.

    Raises
    ------
    ValueError
        If any pair of objects differs in its ``var`` index (different labels,
        different order, or different length). The message names the positions
        of the first mismatching pair.

    Notes
    -----
    ``Index.equals`` is used instead of element-wise ``==`` because ``==``
    between indices of different lengths raises pandas' own
    "Lengths must match to compare" error, hiding which objects disagree.
    """
    for i, j in combinations(range(len(adatas)), 2):
        if not adatas[i].var.index.equals(adatas[j].var.index):
            raise ValueError(f"The variable indices of the AnnData objects at positions {i} and {j} do not match!")
    print("Everything ok!")

In [27]:
# Reorder the training set's variables so their names are in sorted order.
genes_sorted = adata_train.var.sort_index().index
adata_train = adata_train[:, genes_sorted]

In [56]:
# Restrict the prediction set and both ground-truth sets to the sampled genes,
# applying the same column mask logic to each object in turn.
adata_to_predict, dbdb_ground_truth, mSTZ_ground_truth = [
    dataset[:, dataset.var.index.isin(sampled_genes)]
    for dataset in (adata_to_predict, dbdb_ground_truth, mSTZ_ground_truth)
]

In [30]:
# Persist the sorted gene order on the prediction set *before* validating.
# Previously the sorted version existed only as a temporary argument to the
# check, so the object that is later filtered and written to disk was not the
# one that had been verified against the training / ground-truth gene order.
adata_to_predict = adata_to_predict[:, adata_to_predict.var.sort_index().index]
check_adatas_var_index(adata_train, adata_to_predict, dbdb_ground_truth, mSTZ_ground_truth)

Everything ok!


In [33]:
# Show the distinct cell-type labels present in the prediction set.
adata_to_predict.obs.cell_type.unique()

['beta', 'alpha', 'delta', 'gamma']
Categories (4, object): ['alpha', 'beta', 'delta', 'gamma']

In [38]:
# Count cells per cell-type label in the dbdb ground-truth set.
dbdb_ground_truth.obs.cell_type.value_counts()

type B pancreatic cell       31983
hematopoietic cell           13369
pancreatic A cell             6587
pancreatic D cell             4641
pancreatic stellate cell      3698
endothelial cell              2977
pancreatic ductal cell        2906
pancreatic endocrine cell     1550
pancreatic PP cell            1209
cell                           408
pancreatic acinar cell         326
native cell                     46
Schwann cell                    45
Name: cell_type, dtype: int64

In [35]:
# Display the AnnData summary (dimensions and annotation fields) of the dbdb ground-truth set.
dbdb_ground_truth

View of AnnData object with n_obs × n_vars = 69745 × 3000
    obs: 'disease', 'dataset', 'organism', 'cell_type'

In [36]:
# Display the AnnData summary (dimensions and annotation fields) of the prediction set.
adata_to_predict

View of AnnData object with n_obs × n_vars = 49596 × 3000
    obs: 'disease', 'cell_type', 'organism', 'dataset', 'n_counts'
    uns: 'log1p'

In [39]:
# Count cells per cell-type label in the mSTZ ground-truth set.
mSTZ_ground_truth.obs.cell_type.value_counts()

type B pancreatic cell       14329
pancreatic A cell            11654
pancreatic D cell             7577
hematopoietic cell            5750
pancreatic endocrine cell     4128
pancreatic PP cell            2442
endothelial cell              1709
pancreatic stellate cell      1317
pancreatic ductal cell         253
Schwann cell                   186
cell                           138
pancreatic acinar cell          32
native cell                     30
Name: cell_type, dtype: int64

In [55]:
# Count cells per cell-type label in the prediction set.
# NOTE(review): the recorded output for this cell is an empty Series, which
# suggests it was executed in a different kernel state than the surrounding
# cells — re-run after "Restart & Run All" to confirm the counts.
adata_to_predict.obs.cell_type.value_counts()

Series([], Name: cell_type, dtype: int64)

In [63]:
# Drop every cell whose cell-type label is "gamma" from the prediction set.
non_gamma_mask = adata_to_predict.obs.cell_type != "gamma"
adata_to_predict = adata_to_predict[non_gamma_mask]

In [61]:
# Rename the short cell-type labels of the prediction set to the longer
# labels that appear in the ground-truth sets.
cell_type_renames = {"alpha": "pancreatic A cell",
                     "beta": "type B pancreatic cell",
                     "delta": "pancreatic D cell"}

# Earlier slicing leaves `adata_to_predict` as a view; materialise it explicitly
# so the assignment to `.obs` below does not rely on AnnData's implicit
# copy-on-modify behaviour.
if adata_to_predict.is_view:
    adata_to_predict = adata_to_predict.copy()

# `rename_categories` is the supported way to relabel a categorical column
# (`Series.replace` on categoricals is deprecated in recent pandas). Labels not
# in the mapping pass through unchanged, and mapping keys that are absent are
# ignored, so re-running this cell is a no-op.
adata_to_predict.obs["cell_type"] = (
    adata_to_predict.obs["cell_type"]
    .astype("category")
    .cat.rename_categories(cell_type_renames)
)

In [64]:
# Display the per-cell annotation table of the prediction set to verify the cleaned labels.
adata_to_predict.obs

Unnamed: 0,disease,cell_type,organism,dataset,n_counts
A0019_AAACAGCCAGGACCAA-1,healthy,type B pancreatic cell,Homo sapiens,Wang,3698.0
A0019_AAACAGCCATGAGTTT-1,healthy,type B pancreatic cell,Homo sapiens,Wang,4016.0
A0019_AAACATGCAGCTTAAT-1,healthy,pancreatic A cell,Homo sapiens,Wang,4438.0
A0019_AAACCAACACTATGGC-1,healthy,type B pancreatic cell,Homo sapiens,Wang,3437.0
A0019_AAACGCGCAGCTAACC-1,healthy,pancreatic A cell,Homo sapiens,Wang,3777.0
...,...,...,...,...,...
C0027_TTTGTTGGTGAGGTAG-1,healthy,pancreatic A cell,Homo sapiens,Wang,1667.0
C0027_TTTGTTGGTGCTCCGT-1,healthy,type B pancreatic cell,Homo sapiens,Wang,2320.0
C0027_TTTGTTGGTGTGTCCC-1,healthy,type B pancreatic cell,Homo sapiens,Wang,3328.0
C0027_TTTGTTGGTTTCGCGC-1,healthy,type B pancreatic cell,Homo sapiens,Wang,2507.0


In [67]:
# Save the cleaned prediction set next to the original input file.
wang_clean_path = "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/data_to_predict/wang_to_predict_cleanCT.h5ad"
adata_to_predict.write_h5ad(wang_clean_path)

In [70]:
# Collect the cell-type labels present in the prediction set; the ground-truth
# sets are restricted to these labels further down.
present_cell_types = adata_to_predict.obs["cell_type"].unique()
cts_to_consider = present_cell_types.tolist()

In [75]:
# Keep only the dbdb cells whose cell type is among the labels in `cts_to_consider`.
dbdb_ct_mask = dbdb_ground_truth.obs["cell_type"].isin(cts_to_consider)
dbdb_ground_truth = dbdb_ground_truth[dbdb_ct_mask]

In [76]:
# Save the cell-type-filtered dbdb ground-truth set.
dbdb_clean_path = "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/ground_truth/full_datasets/dbdb_ground_truth_cleanCT.h5ad"
dbdb_ground_truth.write_h5ad(dbdb_clean_path)

In [77]:
# Keep only the mSTZ cells whose cell type is among the labels in `cts_to_consider`.
mstz_ct_mask = mSTZ_ground_truth.obs["cell_type"].isin(cts_to_consider)
mSTZ_ground_truth = mSTZ_ground_truth[mstz_ct_mask]

In [79]:
# Save the cell-type-filtered mSTZ ground-truth set.
mstz_clean_path = "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/ground_truth/full_datasets/mSTZ_ground_truth_cleanCT.h5ad"
mSTZ_ground_truth.write_h5ad(mstz_clean_path)