In [1]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scvi
import celltypist
from celltypist import models

In [2]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [3]:
models.get_all_models()

['Pan_Fetal_Human.pkl',
 'Human_PF_Lung.pkl',
 'Healthy_COVID19_PBMC.pkl',
 'Developing_Human_Organs.pkl',
 'Mouse_Dentate_Gyrus.pkl',
 'Cells_Lung_Airway.pkl',
 'Cells_Fetal_Lung.pkl',
 'Adult_Human_Skin.pkl',
 'Adult_RhesusMacaque_Hippocampus.pkl',
 'Adult_Human_PancreaticIslet.pkl',
 'Fetal_Human_Pancreas.pkl',
 'Human_Longitudinal_Hippocampus.pkl',
 'Developing_Human_Hippocampus.pkl',
 'Adult_Pig_Hippocampus.pkl',
 'Mouse_Postnatal_DentateGyrus.pkl',
 'Cells_Adult_Breast.pkl',
 'Human_Colorectal_Cancer.pkl',
 'Human_Placenta_Decidua.pkl',
 'Fetal_Human_Pituitary.pkl',
 'Adult_Human_Vascular.pkl',
 'Fetal_Human_AdrenalGlands.pkl',
 'Nuclei_Lung_Airway.pkl',
 'Human_Endometrium_Atlas.pkl',
 'Developing_Human_Brain.pkl',
 'Fetal_Human_Retina.pkl',
 'Human_IPF_Lung.pkl',
 'Adult_COVID19_PBMC.pkl',
 'Healthy_Adult_Heart.pkl',
 'Human_Lung_Atlas.pkl',
 'Mouse_Whole_Brain.pkl',
 'Adult_Mouse_OlfactoryBulb.pkl',
 'COVID19_HumanChallenge_Blood.pkl',
 'Human_Embryonic_YolkSac.pkl',
 'Human_D

In [4]:
model_low = models.Model.load(model='Human_Colorectal_Cancer.pkl')

In [5]:
def predict_cells(adata):
    sc.pp.filter_genes(adata, min_cells = 10)
    sc.pp.normalize_total(adata, target_sum=1e4) #not recommended for typical pp
    sc.pp.log1p(adata)

    

    predictions = celltypist.annotate(adata, model=model_low, majority_voting=False)
    predictions_adata = predictions.to_adata()
    adata.obs["low_label"] = predictions_adata.obs.loc[adata.obs.index, "predicted_labels"]
    adata.obs["low_score"] = predictions_adata.obs.loc[adata.obs.index, "conf_score"]


    return adata.obs

In [6]:
adatas = [sc.read_h5ad('pp_adata/' + x) for x in os.listdir('pp_adata')]

In [7]:
len(adatas)

7

In [8]:
predictions = [predict_cells(ad.copy()) for ad in adatas] #we pass a copy so it doesnt normalize the counts

üî¨ Input data has 3197 cells and 15131 genes
üîó Matching reference genes in the model
üß¨ 4037 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üî¨ Input data has 2918 cells and 13405 genes
üîó Matching reference genes in the model
üß¨ 3624 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üî¨ Input data has 2256 cells and 15734 genes
üîó Matching reference genes in the model
üß¨ 4069 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üî¨ Input data has 38228 cells and 19748 genes
üîó Matching reference genes in the model
üß¨ 4050 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Prediction done!
üî¨ Input data has 4392 cells and 15221 genes
üîó Matching reference genes in the model
üß¨ 3998 features used for prediction
‚öñÔ∏è Scaling input data
üñãÔ∏è Predicting labels
‚úÖ Predic

In [9]:
predictions = pd.concat(predictions)[['low_label', 'low_score',]]

predictions

Unnamed: 0,low_label,low_score
711_AAACCCAAGTCGGGAT-1,T follicular helper cells,0.103311
711_AAACCCACAGAGGAAA-1,Unknown,0.236022
711_AAACCCACATGATAGA-1,Unknown,0.400460
711_AAACCCAGTCTCGCGA-1,Tip-like ECs,0.966648
711_AAACGAAGTTATCTTC-1,gamma delta T cells,0.993837
...,...,...
706_TTTGGTTCAAACACCT-1,CD8+ T cells,0.999997
706_TTTGGTTCAACGGGTA-1,Pro-inflammatory,0.549964
706_TTTGGTTTCTATCGCC-1,Stromal 2,0.961222
706_TTTGTTGCATCAGCGC-1,CMS2,0.978673


In [10]:
adata = sc.concat(adatas)
adata

AnnData object with n_obs √ó n_vars = 58015 √ó 18505
    obs: 'samples', 'condition', 'location', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'

In [11]:
predictions.to_csv('PREDICTIONS.csv')

In [12]:
adata.obs = adata.obs.merge(right = predictions, left_index=True, right_index=True)

In [13]:
adata.obs

Unnamed: 0,samples,condition,location,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,low_label,low_score
711_AAACCCAAGTCGGGAT-1,711,Tumor,Right,707,707,6.562444,1396.0,7.242083,26.862464,10.100286,25.716331,0.000000,T follicular helper cells,0.103311
711_AAACCCACAGAGGAAA-1,711,Tumor,Right,838,838,6.732211,1504.0,7.316548,23.803191,9.441490,13.962767,0.000000,Unknown,0.236022
711_AAACCCACATGATAGA-1,711,Tumor,Right,435,435,6.077642,613.0,6.419995,17.781403,0.815661,18.270800,0.000000,Unknown,0.400460
711_AAACCCAGTCTCGCGA-1,711,Tumor,Right,579,579,6.363028,860.0,6.758094,23.255814,15.465117,6.395349,0.000000,Tip-like ECs,0.966648
711_AAACGAAGTTATCTTC-1,711,Tumor,Right,1384,1384,7.233455,2629.0,7.874739,19.246862,8.178015,15.405098,0.000000,gamma delta T cells,0.993837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706_TTTGGTTCAAACACCT-1,706,Tumor,Left,1815,1815,7.504392,4944.0,8.506132,21.318770,1.961974,21.318771,0.020227,CD8+ T cells,0.999997
706_TTTGGTTCAACGGGTA-1,706,Tumor,Left,909,909,6.813445,3528.0,8.168770,34.722222,0.368481,25.368483,0.000000,Pro-inflammatory,0.549964
706_TTTGGTTTCTATCGCC-1,706,Tumor,Left,4658,4658,8.446556,23600.0,10.069044,17.559322,3.237288,18.135593,0.004237,Stromal 2,0.961222
706_TTTGTTGCATCAGCGC-1,706,Tumor,Left,374,374,5.926926,686.0,6.532334,23.032070,10.932944,33.236153,0.000000,CMS2,0.978673


In [14]:
adata.write_h5ad('unintigrated.h5ad')

... storing 'low_label' as categorical
