In [1]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scvi
import celltypist
from celltypist import models

In [2]:
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)

In [3]:
# Show the CellTypist model file names returned by get_all_models(), to pick
# the one used for annotation below.
models.get_all_models()

['Pan_Fetal_Human.pkl',
 'Human_PF_Lung.pkl',
 'Healthy_COVID19_PBMC.pkl',
 'Developing_Human_Organs.pkl',
 'Mouse_Dentate_Gyrus.pkl',
 'Cells_Lung_Airway.pkl',
 'Cells_Fetal_Lung.pkl',
 'Adult_Human_Skin.pkl',
 'Adult_RhesusMacaque_Hippocampus.pkl',
 'Adult_Human_PancreaticIslet.pkl',
 'Fetal_Human_Pancreas.pkl',
 'Human_Longitudinal_Hippocampus.pkl',
 'Developing_Human_Hippocampus.pkl',
 'Adult_Pig_Hippocampus.pkl',
 'Mouse_Postnatal_DentateGyrus.pkl',
 'Cells_Adult_Breast.pkl',
 'Human_Colorectal_Cancer.pkl',
 'Human_Placenta_Decidua.pkl',
 'Fetal_Human_Pituitary.pkl',
 'Adult_Human_Vascular.pkl',
 'Fetal_Human_AdrenalGlands.pkl',
 'Nuclei_Lung_Airway.pkl',
 'Human_Endometrium_Atlas.pkl',
 'Developing_Human_Brain.pkl',
 'Fetal_Human_Retina.pkl',
 'Human_IPF_Lung.pkl',
 'Adult_COVID19_PBMC.pkl',
 'Healthy_Adult_Heart.pkl',
 'Human_Lung_Atlas.pkl',
 'Mouse_Whole_Brain.pkl',
 'Adult_Mouse_OlfactoryBulb.pkl',
 'COVID19_HumanChallenge_Blood.pkl',
 'Human_Embryonic_YolkSac.pkl',
 'Human_D

In [4]:
# Load the 'Human_Colorectal_Cancer.pkl' CellTypist model. predict_cells()
# reads it through this module-level name, so this cell must run before any
# call to predict_cells().
model_low = models.Model.load(model='Human_Colorectal_Cancer.pkl')

In [9]:
def predict_cells(adata, model=None, min_cells=10):
    """Annotate cells with CellTypist and return the updated ``adata.obs``.

    ``adata`` is modified IN PLACE: genes are filtered, ``.X`` is normalised
    and log1p-transformed, and label/score columns are added to ``.obs``.
    Pass a copy when the caller still needs the untouched counts.

    Parameters
    ----------
    adata
        AnnData object to annotate. Assumed to hold raw counts in ``.X``
        (the function normalises them itself) -- TODO confirm for every
        file fed to this function.
    model
        CellTypist model handed to ``celltypist.annotate``. ``None`` (the
        default) falls back to the notebook-level ``model_low``, which keeps
        existing ``predict_cells(adata)`` calls working unchanged.
    min_cells
        Forwarded to ``sc.pp.filter_genes``: genes detected in fewer cells
        than this are removed before normalisation. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs`` with two added columns: ``low_label`` (CellTypist
        ``predicted_labels``) and ``low_score`` (CellTypist ``conf_score``).
    """
    if model is None:
        # Looked up at call time so the function can be defined before the
        # model-loading cell has been executed.
        model = model_low

    sc.pp.filter_genes(adata, min_cells=min_cells)
    # Fixed 10,000-count target followed by log1p: this is the input scale
    # CellTypist's documentation asks for. (Original author's note: not the
    # normalisation recommended for typical preprocessing.)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # majority_voting=False: keep the raw per-cell predictions.
    predictions = celltypist.annotate(adata, model=model, majority_voting=False)
    predicted_obs = predictions.to_adata().obs

    # Select by this object's own cell index so the new columns are aligned
    # to its rows regardless of the row order of the prediction table.
    cell_ids = adata.obs.index
    adata.obs["low_label"] = predicted_obs.loc[cell_ids, "predicted_labels"]
    adata.obs["low_score"] = predicted_obs.loc[cell_ids, "conf_score"]

    return adata.obs

In [6]:
# Load every pre-processed AnnData file found in PP_ADATA_DIR.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   sample order (and anything concatenated from it) the same on every run.
# - the .h5ad filter skips stray entries (hidden files, checkpoint folders)
#   instead of handing them to read_h5ad.
PP_ADATA_DIR = 'pp_adata'
adatas = [
    sc.read_h5ad(os.path.join(PP_ADATA_DIR, fname))
    for fname in sorted(os.listdir(PP_ADATA_DIR))
    if fname.endswith('.h5ad')
]

In [7]:
# Sanity check: number of AnnData objects read from 'pp_adata'.
len(adatas)

1

In [10]:
# Annotate each sample on a copy, so the normalisation performed inside
# predict_cells never touches the counts held in `adatas`.
predictions = []
for sample_adata in adatas:
    predictions.append(predict_cells(sample_adata.copy()))

🔬 Input data has 38228 cells and 19748 genes
🔗 Matching reference genes in the model
🧬 4050 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!


In [11]:
# Stack the per-sample obs tables and keep only the CellTypist columns.
# The isinstance guard makes this cell safe to re-run: after the first run
# `predictions` is already a DataFrame, and pd.concat rejects a bare DataFrame.
if not isinstance(predictions, pd.DataFrame):
    predictions = pd.concat(predictions)
predictions = predictions[['low_label', 'low_score']]

predictions

Unnamed: 0,low_label,low_score
B_cac10_AAACCTGCACTTCGAA,CD19+CD20+ B,1.000000
B_cac10_AAACCTGGTAATTGGA,CD4+ T cells,0.889012
B_cac10_AAACCTGGTACGAAAT,CD4+ T cells,0.600564
B_cac10_AAACCTGGTGAAAGAG,Intermediate,0.523409
B_cac10_AAACCTGTCACGATGT,Pro-inflammatory,0.417229
...,...,...
T_cac9_TTTGGTTCAAATACAG,Unknown,0.288446
T_cac9_TTTGGTTCAACACGCC,Regulatory T cells,0.925800
T_cac9_TTTGGTTCATAGACTC,CD4+ T cells,0.986861
T_cac9_TTTGGTTCATCTATGG,cDC,0.932307


In [12]:
# Combine the loaded AnnData objects into a single object that will receive
# the CellTypist labels. These are the objects as loaded: predict_cells was
# only ever given copies.
# NOTE(review): anndata's concat is documented as defaulting to join='inner'
# (only variables shared by all inputs are kept) -- confirm this is intended
# once more than one file is loaded.
adata = sc.concat(adatas)
adata

AnnData object with n_obs × n_vars = 38228 × 23828
    obs: 'samples', 'condition', 'location', 'msi_status', 'bulk_prediction', 'prediction', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'

In [13]:
# Save the low_label / low_score table to CSV.
predictions.to_csv('PREDICTIONS_200997.csv')

In [14]:
# Attach the CellTypist columns to the cell metadata, matching on the cell index.
# Validate first so a mismatch fails with a clear message instead of a
# length error (or silently missing labels) when obs is reassigned.
if not predictions.index.is_unique:
    raise ValueError("predictions index contains duplicated cell barcodes")
_cells_without_prediction = adata.obs.index.difference(predictions.index)
if len(_cells_without_prediction) > 0:
    raise ValueError(
        f"{len(_cells_without_prediction)} cells in adata have no CellTypist prediction"
    )

adata.obs = (
    adata.obs
    # Drop columns left by a previous run so re-executing this cell does not
    # produce low_label_x / low_label_y duplicates.
    .drop(columns=predictions.columns, errors='ignore')
    # Left join on the index keeps every cell of adata, in its original order.
    .join(predictions, how='left')
)

In [15]:
# Inspect the cell metadata, which now carries the low_label / low_score columns.
adata.obs

Unnamed: 0,samples,condition,location,msi_status,bulk_prediction,prediction,n_genes,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,pct_counts_mt,pct_counts_ribo,pct_counts_hb,low_label,low_score
B_cac10_AAACCTGCACTTCGAA,B_cac10,Normal,Left,MSS,,,626,626,6.440947,1817.0,7.505492,26.637314,2.146395,40.286186,0.0,CD19+CD20+ B,1.000000
B_cac10_AAACCTGGTAATTGGA,B_cac10,Normal,Left,MSS,,,741,741,6.609349,2041.0,7.621685,24.595786,4.899559,40.764332,0.0,CD4+ T cells,0.889012
B_cac10_AAACCTGGTACGAAAT,B_cac10,Normal,Left,MSS,,,630,630,6.447306,2217.0,7.704361,29.048263,6.630582,50.338291,0.0,CD4+ T cells,0.600564
B_cac10_AAACCTGGTGAAAGAG,B_cac10,Normal,Left,MSS,,,1053,1053,6.960348,2708.0,7.904335,25.553914,3.618907,27.437222,0.0,Intermediate,0.523409
B_cac10_AAACCTGTCACGATGT,B_cac10,Normal,Left,MSS,,,335,335,5.817111,714.0,6.572283,33.473389,8.123250,38.935574,0.0,Pro-inflammatory,0.417229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T_cac9_TTTGGTTCAAATACAG,T_cac9,Tumor,Left,MSS,CMS2,CMS1,2136,2136,7.667158,5317.0,8.578853,17.359413,7.504231,13.673123,0.0,Unknown,0.288446
T_cac9_TTTGGTTCAACACGCC,T_cac9,Tumor,Left,MSS,CMS2,CMS4,1355,1355,7.212294,3463.0,8.150179,19.318510,4.244874,24.631823,0.0,Regulatory T cells,0.925800
T_cac9_TTTGGTTCATAGACTC,T_cac9,Tumor,Left,MSS,CMS2,CMS4,2220,2220,7.705713,7173.0,8.878219,19.001812,3.987174,27.798689,0.0,CD4+ T cells,0.986861
T_cac9_TTTGGTTCATCTATGG,T_cac9,Tumor,Left,MSS,CMS2,CMS1,1303,1303,7.173192,3283.0,8.096817,27.231191,8.010965,23.484617,0.0,cDC,0.932307


In [16]:
# Write the AnnData object, including the merged CellTypist columns in .obs, to disk.
adata.write_h5ad('200997_annotated.h5ad')