# Compare p-values between propeller and scanpro implementations

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Scanpro functions
from scanpro import scanpro

#Setup path to R for propeller
R_home = os.path.dirname(sys.executable)[:-4] + "/lib/R"
os.environ["R_HOME"] = R_home
%load_ext rpy2.ipython

In [2]:
def convert_counts_to_df(counts, prop_cols=None, meta_cols=None, n_cells=1, column_name="Cluster"):
    """ Convert a cell count matrix to a dataframe in long format.

    Every row of the returned dataframe stands for a single cell (similar to
    adata.obs): each input row is repeated once per counted cell for each of
    the count columns.

    Parameters
    ----------
    counts : pd.DataFrame
        Wide table holding metadata columns and numeric count (or proportion)
        columns. The input is copied and never modified.
    prop_cols : list, optional
        Columns holding counts/proportions. If None, every column whose dtype
        name contains "float" or "int" is used.
    meta_cols : list, optional
        Columns carried over as per-cell metadata. If None, all columns that
        are not in prop_cols.
    n_cells : int or float, default 1
        Factor the prop_cols are multiplied with before conversion to integer
        cell numbers (use > 1 to turn proportions into counts).
    column_name : str, default "Cluster"
        Name of the output column that stores the originating prop_cols label.

    Returns
    -------
    pd.DataFrame
        Long-format table with columns meta_cols + [column_name] and index
        "cell_1" ... "cell_N".
    """

    counts = counts.copy()

    # If not given, try to get prop_cols and meta_cols automatically
    if prop_cols is None:
        # Pair each column label with its dtype name directly; looking dtypes up
        # by integer position on a label-indexed Series is deprecated in pandas.
        dtype_names = counts.dtypes.astype(str)
        prop_cols = [col for col, dtype in zip(counts.columns, dtype_names)
                     if "float" in dtype or "int" in dtype]

    if meta_cols is None:
        meta_cols = [col for col in counts.columns if col not in prop_cols]

    # Multiply proportions with n_cells. Round before casting: a plain int cast
    # truncates, so floating point noise (0.57 * 100 -> 56.99999999999999)
    # would silently lose a cell.
    counts[prop_cols] = (counts[prop_cols] * n_cells).round().astype(int)

    # Melt into long format (similar to adata.obs)
    counts_melt = pd.melt(counts, id_vars=meta_cols, value_vars=prop_cols,
                          var_name=column_name, value_name="count")

    # Duplicate rows based on number of cells
    counts_long = counts_melt.loc[counts_melt.index.repeat(counts_melt["count"])].reset_index(drop=True)
    counts_long = counts_long.drop(columns="count")
    counts_long.index = ["cell_" + str(i) for i in range(1, len(counts_long) + 1)]

    return counts_long

--------

## PBMC - 2 conditions

In [3]:
# Read the tab-separated PBMC count table and expand it to one row per cell
pbmc_counts = pd.read_csv("data/pbmc_counts.tsv", delimiter="\t")
pbmc = convert_counts_to_df(counts=pbmc_counts, column_name="Celltype")

In [4]:
# scanpro on the per-cell PBMC table, with 'Sex' as the condition column
scanpro_out = scanpro.scanpro(
    pbmc,
    samples_col="Sample",
    clusters_col="Celltype",
    conds_col="Sex",
)

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


In [5]:
%%R -i pbmc -o propeller_results
# Reference run: propeller (speckle) on the same per-cell table, grouped by Sex
library(speckle)

propeller_results <- propeller(
    clusters = pbmc$Celltype,
    sample = pbmc$Sample,
    group = pbmc$Sex
)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)
Performing logit transformation of proportions
group variable has 2 levels, t-tests will be performed


In [6]:
# Display the scanpro result table for the PBMC comparison by Sex
scanpro_out.results

Unnamed: 0_level_0,baseline_props,mean_props_female,mean_props_male,prop_ratio,t_statistics,p_values,adjusted_p_values
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BC,0.098383,0.101042,0.095581,1.057138,0.453598,0.65121,0.814013
DC,0.017112,0.015587,0.018793,0.829422,-1.054917,0.294287,0.490479
MC,0.171416,0.168821,0.171046,0.986995,-0.063599,0.94943,0.94943
NK,0.146479,0.116914,0.174684,0.669285,-2.636203,0.009874,0.049371
TC,0.56661,0.597636,0.539896,1.106946,1.44769,0.151178,0.377946


In [7]:
# Order propeller's rows by cell type label so the table reads side by side
# with the scanpro results
propeller_results = propeller_results.sort_index(axis=0, ascending=True)
propeller_results

Unnamed: 0,BaselineProp.clusters,BaselineProp.Freq,PropMean.female,PropMean.male,PropRatio,Tstatistic,P.Value,FDR
BC,BC,0.098383,0.101042,0.095581,1.057138,0.453546,0.651248,0.814059
DC,DC,0.017112,0.015587,0.018793,0.829422,-1.052536,0.295371,0.492285
MC,MC,0.171416,0.168821,0.171046,0.986995,-0.063694,0.949355,0.949355
NK,NK,0.146479,0.116914,0.174684,0.669285,-2.637317,0.009844,0.049221
TC,TC,0.56661,0.597636,0.539896,1.106946,1.44842,0.150975,0.377437


In [8]:
# Per-celltype difference of the adjusted p-values (scanpro minus propeller FDR);
# both are the last column of their respective tables
scanpro_out.results["adjusted_p_values"] - propeller_results["FDR"]

Celltype
BC   -0.000047
DC   -0.001807
MC    0.000075
NK    0.000150
TC    0.000509
dtype: float64

--------

## PBMC with covariates

In [9]:
# Same PBMC comparison by Sex, now passing "Age" as a covariate
scanpro_out = scanpro.scanpro(
    pbmc,
    samples_col="Sample",
    clusters_col="Celltype",
    conds_col="Sex",
    covariates="Age",
)

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


In [10]:
# Display the scanpro result table for the PBMC comparison with the Age covariate
scanpro_out.results

Unnamed: 0_level_0,baseline_props,mean_props_female,mean_props_male,prop_ratio,t_statistics,p_values,adjusted_p_values
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BC,0.098383,0.101042,0.095581,1.057138,0.466193,0.64227,0.802838
DC,0.017112,0.015587,0.018793,0.829422,-1.094385,0.276878,0.461463
MC,0.171416,0.168821,0.171046,0.986995,-0.06602,0.947517,0.947517
NK,0.146479,0.116914,0.174684,0.669285,-2.699387,0.00838,0.0419
TC,0.56661,0.597636,0.539896,1.106946,1.496382,0.138258,0.345645


In [11]:
# Collapse the per-cell table to its distinct sample-level metadata rows
sample_info = pbmc.loc[:, ["Sample", "Condition", "Sex", "Age"]].drop_duplicates()

In [12]:
%%R -i pbmc -i sample_info -o propeller_result
# getTransformedProps() and propeller.ttest() come from speckle; load it here so
# the cell does not depend on an earlier cell having attached the package.
library(speckle)
library(limma)

props <- getTransformedProps(pbmc$Celltype, pbmc$Sample, transform="logit")

# The design matrix rows must be in the same order as the sample columns of the
# transformed proportions; align sample_info explicitly instead of relying on
# the row order it happened to have in Python.
sample_order <- match(colnames(props$TransformedProps), as.character(sample_info$Sample))
stopifnot(!anyNA(sample_order), nrow(sample_info) == length(sample_order))
sample_info <- sample_info[sample_order, ]

sex <- sample_info$Sex
age <- sample_info$Age

# No-intercept model: one coefficient per sex level plus the age covariate
design <- model.matrix(~ 0 + sex + age)

# NOTE: the contrast is male - female, so Tstatistic has the opposite sign (and
# PropRatio is inverted) relative to the scanpro table; p-values are unaffected.
mycontr <- makeContrasts("sexmale-sexfemale", levels=design)

propeller_result = propeller.ttest(props, design, contrasts=mycontr, robust=TRUE, trend=FALSE, sort=TRUE)

Performing logit transformation of proportions


In [13]:
# Order propeller's rows by cell type label so the table reads side by side
# with the scanpro results
propeller_result = propeller_result.sort_index(axis=0, ascending=True)
propeller_result

Unnamed: 0,PropMean.sexfemale,PropMean.sexmale,PropRatio,Tstatistic,P.Value,FDR
BC,0.101042,0.095581,0.94595,-0.465948,0.642444,0.803055
DC,0.015587,0.018793,1.205659,1.093562,0.277237,0.462061
MC,0.168821,0.171046,1.013177,0.066221,0.947357,0.947357
NK,0.116914,0.174684,1.494132,2.697454,0.008425,0.042124
TC,0.597636,0.539896,0.903387,-1.498186,0.13779,0.344474


In [14]:
# Per-celltype difference of the adjusted p-values (scanpro minus propeller FDR)
# for the model with the Age covariate
scanpro_out.results["adjusted_p_values"] - propeller_result["FDR"]

Celltype
BC   -0.000218
DC   -0.000598
MC    0.000160
NK   -0.000224
TC    0.001171
dtype: float64

----------------

## Heart - 3 conditions

In [15]:
# Read the tab-separated heart count table and expand it to one row per cell
heart_counts = pd.read_csv("data/heart_counts.tsv", delimiter="\t")
heart = convert_counts_to_df(counts=heart_counts, column_name="Celltype")

In [16]:
# scanpro on the heart data, with 'Condition' as the condition column
scanpro_out = scanpro.scanpro(
    heart,
    samples_col="Sample",
    clusters_col="Celltype",
    conds_col="Condition",
)

[INFO] There are more than 2 conditions. ANOVA will be performed...
[INFO] Done!


In [17]:
%%R -i heart -o propeller_results
# Reference run: propeller (speckle) on the heart data, grouped by Condition
library(speckle)

propeller_results <- propeller(
    clusters = heart$Celltype,
    sample = heart$Sample,
    group = heart$Condition
)

Performing logit transformation of proportions
group variable has > 2 levels, ANOVA will be performed


In [18]:
# Display the scanpro result table for the heart comparison by Condition
scanpro_out.results

Unnamed: 0_level_0,baseline_props,mean_props_adult,mean_props_fetal,mean_props_young,f_statistics,p_values,adjusted_p_values
Celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cardiomyocytes,0.549464,0.273547,0.68241,0.426761,8.998537,0.000124,0.00033
Endothelial cells,0.101607,0.124538,0.102108,0.092478,0.337551,0.713515,0.713515
Epicardial cells,0.064167,0.093158,0.051415,0.07541,0.901016,0.406157,0.541542
Erythroid,0.002272,0.0,0.004433,0.0,197.947461,0.0,0.0
Fibroblast,0.182102,0.298689,0.111342,0.261924,4.602496,0.010027,0.020054
Immune cells,0.076302,0.189588,0.027546,0.108751,11.508649,1e-05,4e-05
Neurons,0.016143,0.011381,0.012643,0.02621,1.545816,0.213138,0.341021
Smooth muscle cells,0.007942,0.009099,0.008102,0.008465,0.403828,0.667759,0.713515


In [19]:
# Order propeller's rows by cell type label so the table reads side by side
# with the scanpro results
propeller_results = propeller_results.sort_index(axis=0, ascending=True)
propeller_results

Unnamed: 0,BaselineProp,PropMean.adult,PropMean.fetal,PropMean.young,Fstatistic,P.Value,FDR
Cardiomyocytes,0.549464,0.273547,0.68241,0.426761,9.068229,0.0001152704,0.0003073879
Endothelial cells,0.101607,0.124538,0.102108,0.092478,0.340093,0.711704,0.711704
Epicardial cells,0.064167,0.093158,0.051415,0.07541,0.909787,0.4026099,0.5368133
Erythroid,0.002272,-0.0,0.004433,-0.0,46.591998,5.825757e-21,4.660605e-20
Fibroblast,0.182102,0.298689,0.111342,0.261924,4.630794,0.009747019,0.01949404
Immune cells,0.076302,0.189588,0.027546,0.108751,11.575093,9.397256e-06,3.758902e-05
Neurons,0.016143,0.011381,0.012643,0.02621,1.519245,0.218877,0.3502033
Smooth muscle cells,0.007942,0.009099,0.008102,0.008465,0.421692,0.6559362,0.711704


In [20]:
# Per-celltype difference of the adjusted p-values (scanpro minus propeller FDR)
# for the heart data
scanpro_out.results["adjusted_p_values"] - propeller_results["FDR"]

Celltype
Cardiomyocytes         2.218666e-05
Endothelial cells      1.811479e-03
Epicardial cells       4.728979e-03
Erythroid             -4.660605e-20
Fibroblast             5.595227e-04
Immune cells           2.582390e-06
Neurons               -9.182506e-03
Smooth muscle cells    1.811479e-03
dtype: float64

-----------

## Human MI - 2 conditions

In [21]:
# Read the tab-separated human MI count table and expand it to one row per cell
human_MI_counts = pd.read_csv("data/human_MI_counts.tsv", delimiter="\t")
human_MI = convert_counts_to_df(counts=human_MI_counts, column_name="cell_type")

In [22]:
# scanpro on the full human MI table; `conditions` names the two labels to
# compare (the log output reports 2 conditions)
scanpro_out = scanpro.scanpro(
    human_MI,
    samples_col="sample",
    clusters_col="cell_type",
    conds_col="major_label",
    conditions=["CTRL", "IZ"],
)

scanpro_out.results

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


Unnamed: 0_level_0,baseline_props,mean_props_CTRL,mean_props_IZ,prop_ratio,t_statistics,p_values,adjusted_p_values
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.000407,0.000355,0.002084,0.170583,0.01917,0.984957,0.984957
Cardiomyocyte,0.120837,0.442471,0.152491,2.901614,2.754764,0.01467,0.080683
Cycling cells,0.013488,0.005429,0.063373,0.08566,-5.800573,3.4e-05,0.000374
Endothelial,0.065925,0.146923,0.207,0.709772,-1.280809,0.219574,0.301915
Fibroblast,0.103209,0.225862,0.271549,0.831755,-0.658671,0.52002,0.572022
Lymphoid,0.0095,0.012775,0.043002,0.29709,-2.238372,0.040671,0.09519
Mast,0.001449,0.003495,0.003041,1.149458,1.301492,0.212588,0.301915
Myeloid,0.050434,0.074778,0.185137,0.403904,-2.476843,0.025553,0.093696
Neuronal,0.004009,0.013734,0.007297,1.882211,2.206098,0.043268,0.09519
Pericyte,0.022853,0.064351,0.048875,1.31665,1.394666,0.183286,0.301915


In [23]:
# Keep only the cells labelled CTRL or IZ
human_MI_sub = human_MI.loc[
    human_MI["major_label"].isin(["CTRL", "IZ"])
]

In [24]:
# scanpro on the pre-filtered CTRL/IZ subset, without the `conditions` argument
scanpro_out = scanpro.scanpro(
    human_MI_sub,
    samples_col="sample",
    clusters_col="cell_type",
    conds_col="major_label",
)

scanpro_out.results

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


Unnamed: 0_level_0,baseline_props,mean_props_CTRL,mean_props_IZ,prop_ratio,t_statistics,p_values,adjusted_p_values
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.001024,0.000355,0.002084,0.170583,0.01917,0.984957,0.984957
Cardiomyocyte,0.304383,0.442471,0.152491,2.901614,2.754764,0.01467,0.080683
Cycling cells,0.033976,0.005429,0.063373,0.08566,-5.800573,3.4e-05,0.000374
Endothelial,0.16606,0.146923,0.207,0.709772,-1.280809,0.219574,0.301915
Fibroblast,0.259978,0.225862,0.271549,0.831755,-0.658671,0.52002,0.572022
Lymphoid,0.023929,0.012775,0.043002,0.29709,-2.238372,0.040671,0.09519
Mast,0.003651,0.003495,0.003041,1.149458,1.301492,0.212588,0.301915
Myeloid,0.127041,0.074778,0.185137,0.403904,-2.476843,0.025553,0.093696
Neuronal,0.0101,0.013734,0.007297,1.882211,2.206098,0.043268,0.09519
Pericyte,0.057564,0.064351,0.048875,1.31665,1.394666,0.183286,0.301915


In [25]:
%%R -i human_MI_sub -o propeller_results
# Reference run: propeller (speckle) on the CTRL/IZ subset, grouped by major_label
library(speckle)

propeller_results <- propeller(
    clusters = human_MI_sub$cell_type,
    sample = human_MI_sub$sample,
    group = human_MI_sub$major_label
)

Performing logit transformation of proportions
group variable has 2 levels, t-tests will be performed


In [26]:
# Sort rows by cell type and drop the redundant cluster-label column.
# Dropping by name rather than by position keeps this cell safe to re-run:
# a positional slice would remove one more column on every execution.
propeller_results = (
    propeller_results
    .sort_index()
    .drop(columns="BaselineProp.clusters", errors="ignore")
)
propeller_results

Unnamed: 0,BaselineProp.Freq,PropMean.CTRL,PropMean.IZ,PropRatio,Tstatistic,P.Value,FDR
Adipocyte,0.001024,0.000355,0.002084,0.170583,-1.442505,0.16769,0.307432
Cardiomyocyte,0.304383,0.442471,0.152491,2.901614,2.862045,0.010955,0.060253
Cycling cells,0.033976,0.005429,0.063373,0.08566,-5.845425,1.7e-05,0.000185
Endothelial,0.16606,0.146923,0.207,0.709772,-1.198902,0.246433,0.338846
Fibroblast,0.259978,0.225862,0.271549,0.831755,-0.633622,0.534463,0.534463
Lymphoid,0.023929,0.012775,0.043002,0.29709,-2.281753,0.035166,0.092934
Mast,0.003651,0.003495,0.003041,1.149458,1.052531,0.307578,0.34767
Myeloid,0.127041,0.074778,0.185137,0.403904,-2.459512,0.024498,0.089827
Neuronal,0.0101,0.013734,0.007297,1.882211,2.189828,0.042243,0.092934
Pericyte,0.057564,0.064351,0.048875,1.31665,1.274589,0.218999,0.338846


In [27]:
# Per-celltype difference of the adjusted p-values (scanpro minus propeller FDR)
# for human MI, CTRL vs IZ.
# NOTE(review): the differences here are far larger than in the PBMC and heart
# comparisons (e.g. about 0.68 for Adipocyte) - worth investigating.
scanpro_out.results["adjusted_p_values"] - propeller_results["FDR"]

cell_type
Adipocyte        0.677524
Cardiomyocyte    0.020430
Cycling cells    0.000189
Endothelial     -0.036931
Fibroblast       0.037560
Lymphoid         0.002256
Mast            -0.045755
Myeloid          0.003869
Neuronal         0.002256
Pericyte        -0.036931
vSMCs            0.051037
dtype: float64

--------------

## Human MI - 3 conditions

In [28]:
# scanpro on the full human MI table; `conditions` names the three labels to
# compare (the log output reports more than 2 conditions)
scanpro_out = scanpro.scanpro(
    human_MI,
    samples_col="sample",
    clusters_col="cell_type",
    conds_col="major_label",
    conditions=["CTRL", "IZ", "FZ"],
)

scanpro_out.results

[INFO] There are more than 2 conditions. ANOVA will be performed...
[INFO] Done!


Unnamed: 0_level_0,baseline_props,mean_props_CTRL,mean_props_FZ,mean_props_IZ,f_statistics,p_values,adjusted_p_values
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.001611,0.000355,0.007654,0.002084,1.623507,0.1972058,0.3098949
Cardiomyocyte,0.160458,0.442471,0.160938,0.152491,3.932092,0.01960262,0.05390722
Cycling cells,0.014166,0.005429,0.004619,0.063373,35.222196,5.551115e-16,6.106227e-15
Endothelial,0.106718,0.146923,0.244929,0.207,1.037603,0.354303,0.433037
Fibroblast,0.154775,0.225862,0.286012,0.271549,0.46395,0.6287953,0.6287953
Lymphoid,0.017091,0.012775,0.050783,0.043002,1.886445,0.1516098,0.2779513
Mast,0.002763,0.003495,0.007942,0.003041,0.791893,0.4529866,0.4982852
Myeloid,0.077494,0.074778,0.148286,0.185137,2.265912,0.1037354,0.2282178
Neuronal,0.006132,0.013734,0.012309,0.007297,4.372135,0.01262425,0.04628893
Pericyte,0.033166,0.064351,0.047683,0.048875,1.090112,0.3361788,0.433037


In [29]:
# Keep only the cells labelled CTRL, IZ or FZ
human_MI_sub = human_MI.loc[
    human_MI["major_label"].isin(["CTRL", "IZ", "FZ"])
]

In [30]:
%%R -i human_MI_sub -o propeller_results
# Reference run: propeller (speckle) on the CTRL/IZ/FZ subset, grouped by major_label
library(speckle)

propeller_results <- propeller(
    clusters = human_MI_sub$cell_type,
    sample = human_MI_sub$sample,
    group = human_MI_sub$major_label
)

Performing logit transformation of proportions
group variable has > 2 levels, ANOVA will be performed


In [31]:
# Order propeller's rows by cell type label so the table reads side by side
# with the scanpro results
propeller_results = propeller_results.sort_index(axis=0, ascending=True)
propeller_results

Unnamed: 0,BaselineProp,PropMean.CTRL,PropMean.FZ,PropMean.IZ,Fstatistic,P.Value,FDR
Adipocyte,0.002758,0.000355,0.007654,0.002084,2.590385,0.09853004,0.216766
Cardiomyocyte,0.274671,0.442471,0.160938,0.152491,4.084748,0.03159201,0.086878
Cycling cells,0.02425,0.005429,0.004619,0.063373,34.910101,1.456055e-07,2e-06
Endothelial,0.18268,0.146923,0.244929,0.207,0.990286,0.3873776,0.481172
Fibroblast,0.264943,0.225862,0.286012,0.271549,0.430831,0.6553169,0.655317
Lymphoid,0.029257,0.012775,0.050783,0.043002,1.967866,0.163513,0.256949
Mast,0.00473,0.003495,0.007942,0.003041,0.535401,0.5931482,0.652463
Myeloid,0.132654,0.074778,0.148286,0.185137,2.29794,0.1239966,0.227327
Neuronal,0.010496,0.013734,0.012309,0.007297,4.13984,0.02973166,0.086878
Pericyte,0.056773,0.064351,0.047683,0.048875,0.972694,0.3936866,0.481172


In [32]:
# Per-celltype difference of the adjusted p-values (scanpro minus propeller FDR)
# for human MI, CTRL vs IZ vs FZ.
# NOTE(review): several differences are sizeable (e.g. about -0.15 for Mast),
# unlike the PBMC and heart comparisons - worth investigating.
scanpro_out.results["adjusted_p_values"] - propeller_results["FDR"]

cell_type
Adipocyte        0.093129
Cardiomyocyte   -0.032971
Cycling cells   -0.000002
Endothelial     -0.048135
Fibroblast      -0.026522
Lymphoid         0.021002
Mast            -0.154178
Myeloid          0.000891
Neuronal        -0.040589
Pericyte        -0.048135
vSMCs           -0.040589
dtype: float64