In [2]:
import anndata
import scanpy as sc
import numpy as np
import os

BATCH_LABELS = ("control", "TGFB1", "BMP9", "BMP9-TGFB1")



In [3]:
combined_data = anndata.read("write/tgfb1-2.h5ad")

In [4]:
alk5_sig = ['SERPINE1', 'CTGF', 'SMAD7', 'COMP', 'NOX4', 'PMEPA4', 'RUNX1', 'PIGF', 'TAGLN', 'IL6', 'IL11', 'GJA4', 'LTBP1']
fibro_sig = ['S100A4', 'VIM', 'COL1A1', 'COL3A1', 'TSLP', 'CXCL14', 'SERPINH1']
smc_sig = ['ACTA2', 'CNN1', 'TAGLN', 'MYL9', 'MYH11', 'TAGLN2', 'MYOCDN', 'SMTN', 'SMAO', 'DES', 'LMOD1', 'P2X1', 'CRP1', 'CRP2', 'VCL', 'CALM1', 'MYLK', 'PP1', 'MYPT1', 'PPP1R12A']
alk1_sig = ['ID1', 'ID2', 'ID3', 'ID4', 'HAMP', 'PLAUR', 'SMAD6', 'HMGA2', 'IL8', 'EDN1', 'CXCL1', 'BMP6', 'HO1', 'JAG1', 'PTHLH', 'TLR4', 'RHOB', 'MAP3K5', 'ENG', 'COL5A1', 'HSP47', 'DDIT3', 'DNAJB1', 'HERPUD1', 'NEDD9', 'CRYAB', 'TPM1', 'LENG4', 'KPNA3', 'ANKRD15', 'SQLE', 'KDELR3', 'SCARA3', 'STAT1', 'IL1R1']
osteo_sig = ['RUNX2', 'OSX', 'ALPL', 'BSP', 'BGLAP', 'COL1A1', 'MSX2', 'SOX9', 'SOX5', 'DLX5', 'PIT1', 'PIT2', 'SOST']
angii_sig = ["AGTR1", "AGTR2", "MAS1"]
ca_handling_sig = ["ITPR1", "ITPR2", "ITPR3", "RYR1", "RYR2", "RYR3", "ATP2A1", "ATP2A2", "ATP2A3"]
apoptosis_sig = ['CASP1', 'CASP2', 'CASP3', 'CASP4', 'CASP5', 'CASP6', 'CASP7', 'CASP8', 'CASP9', 'CASP10', 'CASP11', 'CASP12', 'CASP12', 'CASP13', 'CASP14', 'TP53', 'BCL2']
SIGNATURES = {"alk5 signature": alk5_sig, "alk1 signature": alk1_sig, "fibro signature": fibro_sig,
              "smc signature": smc_sig, "osteo signature": osteo_sig,
              "ca2+ signature": ca_handling_sig, "apoptosis signature": apoptosis_sig}

In [5]:
combined_data

AnnData object with n_obs × n_vars = 6189 × 4606 
    obs: 'batch', 'cellular_barcode', 'n_counts', 'n_genes', 'percent_mito'
    var: 'gene_names', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'genes_before_highly_variable', 'genes_original'
    layers: 'ambiguous', 'spliced', 'unspliced'

In [6]:
def list_all_genes(anndata_matrix, cluster_no):
    log_fold_changes = np.array(anndata_matrix.uns['rank_genes_groups']['logfoldchanges'].tolist()).T[cluster_no].tolist()
    fold_changes = [i**2 for i in log_fold_changes]
    fdr = np.array(anndata_matrix.uns['rank_genes_groups']['pvals_adj'].tolist()).T[cluster_no].tolist()
    names = np.array(anndata_matrix.uns['rank_genes_groups']['names'].tolist()).T[cluster_no].tolist()
    genes = list(zip(names, fold_changes, fdr))
    return genes

In [7]:
def compare(case, control, number_of_genes, just_genes = False):
    print("\nComparing {} with {}, outputting {} genes".format(case, control, number_of_genes))
    sc.tl.rank_genes_groups(combined_data, "batch", n_genes=-1, groups = [case], reference = control, method="wilcoxon")
    all_genes = list_all_genes(combined_data, 0)
    print("Upregulated genes (gene, fold change, p-value):")
    for gene in all_genes[:number_of_genes]:
        if just_genes:
            print(gene[0])
        else:
            print(gene)
    print("Downregulated genes (gene, fold change, p-value):")
    for gene in all_genes[-number_of_genes:]:
        if just_genes:
            print(gene[0])
        else:
            print(gene)

In [8]:
def compare_signatures(case, control, genes_in_signature):
    
    print("\nComparing {} with {}, looking only at: {} genes".format(case, control, genes_in_signature))
    sc.tl.rank_genes_groups(combined_data, "batch", n_genes=-1, groups = [case], reference = control, method="wilcoxon")
    all_genes = list_all_genes(combined_data, 0)
    genes_to_stats = {i:(j, k) for i, j, k in all_genes}
    for gene in genes_in_signature:
        try:
            print(gene, genes_to_stats[gene])
        except KeyError:
            print("gene not present", gene)

You can change the number to get more/ less genes, and change just_genes=False to True if you want only genes names

# This lists top up/downregulated genes for bulkified data

In [9]:
for case, control in [["TGFB1", "control"], ["BMP9", "control"], ["BMP9-TGFB1", "control"], ["BMP9-TGFB1", "TGFB1"], ["BMP9-TGFB1", "BMP9"]]:
    compare(case, control, 50, just_genes=False)


Comparing TGFB1 with control, outputting 50 genes
Upregulated genes (gene, fold change, p-value):
('HSPA5', 14.558324233549286, 0.0)
('TNFRSF12A', 18.774712899559972, 0.0)
('SH3PXD2A', 16.47223741565631, 0.0)
('MICAL2', 29.315320529595056, 0.0)
('PMEPA1', 48.415412207265035, 0.0)
('SERPINE2', 15.989242455169972, 0.0)
('PLOD2', 22.013663989528368, 0.0)
('VCAN', 10.917544985762163, 0.0)
('TPM1', 35.111374074733476, 0.0)
('GARS', 11.339302010060294, 0.0)
('COMP', 56.587511817963104, 0.0)
('SPARC', 13.697665533889051, 0.0)
('LTBP2', 13.002456704432973, 0.0)
('SLC3A2', 13.247545789618528, 0.0)
('TNFAIP6', 38.80230746032703, 0.0)
('PSAT1', 35.49693661593574, 0.0)
('BMPR2', 9.478653938615537, 0.0)
('SLC7A11', 28.97755034718307, 0.0)
('IARS', 10.478258450370959, 0.0)
('COL8A1', 30.912459351952748, 0.0)
('B4GALT1', 11.087886455427224, 0.0)
('UNC5B', 37.427357202454004, 0.0)
('ITGB1', 3.0227731240507296, 0.0)
('MTHFD2', 21.72917304767998, 0.0)
('PLXDC2', 46.529346958123824, 0.0)
('ACTN1', 8.887

Upregulated genes (gene, fold change, p-value):
('HSPA5', 13.344634554305458, 0.0)
('PMEPA1', 52.40588023127407, 0.0)
('COMP', 69.84919437010922, 0.0)
('VCAN', 12.250467304871563, 0.0)
('TPM1', 36.05203791085137, 0.0)
('PLOD2', 23.139012396335602, 0.0)
('TNFRSF12A', 17.3101614089328, 0.0)
('GARS', 12.1529228415489, 0.0)
('SLC3A2', 14.489022649955189, 0.0)
('PSAT1', 38.74551848546639, 0.0)
('SLC7A11', 31.450608414621684, 0.0)
('BMPR2', 12.007906445146602, 0.0)
('SERPINE2', 13.916133829706268, 0.0)
('B4GALT1', 13.247075459011285, 0.0)
('SPARC', 13.730519153334399, 0.0)
('SH3PXD2A', 15.83243983124163, 0.0)
('MTHFD2', 23.155241094566236, 0.0)
('PLXDC2', 50.78060212912169, 0.0)
('UNC5B', 39.83974627175576, 0.0)
('HDGFL3', 19.40023358899998, 0.0)
('ID3', 39.593708204947916, 0.0)
('TNFAIP6', 36.900118995050434, 0.0)
('LTBP2', 12.097573101920943, 0.0)
('SLC7A5', 40.777904252769304, 0.0)
('SCX', 85.02449665120002, 0.0)
('GOLIM4', 11.200737298790045, 0.0)
('MICAL2', 24.785741273467465, 0.0)
('IA

Upregulated genes (gene, fold change, p-value):
('RPL41', 2.7088216430019116, 0.0)
('HSPA5', 11.681915501254991, 0.0)
('RPS27', 3.1105477335000273, 0.0)
('RPL34', 2.3259958021613016, 0.0)
('SEC61G', 13.447686761705938, 0.0)
('PMEPA1', 32.94252855973264, 0.0)
('MT-ND3', 3.230449629344548, 0.0)
('GARS', 11.944721056356741, 0.0)
('PSAT1', 45.78483172599226, 0.0)
('PLOD2', 22.445046615780484, 0.0)
('TNFRSF12A', 13.143688141973144, 0.0)
('VCAN', 8.425804012348692, 0.0)
('MTHFD2', 22.60974040178212, 0.0)
('SH3PXD2A', 16.054684554265123, 0.0)
('SLC3A2', 11.325794105707473, 0.0)
('HDGFL3', 16.675551762355326, 0.0)
('DGKI', 16.166783705251873, 0.0)
('ACTB', 3.4859974196333496, 0.0)
('IARS', 9.304300285647571, 0.0)
('EIF4EBP1', 17.602651511579325, 0.0)
('SPARC', 13.978454656957183, 0.0)
('COL8A1', 29.121549548814073, 0.0)
('PLXDC2', 35.967471798147926, 0.0)
('TPM1', 20.27264417090032, 0.0)
('MICAL2', 23.217827772533383, 0.0)
('COMP', 33.51116881008784, 0.0)
('TSPAN2', 56.77237110421993, 0.0)
('C

In [10]:
for case, control in [["TGFB1", "control"], ["BMP9", "control"], ["BMP9-TGFB1", "control"], ["BMP9-TGFB1", "TGFB1"], ["BMP9-TGFB1", "BMP9"]]:
    for signature in SIGNATURES.values():
        compare_signatures(case, control, signature)


Comparing TGFB1 with control, looking only at: ['SERPINE1', 'CTGF', 'SMAD7', 'COMP', 'NOX4', 'PMEPA4', 'RUNX1', 'PIGF', 'TAGLN', 'IL6', 'IL11', 'GJA4', 'LTBP1'] genes
SERPINE1 (11.41685921746921, 0.0)
CTGF (16.250030829361094, 0.0)
SMAD7 (5.708034986512359, 6.318319454579118e-190)
COMP (56.587511817963104, 0.0)
NOX4 (32.22014947105913, 0.0)
gene not present PMEPA4
RUNX1 (6.732104852340456, 0.0)
gene not present PIGF
TAGLN (11.98987609559623, 9.199522787966049e-35)
IL6 (1.5422193001081865, 1.404683831002279e-33)
IL11 (78.730977769942, 0.0)
gene not present GJA4
LTBP1 (6.644761707866564, 0.0)

Comparing TGFB1 with control, looking only at: ['ID1', 'ID2', 'ID3', 'ID4', 'HAMP', 'PLAUR', 'SMAD6', 'HMGA2', 'IL8', 'EDN1', 'CXCL1', 'BMP6', 'HO1', 'JAG1', 'PTHLH', 'TLR4', 'RHOB', 'MAP3K5', 'ENG', 'COL5A1', 'HSP47', 'DDIT3', 'DNAJB1', 'HERPUD1', 'NEDD9', 'CRYAB', 'TPM1', 'LENG4', 'KPNA3', 'ANKRD15', 'SQLE', 'KDELR3', 'SCARA3', 'STAT1', 'IL1R1'] genes
ID1 (2.3645046010712036, 3.1782204703617144e

ACTA2 (0.1284494400024414, 0.01806280552954199)
CNN1 (2.488704416309929, 0.6775293253018126)
TAGLN (0.10996131423237276, 0.8967774386465639)
MYL9 (0.7274408983212197, 9.17970556220112e-39)
MYH11 (0.002770129415990974, 0.8189767253852552)
gene not present TAGLN2
gene not present MYOCDN
gene not present SMTN
gene not present SMAO
DES (0.11698441128221848, 0.11365369177004064)
LMOD1 (2.041162826686346, 2.6123784637775133e-25)
gene not present P2X1
gene not present CRP1
gene not present CRP2
VCL (0.010337998806521365, 0.04013561057508489)
gene not present CALM1
MYLK (0.002051391358159016, 0.6953368700158771)
gene not present PP1
gene not present MYPT1
gene not present PPP1R12A

Comparing BMP9 with control, looking only at: ['RUNX2', 'OSX', 'ALPL', 'BSP', 'BGLAP', 'COL1A1', 'MSX2', 'SOX9', 'SOX5', 'DLX5', 'PIT1', 'PIT2', 'SOST'] genes
RUNX2 (0.4521573470311182, 3.4498941955361453e-10)
gene not present OSX
ALPL (0.2780321528131289, 0.19181069156949432)
gene not present BSP
gene not present B

SERPINE1 (0.06595410565536497, 1.5311866119711724e-05)
CTGF (0.35359192901614733, 3.044141260602299e-18)
SMAD7 (0.7862541802106229, 6.971260971049214e-92)
COMP (0.6974188810883994, 5.938551771586258e-78)
NOX4 (0.2894675245353575, 3.58705926251242e-24)
gene not present PMEPA4
RUNX1 (0.00010139598755328025, 0.7032875304153879)
gene not present PIGF
TAGLN (0.11684233968384561, 0.07949438462591822)
IL6 (0.6909279286478665, 3.379175064558955e-23)
IL11 (0.4388993317161436, 5.137488994339174e-24)
gene not present GJA4
LTBP1 (0.00013158765571882978, 0.7601103553500455)

Comparing BMP9-TGFB1 with TGFB1, looking only at: ['ID1', 'ID2', 'ID3', 'ID4', 'HAMP', 'PLAUR', 'SMAD6', 'HMGA2', 'IL8', 'EDN1', 'CXCL1', 'BMP6', 'HO1', 'JAG1', 'PTHLH', 'TLR4', 'RHOB', 'MAP3K5', 'ENG', 'COL5A1', 'HSP47', 'DDIT3', 'DNAJB1', 'HERPUD1', 'NEDD9', 'CRYAB', 'TPM1', 'LENG4', 'KPNA3', 'ANKRD15', 'SQLE', 'KDELR3', 'SCARA3', 'STAT1', 'IL1R1'] genes
ID1 (18.010321446165335, 7.486722590854984e-265)
ID2 (4.830528853636679,

ACTA2 (0.3004941067067648, 0.0045825121306914875)
CNN1 (40.4313409343049, 2.1706836296278591e-75)
TAGLN (11.919218646770787, 4.749708228822015e-25)
MYL9 (0.5112510446289242, 9.854007676707957e-25)
MYH11 (3.307649583621597, 0.00020162736386085883)
gene not present TAGLN2
gene not present MYOCDN
gene not present SMTN
gene not present SMAO
DES (1.7364301288087205, 3.420711840561432e-07)
LMOD1 (0.6438953938152601, 3.479431808650386e-05)
gene not present P2X1
gene not present CRP1
gene not present CRP2
VCL (0.11348534230365104, 9.048509934151024e-12)
gene not present CALM1
MYLK (0.8081663650160777, 1.1036770292710024e-27)
gene not present PP1
gene not present MYPT1
gene not present PPP1R12A

Comparing BMP9-TGFB1 with BMP9, looking only at: ['RUNX2', 'OSX', 'ALPL', 'BSP', 'BGLAP', 'COL1A1', 'MSX2', 'SOX9', 'SOX5', 'DLX5', 'PIT1', 'PIT2', 'SOST'] genes
RUNX2 (5.984301015304538, 4.513398582223894e-175)
gene not present OSX
ALPL (0.3869062186722658, 0.25444335116476835)
gene not present BSP
gen