In [1]:
# Numerical / data-handling stack
import numpy as np
import pandas as pd
from scipy.optimize import nnls

# Notebook rendering helpers
from IPython.display import HTML, Javascript, display

# Show up to 50 columns when a dataframe is rendered
pd.set_option('display.max_columns', 50)
# Inject CSS that sets the notebook ".container" element to 90% width
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Input configuration.
# Keep exactly one of the two blocks below active (comment the other one out with a
# leading '#'), depending on the format of your source files.

# --- MS Excel input: one file path and one tab (sheet) name per table ---
# Here all tables are stored in different tabs of the same workbook, but each
# *_file variable may point to a different file.
xlsx_workbook = "./data/signature_415&915_with_correction_factors.xlsx"
source_file_format = "xlsx"
signature_file, signature_tab = xlsx_workbook, "signature_915"
samples_counts_file, sample_counts_tab = xlsx_workbook, "pbmc"
correction_factors_file, correction_factors_tab = xlsx_workbook, "signature_915_correction_factor"

# --- CSV input: one file path per table ---
# IMPORTANT: set the CSV field separator and the decimal separator so that they
# match the content of your files.
#source_file_format = "csv"
#csv_separator = ";"
#decimal_separator = ","
#signature_file = "./data/signature_915.csv"
#samples_counts_file = "./data/pbmc.csv"
#correction_factors_file = "./data/signature_915_correction_factor.csv"

In [3]:
# 1) Load the 'signature' table into the signature dataframe.
# The first column is skipped when building `groups`; every remaining column
# header is treated as one cell group.
if source_file_format == "xlsx":
    signature = pd.read_excel(signature_file, signature_tab)
else:
    # Pass the decimal separator too, as the other CSV reads in this notebook do;
    # without it, values written with a decimal comma are loaded as text.
    signature = pd.read_csv(signature_file, sep=csv_separator, decimal=decimal_separator)
groups = list(signature.columns)[1:]
display(signature)

Unnamed: 0,Gene,B Memory,B Naive,Basophils LD,CD4+ effector,CD8 activated,MAIT,mDCs,Monocytes C,Monocytes NC+I,Naive T cells,Neutrophils LD,NK,pDCs,Plasmablasts,VD2-,VD2+
0,38596,1,1,0,0,0,0,0,0,0,0,12,0,1,0,0,0
1,AATK,0,0,0,0,0,0,0,2,1,0,18,1,0,0,0,0
2,ABC7-481722F1.1,0,0,0,0,0,0,1,5,2,0,17,0,0,0,0,0
3,ABCB4,9,41,0,0,1,0,5,0,0,0,0,1,1,0,0,0
4,ABCC3,4,6,5,3,2,2,4,14,65,4,3,4,3,1,2,1
5,AC002480.4,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,AC002480.5,2,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,AC002511.3,0,0,4,0,0,0,0,3,1,0,27,0,0,0,0,0
8,AC007381.3,1,2,0,0,0,0,0,0,0,0,0,0,16,0,0,0
9,AC008074.5,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0


In [4]:
# 2) Compute weighting: one weight per row of the signature dataframe,
#    weight = 1 / (largest value of that row across the cell-group columns).
strongest_signal = signature[groups].max(axis=1)
# A row whose maximum is 0 gets weight 0 instead of dividing by zero: a row of
# zeros stays a row of zeros whatever weight it is multiplied by.
weighting = (1 / strongest_signal.where(strongest_signal != 0)).fillna(0.0).tolist()
print(weighting)

[0.08333333333333333, 0.05555555555555555, 0.058823529411764705, 0.024390243902439025, 0.015384615384615385, 0.1, 0.0625, 0.037037037037037035, 0.0625, 0.07142857142857142, 0.07692307692307693, 0.04, 0.04, 0.022727272727272728, 0.04, 0.01282051282051282, 0.02040816326530612, 0.0026595744680851063, 0.05, 0.058823529411764705, 0.0043859649122807015, 0.02702702702702703, 0.03333333333333333, 0.016129032258064516, 0.05555555555555555, 0.006097560975609756, 0.0006993006993006993, 0.024390243902439025, 0.029411764705882353, 0.002544529262086514, 0.018518518518518517, 0.07692307692307693, 0.08333333333333333, 0.0017271157167530224, 0.004739336492890996, 0.0625, 0.008, 0.003125, 0.016129032258064516, 0.005263157894736842, 0.014925373134328358, 0.001440922190201729, 0.038461538461538464, 0.038461538461538464, 0.1, 0.05555555555555555, 0.0026954177897574125, 0.05555555555555555, 0.06666666666666667, 0.07692307692307693, 0.00847457627118644, 0.04, 0.0014144271570014145, 0.0009606147934678194, 0.0

In [5]:
# 3) Load the 'samples' table to be deconvoluted into the samples_counts dataframe.
# Expected layout: genes in rows, samples in columns, gene names in the first column.
if source_file_format != "xlsx":
    samples_counts = pd.read_csv(samples_counts_file, sep=csv_separator, decimal=decimal_separator)
else:
    samples_counts = pd.read_excel(samples_counts_file, sample_counts_tab)
samples = samples_counts.columns[1:].tolist()

# 4) Inner-join the signature and samples dataframes on 'Gene' so that only the
#    genes present in both are kept; the result is the merged_table dataframe.
merged_table = signature.merge(samples_counts, how='inner', on=['Gene'])
display(merged_table)

Unnamed: 0,Gene,B Memory,B Naive,Basophils LD,CD4+ effector,CD8 activated,MAIT,mDCs,Monocytes C,Monocytes NC+I,Naive T cells,Neutrophils LD,NK,pDCs,Plasmablasts,VD2-,VD2+,CYFZ,FY2H,FLWA,453W,684C,CZJE,DZQV,925L,9JD4,G4YW,4DUY,36TS,CR3L
0,38596,1,1,0,0,0,0,0,0,0,0,12,0,1,0,0,0,6.3613,26.2231,26.2987,9.9612,9.9722,7.0004,9.0447,32.6301,30.9911,9.4478,13.8695,10.6430,18.4598
1,AATK,0,0,0,0,0,0,0,2,1,0,18,1,0,0,0,0,0.2878,0.4477,0.2278,0.6010,1.6820,0.5729,1.0374,0.8120,0.8833,1.9017,0.7610,1.2797,1.1826
2,ABC7-481722F1.1,0,0,0,0,0,0,1,5,2,0,17,0,0,0,0,0,1.6323,0.0000,0.0000,0.0000,0.0000,1.6889,2.1462,1.2154,0.7113,0.7292,0.0000,0.7469,0.0000
3,ABCB4,9,41,0,0,1,0,5,0,0,0,0,1,1,0,0,0,5.1184,0.9480,1.0784,1.5245,3.0407,6.3943,1.4214,6.3245,1.5337,1.5822,2.9022,1.9564,5.3627
4,ABCC3,4,6,5,3,2,2,4,14,65,4,3,4,3,1,2,1,19.8885,26.7250,15.1533,12.0063,21.6782,11.6971,11.5324,37.5237,16.4556,13.7068,20.1468,28.3676,18.6198
5,AC002480.4,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9663,0.8157,0.0000,0.0000,0.0000,0.0000,0.0000
6,AC002480.5,2,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.9709,0.0826,0.6328,1.0230,2.2298,0.4307,1.7057,1.6671,1.9210,0.0000,0.9543,0.7179,2.3251
7,AC002511.3,0,0,4,0,0,0,0,3,1,0,27,0,0,0,0,0,0.3507,0.3383,0.0000,0.0000,0.3373,0.0000,2.1915,0.4653,0.0000,0.0000,0.0000,0.3223,1.2171
8,AC007381.3,1,2,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0.0000,0.6376,0.0000,0.0000,0.5855,0.8954,0.4099,0.2527,0.3560,0.0000,0.0000,0.0000,0.7245
9,AC008074.5,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.3932,0.0000,0.0000,0.0000


In [6]:
# 5) Apply weighting: multiply every numeric value of each row of merged_table by that
#    row's weight, weight = 1 / (largest signature value of the row).
# The result is the weighted merged signature/samples dataframe weighted_merged_table.
#
# The weights are derived from merged_table itself rather than looked up by row number
# in the `weighting` list built from the unmerged signature table: the inner merge
# renumbers rows, so as soon as one signature gene is absent from the samples table the
# positional lookup would apply weights to the wrong genes.
col_headers = list(merged_table.columns)
gene_column = merged_table.iloc[:, :1]                    # first column: gene names, left untouched
numeric_part = merged_table.iloc[:, 1:]                   # signature groups followed by samples
signature_part = merged_table.iloc[:, 1:1 + len(groups)]  # the cell-group columns only
row_max = signature_part.max(axis=1)
# Rows whose signature maximum is 0 get weight 0 instead of dividing by zero.
row_weights = (1 / row_max.where(row_max != 0)).fillna(0.0)
# One vectorized row-wise multiplication replaces the row-by-row DataFrame.append loop
# (quadratic, and DataFrame.append no longer exists in pandas >= 2.0).
weighted_merged_table = pd.concat([gene_column, numeric_part.mul(row_weights, axis=0)], axis=1)
display(weighted_merged_table)

Unnamed: 0,Gene,B Memory,B Naive,Basophils LD,CD4+ effector,CD8 activated,MAIT,mDCs,Monocytes C,Monocytes NC+I,Naive T cells,Neutrophils LD,NK,pDCs,Plasmablasts,VD2-,VD2+,CYFZ,FY2H,FLWA,453W,684C,CZJE,DZQV,925L,9JD4,G4YW,4DUY,36TS,CR3L
0,38596,0.083333,0.083333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.083333,0.000000,0.000000,0.000000,0.530108,2.185258,2.191558,0.830100,0.831017,0.583367,0.753725,2.719175,2.582592,0.787317,1.155792,0.886917,1.538317
1,AATK,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.111111,0.055556,0.000000,1.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.015989,0.024872,0.012656,0.033389,0.093444,0.031828,0.057633,0.045111,0.049072,0.105650,0.042278,0.071094,0.065700
2,ABC7-481722F1.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.058824,0.294118,0.117647,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.096018,0.000000,0.000000,0.000000,0.000000,0.099347,0.126247,0.071494,0.041841,0.042894,0.000000,0.043935,0.000000
3,ABCB4,0.219512,1.000000,0.000000,0.000000,0.024390,0.000000,0.121951,0.000000,0.000000,0.000000,0.000000,0.024390,0.024390,0.000000,0.000000,0.000000,0.124839,0.023122,0.026302,0.037183,0.074163,0.155959,0.034668,0.154256,0.037407,0.038590,0.070785,0.047717,0.130798
4,ABCC3,0.061538,0.092308,0.076923,0.046154,0.030769,0.030769,0.061538,0.215385,1.000000,0.061538,0.046154,0.061538,0.046154,0.015385,0.030769,0.015385,0.305977,0.411154,0.233128,0.184712,0.333511,0.179955,0.177422,0.577288,0.253163,0.210874,0.309951,0.436425,0.286458
5,AC002480.4,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.096630,0.081570,0.000000,0.000000,0.000000,0.000000,0.000000
6,AC002480.5,0.125000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.060681,0.005163,0.039550,0.063937,0.139363,0.026919,0.106606,0.104194,0.120063,0.000000,0.059644,0.044869,0.145319
7,AC002511.3,0.000000,0.000000,0.148148,0.000000,0.000000,0.000000,0.000000,0.111111,0.037037,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012989,0.012530,0.000000,0.000000,0.012493,0.000000,0.081167,0.017233,0.000000,0.000000,0.000000,0.011937,0.045078
8,AC007381.3,0.062500,0.125000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.039850,0.000000,0.000000,0.036594,0.055962,0.025619,0.015794,0.022250,0.000000,0.000000,0.000000,0.045281
9,AC008074.5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028086,0.000000,0.000000,0.000000


In [7]:
# 6) Non-negative least squares: regress each weighted sample column on the weighted
#    signature columns, giving one estimate per cell group for each sample.
n_groups = len(groups)
weighted_merged_table_values = weighted_merged_table.iloc[:, 1:].values
# The first n_groups value columns are the signature; the sample columns follow.
weighted_signature_group_values = weighted_merged_table_values[:, :n_groups]
x = np.zeros((n_groups, len(samples)))
for i, sample_name in enumerate(samples):
    weighted_sample_column_values = weighted_merged_table_values[:, n_groups + i]
    # nnls returns (solution, residual norm); only the solution is kept, scaled by 100.
    solution = nnls(weighted_signature_group_values, weighted_sample_column_values)[0]
    x[:, i] = solution * 100
# Transpose so that rows are samples and columns are cell groups.
nnls_result = x.T
print(nnls_result)
np.savetxt('./data/proportions.csv', nnls_result, delimiter=',')

[[  1.90238036   5.0770915    1.2438571   14.43982555  17.32577842   0.
    1.87377192  20.56443487  14.2362233    7.43961828   1.86274023
    6.61953308   0.69593552   2.52767383   0.20111182   0.19789874]
 [  0.54654643   5.37328168   1.48449354  12.44031654  16.28416081   0.
    3.69791536  41.5899806    9.40840327   4.71370382   2.94057139
    0.13165549   1.26976602   1.11230814   6.10486746   0.        ]
 [  1.94586737   4.90955191   2.49746016  14.56558399   5.757227
    1.56312971   8.06048261  31.18093094   1.36142511   6.6346491
    3.74149084  10.48915877   0.57287247   2.32053783   0.           0.73315304]
 [  1.61452485   3.18521926   1.32902163  21.02225424  13.83459086   0.
    3.50977033  28.64702985   3.2170583    5.83892917   3.61918634
    4.96777811   0.92075082   0.74965982   0.           0.        ]
 [  2.54897846   8.38709443   1.6125236   19.06780223  11.20529197
    1.69896019   3.8575349   30.29121579   5.79357556  13.59251763
    1.81176443   4.82607449   0.1

In [8]:
# 7) Load the 'correction factors' table and multiply each estimate by the correction
#    factor of its cell group.
if source_file_format == "xlsx":
    correction_factors_table = pd.read_excel(correction_factors_file, correction_factors_tab, index_col=0)
else:
    correction_factors_table = pd.read_csv(correction_factors_file, sep=csv_separator, decimal=decimal_separator, index_col=0)
correction_factors = correction_factors_table.values
# Factors are matched to cell groups by position (row k of the table <-> groups[k]),
# using the first value column of the table.
if len(correction_factors) < len(groups):
    raise ValueError(
        "correction factors table has %d rows but the signature defines %d cell groups"
        % (len(correction_factors), len(groups))
    )
group_factors = np.asarray(correction_factors[:len(groups), 0], dtype=float)
# Broadcasting scales column k of every sample row by the factor of group k.
corrected_estimates = np.asarray(nnls_result) * group_factors
result = {
    sample_name: estimates.tolist()
    for sample_name, estimates in zip(samples, corrected_estimates)
}
result_table = pd.DataFrame.from_dict(result, orient='index', columns=groups)
display(result_table)

Unnamed: 0,B Memory,B Naive,Basophils LD,CD4+ effector,CD8 activated,MAIT,mDCs,Monocytes C,Monocytes NC+I,Naive T cells,Neutrophils LD,NK,pDCs,Plasmablasts,VD2-,VD2+
CYFZ,1.141428,5.584801,1.408901,12.753707,20.716189,0.0,0.214089,13.000675,7.080517,11.791882,1.699132,11.680384,0.297302,0.190972,0.190818,0.194924
FY2H,0.327928,5.91061,1.681467,10.987678,19.470741,0.0,0.422507,26.292861,4.679356,7.471275,2.682295,0.23231,0.542441,0.084038,5.79238,0.0
FLWA,1.16752,5.400507,2.828841,12.86478,6.883835,3.501839,0.920955,19.712341,0.677117,10.515996,3.412868,18.508466,0.24473,0.175322,0.0,0.722131
453W,0.968715,3.503741,1.505366,18.567514,16.541825,0.0,0.401011,18.110429,1.600034,9.254771,3.301306,8.765808,0.393342,0.056639,0.0,0.0
684C,1.529387,9.225804,1.826485,16.841281,13.39801,3.806136,0.440745,19.14987,2.881488,21.544299,1.652634,8.515767,0.064514,0.753976,0.0,0.0
CZJE,2.227285,4.931393,1.345029,11.157453,9.045848,3.838032,1.110531,16.865316,2.464681,16.889595,2.348813,16.827433,0.40502,0.160025,0.0,0.0
DZQV,2.574962,5.982762,1.750653,6.41362,6.243026,4.813099,0.13035,19.925768,1.553655,16.142848,2.640308,6.169083,0.269187,0.508099,4.815093,2.986702
925L,0.826403,16.337256,2.866362,12.25772,10.963999,4.709569,0.411195,17.671864,2.610857,7.366658,3.34952,8.986852,0.362587,0.091881,0.0,1.708578
9JD4,2.680383,4.133501,1.44449,13.924825,10.371776,1.80283,0.456948,12.902036,2.950119,25.409954,1.649718,9.049992,0.608978,0.252259,1.372208,0.0
G4YW,2.265205,4.958247,0.219024,13.386038,0.0,2.268609,0.175454,21.113052,3.969119,15.736812,0.647301,17.04162,0.213492,0.324163,4.274564,0.475395


In [None]:
def csv_download(table, table_name):
    """Build a Javascript display object that makes the browser download ``table`` as CSV.

    Parameters
    ----------
    table
        Object exposing ``to_csv(index=True)`` that returns the CSV text
        (the notebook passes a pandas DataFrame).
    table_name
        Name of the downloaded file without extension; ``.csv`` is appended.

    Returns
    -------
    IPython.display.Javascript
        Script that wraps the CSV text in a Blob and clicks a temporary download link
        (or calls ``navigator.msSaveBlob`` when that function exists).
    """
    import json  # local import: only needed to build the JavaScript string literals

    # json.dumps emits a double-quoted literal with quotes, backslashes, newlines and
    # non-ASCII characters escaped, which JavaScript accepts as a string literal.
    # The previous hand-rolled escaping (`.replace("'", "\'")`) was a no-op, so a single
    # quote or a backslash in the data or in the file name broke the generated script.
    csv_literal = json.dumps(table.to_csv(index=True))
    filename_literal = json.dumps('%s.csv' % table_name)

    js_download = """
    var csv = %s;

    var filename = %s;
    var blob = new Blob([csv], { type: 'text/csv;charset=utf-8;' });
    if (navigator.msSaveBlob) { // IE 10+
        navigator.msSaveBlob(blob, filename);
    } else {
        var link = document.createElement("a");
        if (link.download !== undefined) { // feature detection
            // Browsers that support HTML5 download attribute
            var url = URL.createObjectURL(blob);
            link.setAttribute("href", url);
            link.setAttribute("download", filename);
            link.style.visibility = 'hidden';
            document.body.appendChild(link);
            link.click();
            document.body.removeChild(link);
        }
    }
    """ % (csv_literal, filename_literal)

    return Javascript(js_download)

# Offer result_table as "deconvolution_result_table.csv"; the returned Javascript
# object is the cell's last expression, so the notebook displays (runs) it.
csv_download(table=result_table, table_name="deconvolution_result_table")