# Dream Challenge deconvolution

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor


In [1]:
import os
import errno
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score, train_test_split, ShuffleSplit,GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import NuSVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


### Load DREAM input and our GEPs
Need to specify:
* in_dir
* gold_standards_file
* gold_standards_dir
* signature_path
* drop['others']?

In [12]:
# Run configuration for this notebook.
# docker: selects between the two path set-ups below.
# coarse: not read anywhere in this cell.
docker = False
coarse = True

if docker:
    # Fixed layout: read ./input/input.csv, write ./output/predictions.csv.
    in_name = "input.csv"
    in_dir = "./input"
    in_path = os.path.join(in_dir, in_name)

    out_name= "predictions.csv"
    out_dir = "./output"
    out_path = os.path.join(out_dir, out_name)

    # input_df is the dataset description table; the two signature matrices
    # (GEPs) are indexed by their 'NAME' column.
    input_df = pd.read_csv(in_path)
    crc_gep = pd.read_csv("signatures/ileal_merged.csv", index_col='NAME')
    brca_gep = pd.read_csv("signatures/brca.gct", sep='\t', index_col='NAME')

else:
    # Local layout: pick the input folder by (un)commenting one in_dir line.
    in_name = "input.csv"
    #in_dir = "./input/leaderboard_1"
    #in_dir = "./input/leaderboard_3"
    in_dir = "./input/roche"
    in_path = os.path.join(in_dir, in_name)
    
    # load.csv is a bookkeeping table with the columns 'algorithm' and 'truth';
    # it is reused when it already exists, otherwise started empty.
    out_dir = "./output"
    out_load = "load.csv"
    if os.path.isfile(os.path.join(out_dir, out_load)):
        load_df = pd.read_csv(os.path.join(out_dir, out_load))
    else:
        load_df = pd.DataFrame( columns = ['algorithm','truth'])
        # TODO implement dataset folder naming - dream vs epic, etc
        #load_df = pd.DataFrame( columns = ['algorithm', 'validation_file'])
        
    #####
    # Gold standard selection: keep the file/dir pair matching in_dir active.
    #gold_standards_file = "lb_fine_r1"
    #gold_standards_dir = "./gold_standards/"
    
    #gold_standards_file = "lb_fine_r3"
    #gold_standards_dir = "./gold_standards/"
    
    gold_standards_file = "GSE134809-truth"
    gold_standards_dir = "./gold_standards/roche"
    
    # gold_standards_file is also embedded in the output file names written later.
    gold_standards_path = os.path.join(gold_standards_dir, gold_standards_file + '.csv')
    
    #####
    # Signature selection: exactly one crc_path line should be active.
    # TODO
    #crc_path = "signatures/GSE134809_human_ileal_Crohns_filtered.gct"
    #crc_path = "signatures/ileal_merged.csv"
    #crc_path = 'signatures/generated/ileal-smilie-raw-scanorama.csv'
    #crc_path = 'signatures/generated/ileal-smilie-raw-union-scanorama.csv'
    #crc_path = 'signatures/generated/ileal-smilie-ibd-raw-scanorama.csv'
    crc_path = 'signatures/generated/ileal-smilie-brca-neutro-rmCells-scanorama.csv'
    #crc_path = 'signatures/generated/ileal-smilie-brca-neutro-rmCells-union-scanorama.csv'
    brca_path = "signatures/brca.gct"
    #signature_path = "signatures/smillie2019_human_ibd.gct"
    #signature_path = "signatures/generated/lm22_dream.gct"
    
    # The CRC signature is read as comma-separated, the BRCA one as tab-separated;
    # both are indexed by their 'NAME' column.
    crc_gep = pd.read_csv(crc_path, index_col='NAME')
    brca_gep = pd.read_csv(brca_path, sep='\t', index_col='NAME')
    
    # NOTE: out_path is not set in this branch; the deconvolution loop defines it.
    input_df = pd.read_csv(in_path)
    
    


In [13]:
# Replace missing entries of the CRC signature with 0.
crc_gep = crc_gep.fillna(0)
# NOTE(review): where() keeps every entry that is != 0 and writes 0 into the
# remaining ones, i.e. into entries that already compare equal to 0. This looks
# like a no-op - confirm the intended threshold.
crc_gep = crc_gep.where(crc_gep != 0, 0)

### Signature modifications

In [14]:
# Display the CRC signature (still including the 'Description' column) for inspection.
crc_gep

Unnamed: 0_level_0,Description,NK.cells,endothelial.cells,fibroblasts,macrophages,memory.B.cells,memory.CD4.T.cells,memory.CD8.T.cells,monocytes,myeloid.dendritic.cells,naive.B.cells,naive.CD4.T.cells,naive.CD8.T.cells,neutrophils,regulatory.T.cells
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ENSG00000175899,A2M,0.000,0.067,0.046,0.012,0.001,0.002,0.001,0.005,0.000,0.001,0.001,0.001,0.000,0.001
ENSG00000245105,A2M-AS1,0.001,-0.001,-0.001,0.000,0.000,0.002,0.002,0.000,0.000,0.000,0.001,0.000,0.001,0.000
ENSG00000141338,ABCA8,0.000,0.000,0.020,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
ENSG00000231621,AC013264.2,0.001,-0.001,-0.001,0.000,0.000,0.013,0.004,0.001,0.000,0.001,0.020,0.021,0.010,0.003
ENSG00000224137,AC079767.4,0.000,0.000,0.000,0.001,0.027,0.000,0.000,0.001,0.001,0.016,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000101443,WFDC2,0.000,0.000,0.003,0.000,0.001,0.001,0.001,0.001,0.000,0.001,0.001,0.000,0.000,0.001
ENSG00000143184,XCL1,0.036,0.001,0.001,0.000,0.000,0.002,0.024,0.000,0.000,0.000,0.002,0.001,0.017,0.001
ENSG00000143185,XCL2,0.042,-0.001,-0.001,0.000,0.000,0.001,0.020,0.000,0.000,0.000,0.000,0.000,0.012,-0.001
ENSG00000178381,ZFAND2A,0.003,0.003,0.001,0.002,0.005,0.007,0.005,0.004,0.002,0.004,0.010,0.008,0.006,0.008


In [15]:
# Reduce both signature matrices to their numeric cell-type columns and make
# sure they contain no missing values.
crc_gep.drop(['Description'], axis=1, inplace=True)
crc_gep = crc_gep.fillna(0)
crc_gep = crc_gep.where(crc_gep != 0, 0)
#crc_gep.drop(['Description', 'others', 'myeolid.cells'], axis=1, inplace=True)
#crc_gep = np.log1p(crc_gep)
#crc_gep /= 100 #cp10k -> tpm
#crc_gep.drop(['Description'], axis=1, inplace=True)

# The BRCA signature additionally carries 'others' and 'cancer.cells', which
# are not deconvoluted here.
brca_gep.drop(['Description', 'others', 'cancer.cells'], axis=1, inplace=True)
brca_gep = brca_gep.fillna(0)
# The mask must be built from brca_gep itself. Masking with `crc_gep != 0`
# aligns the CRC matrix onto the BRCA one and silently zeroes every BRCA
# gene/cell type that is absent from (or zero in) the CRC signature.
brca_gep = brca_gep.where(brca_gep != 0, 0)
#brca_gep = np.log1p(brca_gep)


In [16]:
# Show (n_genes, n_cell_types) of the cleaned CRC signature.
crc_gep.shape

(528, 14)

In [17]:
# Per-dataset columns of the input description table; the deconvolution loops
# iterate over them in lockstep.
dataset_names = input_df['dataset.name']
scales = input_df['scale']
c_types = input_df['cancer.type']
#native_probe = input_df['native.probe.type']
#expression_files = input_df['hugo.expr.file']
expression_files = input_df['ensg.expr.file']

# Expression file names are relative to the input directory.
expression_paths = [
    os.path.join(in_dir, expression_file)
    for expression_file in expression_files
]
    

def load_expression_file(expression_path):
    """Read a bulk expression CSV and index it by its 'Gene' column.

    Args:
        expression_path: path (or file-like object) handed to ``pandas.read_csv``

    Returns:
        pandas.DataFrame indexed by 'Gene', one column per remaining CSV column
    """
    return pd.read_csv(expression_path, index_col='Gene')


In [18]:
# TODO
# deal with duplicate genes in sample bulk and in signature genes
# Note: do not merge samples - not all genes are intersecting across samples

### nu-SVR without CV:

In [19]:

def intersect_genes(bulk_sample, signature_matrix):
    """Find intersecting subset of genes between sample and signature matrix

    Rows of ``bulk_sample`` whose first column is NaN are discarded before the
    gene sets are intersected.

    Args:
        bulk_sample (pandas.DataFrame): bulk expression indexed by gene
        signature_matrix (pandas.DataFrame): GEP indexed by gene

    Returns:
        tuple of (bulk rows, signature rows), both selected with the same
        index of shared genes

    """
    #TODO add warning if only few genes are common

    # drop rows with nan
    bulk_sample_notna = bulk_sample[pd.notna(bulk_sample.iloc[:, 0:1]).any(axis=1)]

    # Use the explicit set operation: `Index & Index` as an intersection is
    # deprecated and is an element-wise logical AND in pandas >= 2.0.
    idx = bulk_sample_notna.index.intersection(signature_matrix.index)
    return bulk_sample_notna.loc[idx], signature_matrix.loc[idx]


def build_model(bulk_sample_subset, signature_matrix_subset):
    """Grid-search a standardised linear nu-SVR of bulk RNA (y) on the signature matrix (X)

    y = w * X | w = 'predicted cell type fractions'

    Args:
        bulk_sample_subset (pandas.DataFrame): bulk RNA, flattened to a 1-d target
        signature_matrix_subset (pandas.DataFrame): GEP, one column per cell type

    Returns:
        The fitted GridSearchCV instance

    """
    target = bulk_sample_subset.values.ravel()

    search_space = {
        'svr__nu' : [0.25, 0.5, 0.75],
        'svr__C' : [1e-2, 1e-1, 1],
        'svr__kernel' : ['linear'],
        'svr__verbose' : [True],
    }

    # no CV - we want to minimize training error and not generalization error,
    # so a single split holding out 1% of the genes is used
    single_split = ShuffleSplit(test_size=0.01, n_splits=1)

    scaled_svr = Pipeline(steps=[('scale', StandardScaler()), ('svr', NuSVR())])
    search = GridSearchCV(
        scaled_svr,
        param_grid=search_space,
        cv=single_split,
        scoring='neg_mean_squared_error',
        verbose=10,
        n_jobs=-1,
    )
    search.fit(signature_matrix_subset, target)
    return search

# following methods are temporary
def rm_neg(coefs):
    """Clip negative coefficients of the weight vector to zero

    Args:
        coefs (numpy.array): regression coefficients, interpreted as cell fractions

    Returns:
        a copy of ``coefs`` in which every value below 0 is replaced by 0

    """
    clipped = coefs.copy()
    is_negative = clipped < 0
    clipped[is_negative] = 0
    return clipped

def rm_small(coefs, cutoff=0.00001):
    """Remove small weight vector coefficients based on a cutoff

    Args:
        coefs (numpy.array): coefficients from the regression model
        cutoff (float): values strictly below this threshold are set to 0
            (this includes all negative values); defaults to the previously
            hard-coded 0.00001

    Returns:
        a copy of ``coefs`` with the small entries zeroed

    """
    # improvement: use standard deviation
    c = coefs.copy()
    c[c < cutoff] = 0
    return c

def rescale(coefs):
    """Rescale the weight vector so that the total sums up to 1

    Args:
        coefs (numpy.array): non-negative coefficients

    Returns:
        a copy of ``coefs`` divided by its sum; if the sum is 0 (e.g. every
        coefficient was clipped away) the unscaled copy is returned instead
        of a vector of NaN values

    """
    c = coefs.copy()
    total = np.sum(c)
    if total == 0:
        # Nothing to normalise - dividing would turn every fraction into NaN.
        return c
    c *= 1.0 / total
    return c

def append_missing_celltype(df, dataset_name, sample, cell_type, prediction=0):
    """Appends missing cell type to the output

    Useful until we are able to integrate signatures with a full set of cell types

    Args:
        df (pandas.DataFrame): result df to be appended to (left unmodified)
        dataset_name (string)
        sample (string)
        cell_type (string)
        prediction (number): predicted proportion for the added cell type, 0 by default

    Returns:
        new pandas.DataFrame with one extra row for the cell type and a fresh
        0..n-1 index

    """
    new_row = pd.DataFrame([{"dataset.name" : dataset_name,
                             "sample.id" : sample,
                             "cell.type" : cell_type,
                             "prediction" : prediction}])
    # DataFrame.append was removed in pandas 2.0; concat builds a new frame
    # as well, so the caller's df is never mutated.
    return pd.concat([df, new_row], ignore_index=True)

def linearize(bulk, scale):
    """Undo a log transformation of the expression data

    Some data inputs from the DREAM challenge are log2 or log10 scaled and need to be linearized.

    Args:
        bulk (pandas.DataFrame): Bulk expression matrix
        scale (String): The scale of the expression data (i.e., Log2, Log10, Linear)
    Returns:
        ``base ** bulk`` for the scales 'Log2' and 'Log10'; for any other
        scale the input object is returned untouched

    """
    log_bases = {'Log2': 2, 'Log10': 10}
    base = log_bases.get(scale)
    if base is None:
        return bulk
    return base ** bulk
    
def to_coarse(result_df):
    """Converts fine grained cell type predictions to coarse grained.

    This is done by relabelling every fine grained cell type with its coarse
    grained parent and summing up the fractions per dataset and sample. Cell
    types without a mapping (e.g. NK.cells) keep their label.

    Args:
        result_df (pandas.DataFrame): Result table with the columns
            'dataset.name', 'sample.id', 'cell.type' and 'prediction'
    Returns:
        pandas.DataFrame result table with coarse-grained cell type predictions,
        one row per (dataset.name, sample.id, cell.type)

    """

    # The coarse label for monocytes / dendritic cells / macrophages is
    # 'monocytic.lineage' (the previous 'monolytic.lineage' was a typo and
    # would not match the challenge's coarse-grained cell type names).
    dict_coarse = {'memory.B.cells' : 'B.cells',
                   'naive.B.cells' : 'B.cells',
                   'memory.CD4.T.cells' : 'CD4.T.cells',
                   'naive.CD4.T.cells' : 'CD4.T.cells',
                   'regulatory.T.cells' : 'CD4.T.cells',
                   'memory.CD8.T.cells' : 'CD8.T.cells',
                   'naive.CD8.T.cells' : 'CD8.T.cells',
                   'monocytes' : 'monocytic.lineage',
                   'myeloid.dendritic.cells' : 'monocytic.lineage',
                   'macrophages' : 'monocytic.lineage',
                  }

    coarse_df = result_df.replace({'cell.type' : dict_coarse})
    coarse_df = coarse_df.groupby(['dataset.name', 'sample.id', 'cell.type']).sum().reset_index()
    return coarse_df
    

In [20]:
# for nu-SVR
# Deconvolute every sample of every dataset, writing the accumulated
# predictions (and, for non-docker runs, the selected hyper-parameters) to disk
# after each sample so that partial results survive an interrupted run.
result_columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction']
result_df = pd.DataFrame(columns=result_columns)
param_df = pd.DataFrame(columns=['dataset.name', 'sample.id'])
result_frames = []      # per-sample prediction tables
param_rows = []         # per-sample best hyper-parameters (non-docker runs)
prediction_file = None  # name of the predictions CSV once a non-docker run wrote it

for dataset_name, c_type, scale, expression_path in zip(dataset_names, c_types, scales, expression_paths):
    samples_df = load_expression_file(expression_path)

    samples_df = linearize(samples_df, scale)

    # TODO this is dirty code
    #c_type = 'BRCA'

    # BRCA datasets use the BRCA signature, everything else the CRC one.
    signature_dream = brca_gep.copy() if c_type == 'BRCA' else crc_gep.copy()

    n_samples = samples_df.shape[1]
    # enumerate() gives the progress counter directly; columns.get_loc() does
    # not return an integer when column names are duplicated.
    for sample_number, sample in enumerate(samples_df, start=1):
        print('Deconvoluting dataset {:6} and sample [name: {:3}] [{} out of {}]'.format(dataset_name, sample, sample_number, n_samples))

        bulk_sample_subset, signature_matrix_subset = intersect_genes(samples_df[[sample]], signature_dream)
        grid = build_model(bulk_sample_subset, signature_matrix_subset)
        estimator = grid.best_estimator_.named_steps['svr']

        # One coefficient per signature column; negative weights are clipped
        # and the remainder normalised to sum to 1.
        fractions = estimator.coef_[0]
        fractions = rm_neg(fractions)
        fractions = rescale(fractions)

        out_df = pd.DataFrame({
            'dataset.name': dataset_name,
            'sample.id': sample,
            'cell.type': list(signature_matrix_subset.columns),
            'prediction': fractions,
        }, columns=result_columns)

        # TODO remove once integrated GEPs: for non-BRCA datasets the missing
        # 'neutrophils' / 'monocytes' rows used to be added here with
        # append_missing_celltype(); currently nothing is appended.

        if docker:
            out_path = os.path.join(out_dir, out_name)
        else:
            out_name = str(len(load_df)) + "_" + str(type(estimator).__name__)

            # Record the winning grid-search parameters of this sample; keys
            # missing for some samples simply become NaN in the table.
            p_dict = {'dataset.name' : dataset_name, 'sample.id' : sample}
            for key, value in grid.best_params_.items():
                p_dict[str(key)] = value
            param_rows.append(p_dict)

            param_df = pd.DataFrame(param_rows)
            param_df.to_csv(os.path.join(out_dir,'param_' + out_name + gold_standards_file + '.csv'), header=True, index=False)

            prediction_file = out_name + '_predict_' + gold_standards_file + '.csv'
            out_path = os.path.join(out_dir, prediction_file)

        # DataFrame.append was removed in pandas 2.0 - rebuild from the parts.
        result_frames.append(out_df)
        result_df = pd.concat(result_frames, ignore_index=True)
        result_df.to_csv(out_path, header=True, index=False)

# Register the prediction file together with its gold standard, but only when
# a file was actually written (out_name is undefined if no sample was processed).
if not docker and prediction_file is not None:
    load_row = [prediction_file, gold_standards_path]
    load_df.loc[len(load_df)] = load_row
    load_df.to_csv(os.path.join(out_dir, out_load), header=True, index=False)

Deconvoluting dataset GSE134809 and sample [name: S0 ] [1 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    2.6s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    2.6s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    2.6s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    2.6s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    2.7s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    2.7s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0178s.) Setting batch_size=22.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | 

[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S1 ] [2 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S2 ] [3 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0166s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0177s.) Setting batch_size=22.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out o

[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S3 ] [4 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S4 ] [5 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0160s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend w

[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S5 ] [6 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S6 ] [7 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0160s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  

[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S7 ] [8 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S8 ] [9 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0153s.) Setting batch_size=26.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0154s.) Setting batch_size=26.
[Parallel(n_jobs=-1)]: Done   2 out of   9

[LibSVM]Deconvoluting dataset GSE134809 and sample [name: S9 ] [10 out of 10]
Fitting 1 folds for each of 9 candidates, totalling 9 fits
[LibSVM]

[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0159s.) Setting batch_size=24.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished


### nu-SVR speedup

- Consider normalising both the signature matrix and the bulk vector to zero mean and unit variance; this helps speed up the fit.

In [None]:

def build_model(bulk_sample_subset, signature_matrix_subset):
    """Grid-search a standardised linear nu-SVR with 5-fold cross-validation

    y = w * X | w = 'predicted cell type fractions'

    Args:
        bulk_sample_subset (pandas.DataFrame): bulk RNA (y), flattened to a 1-d target
        signature_matrix_subset (pandas.DataFrame): GEP (X)

    Returns:
        The fitted GridSearchCV instance

    """
    target = bulk_sample_subset.values.ravel()

    search_space = {
        'svr__nu' : [0.25, 0.5, 0.75],
        'svr__C' : [1e-2, 1e-1, 1],
        'svr__kernel' : ['linear'],
        'svr__verbose' : [True],
    }

    scaled_svr = Pipeline(steps=[('scale', StandardScaler()), ('svr', NuSVR())])
    search = GridSearchCV(
        scaled_svr,
        param_grid=search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=10,
        n_jobs=-1,
    )
    search.fit(signature_matrix_subset, target)
    return search


In [None]:
# using sklearn.svm.LinearSVR
def build_model(bulk_sample_subset, signature_matrix_subset):
    """Grid-search a LinearSVR of bulk RNA (y) on the signature matrix (X) with 5-fold CV

    y = w * X | w = 'predicted cell type fractions'

    Args:
        bulk_sample_subset (pandas.DataFrame): bulk RNA, flattened to a 1-d target
        signature_matrix_subset (pandas.DataFrame): GEP

    Returns:
        The fitted GridSearchCV instance

    """
    target = bulk_sample_subset.values.ravel()

    search_space = {
        'epsilon' : [0.25, 0.5, 0.75],
        'C' : [1e-2, 1e-1, 1],
        'verbose' : [True],
        'loss' : ['squared_epsilon_insensitive'],
    }

    search = GridSearchCV(
        LinearSVR(),
        param_grid=search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=10,
        n_jobs=-1,
    )
    search.fit(signature_matrix_subset, target)
    return search


In [None]:
# for linear svr
# NOTE: relies on `signature_dream` and `out_path` left behind by a previously
# executed cell; the expression data is used as loaded (no linearize() call).
result_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
param_df = pd.DataFrame(columns =['dataset.name', 'sample.id', 'epsilon', 'C'])
result_frames = []  # per-sample prediction tables
for dataset_name, expression_path in zip(dataset_names, expression_paths):
    samples_df = load_expression_file(expression_path)
    for sample in samples_df:
        bulk_sample_subset, signature_matrix_subset = intersect_genes(samples_df[[sample]], signature_dream)
        grid = build_model(bulk_sample_subset, signature_matrix_subset)
        fractions = grid.best_estimator_.coef_

        fractions = rm_neg(fractions)
        fractions = rescale(fractions)

        out_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
        out_df['cell.type'] = signature_matrix_subset.columns
        out_df['prediction'] = fractions
        out_df['dataset.name'] = dataset_name
        out_df['sample.id'] = sample

        # Log the hyper-parameters selected for this sample.
        p_epsilon =  grid.best_params_['epsilon']
        p_C = grid.best_params_['C']
        p_row = [dataset_name, sample, p_epsilon, p_C]
        param_df.loc[len(param_df)] = p_row
        param_df.to_csv('param.csv', header=True, index=False)

        # DataFrame.append was removed in pandas 2.0 - rebuild from the parts.
        result_frames.append(out_df)
        result_df = pd.concat(result_frames, ignore_index=True)
        result_df.to_csv(out_path, header=True, index=False) # because gridsearch for SVR so slow - output each step

## Appendix

In [None]:
# for nu-SVR but with 5-fold CV
def intersect_genes(bulk_sample, signature_matrix):
    """Find intersecting subset of genes between sample and signature matrix

    Rows of ``bulk_sample`` whose first column is NaN are discarded before the
    gene sets are intersected.

    Args:
        bulk_sample (pandas.DataFrame): bulk expression indexed by gene
        signature_matrix (pandas.DataFrame): GEP indexed by gene

    Returns:
        tuple of (bulk rows, signature rows), both selected with the same
        index of shared genes

    """
    #TODO add warning if only few genes are common

    # drop rows with nan
    bulk_sample_notna = bulk_sample[pd.notna(bulk_sample.iloc[:, 0:1]).any(axis=1)]

    # Use the explicit set operation: `Index & Index` as an intersection is
    # deprecated and is an element-wise logical AND in pandas >= 2.0.
    idx = bulk_sample_notna.index.intersection(signature_matrix.index)
    return bulk_sample_notna.loc[idx], signature_matrix.loc[idx]


def build_model(bulk_sample_subset, signature_matrix_subset):
    """Create and fit a regression model to bulk RNA (y) and a signature matrix (X)

    y = w * X | w = 'predicted cell type fractions'

    A standardised linear nu-SVR is tuned by grid search with 5-fold
    cross-validation.

    Args:
        bulk_sample_subset (pandas.DataFrame): bulk RNA
        signature_matrix_subset (pandas.DataFrame): GEP

    Returns:
        The fitted GridSearchCV instance

    """

    pipe = Pipeline(steps=[
        ('scale', StandardScaler()),
        ('svr', NuSVR())
    ])

    parameters = {'svr__nu' : [0.25, 0.5, 0.75], 'svr__C' : [1e-2,1e-1,1], 'svr__kernel' : ['linear'], 'svr__verbose' : [True]}
    # cv=5: this variant is the 5-fold one. The previous `cv = cv` read a
    # global that this cell never defines and raised a NameError.
    grid = GridSearchCV(pipe, param_grid = parameters, cv = 5, scoring = 'neg_mean_squared_error', verbose=10, n_jobs=-1)
    grid.fit(signature_matrix_subset, bulk_sample_subset.values.ravel())

    return grid

# following methods are temporary
def rm_neg(coefs):
    """Clip negative coefficients of the weight vector to zero

    Args:
        coefs (numpy.array): regression coefficients, interpreted as cell fractions

    Returns:
        a copy of ``coefs`` in which every value below 0 is replaced by 0

    """
    clipped = coefs.copy()
    is_negative = clipped < 0
    clipped[is_negative] = 0
    return clipped

def rm_small(coefs, cutoff=0.00001):
    """Remove small weight vector coefficients based on a cutoff

    Args:
        coefs (numpy.array): coefficients from the regression model
        cutoff (float): values strictly below this threshold are set to 0
            (this includes all negative values); defaults to the previously
            hard-coded 0.00001

    Returns:
        a copy of ``coefs`` with the small entries zeroed

    """
    # improvement: use standard deviation
    c = coefs.copy()
    c[c < cutoff] = 0
    return c

def rescale(coefs):
    """Rescale the weight vector so that the total sums up to 1

    Args:
        coefs (numpy.array): non-negative coefficients

    Returns:
        a copy of ``coefs`` divided by its sum; if the sum is 0 (e.g. every
        coefficient was clipped away) the unscaled copy is returned instead
        of a vector of NaN values

    """
    c = coefs.copy()
    total = np.sum(c)
    if total == 0:
        # Nothing to normalise - dividing would turn every fraction into NaN.
        return c
    c *= 1.0 / total
    return c

def append_missing_celltype(df, dataset_name, sample, cell_type, prediction=0):
    """Appends missing cell type to the output

    Useful until we are able to integrate signatures with a full set of cell types

    Args:
        df (pandas.DataFrame): result df to be appended to (left unmodified)
        dataset_name (string)
        sample (string)
        cell_type (string)
        prediction (number): predicted proportion for the added cell type, 0 by default

    Returns:
        new pandas.DataFrame with one extra row for the cell type and a fresh
        0..n-1 index

    """
    new_row = pd.DataFrame([{"dataset.name" : dataset_name,
                             "sample.id" : sample,
                             "cell.type" : cell_type,
                             "prediction" : prediction}])
    # DataFrame.append was removed in pandas 2.0; concat builds a new frame
    # as well, so the caller's df is never mutated.
    return pd.concat([df, new_row], ignore_index=True)


In [None]:
# for GridSearchCV
# Issue with Ridge: it penalises model complexity while minimising variance in
# weights assigned to highly correlated predictors. Thus, we have issues
# deconvoluting highly correlated cell types e.g. Naive B Cell vs Memory B Cell.
# NOTE: relies on `signature_dream` and `out_path` left behind by a previously
# executed cell.
result_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
result_frames = []  # per-sample prediction tables
for dataset_name, expression_path in zip(dataset_names, expression_paths):
    samples_df = load_expression_file(expression_path)
    for sample in samples_df:
        bulk_sample_subset, signature_matrix_subset = intersect_genes(samples_df[[sample]], signature_dream)
        grid = build_model(bulk_sample_subset, signature_matrix_subset)
        fractions = grid.best_estimator_.coef_[0]

        fractions = rm_neg(fractions)
        #fractions = rm_small(fractions)
        fractions = rescale(fractions)
        out_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
        out_df['cell.type'] = signature_matrix_subset.columns
        out_df['prediction'] = fractions
        out_df['dataset.name'] = dataset_name
        out_df['sample.id'] = sample

        #temp
        # The table being extended must be passed as first argument; without
        # it the call raised a TypeError (and would have discarded out_df).
        out_df = append_missing_celltype(out_df, dataset_name, sample, cell_type="neutrophils")
        out_df = append_missing_celltype(out_df, dataset_name, sample, cell_type="monocytes")

        # DataFrame.append was removed in pandas 2.0 - rebuild from the parts.
        result_frames.append(out_df)
        result_df = pd.concat(result_frames, ignore_index=True)
        result_df.to_csv(out_path, header=True, index=False)

In [None]:
# for random forest
result_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
for dataset_name, expression_path in list(zip(dataset_names, expression_paths)):
    samples_df = load_expression_file(expression_path)
    for sample in samples_df:
        bulk_sample_subset, signature_matrix_subset = intersect_genes(samples_df[[sample]], signature_dream)
        model = build_model(bulk_sample_subset, signature_matrix_subset)

        fractions = model.feature_importances_
        
        fractions = rm_neg(fractions)
        #fractions = rm_small(fractions)
        #fractions = rescale(fractions)
        out_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
        out_df['cell.type'] = signature_matrix_subset.columns
        out_df['prediction'] = fractions
        out_df['dataset.name'] = dataset_name
        out_df['sample.id'] = sample
        result_df = result_df.append(out_df, ignore_index=True)
        

In [None]:
# for linear regression
result_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
for dataset_name, expression_path in list(zip(dataset_names, expression_paths)):
    samples_df = load_expression_file(expression_path)
    for sample in samples_df:
        bulk_sample_subset, signature_matrix_subset = intersect_genes(samples_df[[sample]], signature_dream)
        model = build_model(bulk_sample_subset, signature_matrix_subset)

        fractions = model.coef_[0]
        #fractions = model.best_estimator_.coef_
        
        fractions = rm_neg(fractions)
        #fractions = rm_small(fractions)
        fractions = rescale(fractions)
        out_df = pd.DataFrame( columns = ['dataset.name', 'sample.id', 'cell.type', 'prediction'])
        out_df['cell.type'] = signature_matrix_subset.columns
        out_df['prediction'] = fractions
        out_df['dataset.name'] = dataset_name
        out_df['sample.id'] = sample
        result_df = result_df.append(out_df, ignore_index=True)
        