In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('data/input.tsv', sep='\t', index_col='ID')
del dataset['days_to_disease_progression']

dataset = dataset.loc[~dataset['therapy_first_line_class'].isnull()]

therapy_class = dataset['therapy_first_line_class']
therapy = dataset['therapy_first_line']
del dataset['therapy_first_line']

del dataset['best_response_first_line']
del dataset['therapy_first_line_class']
del dataset['days_to_disease_progression_class']


dataset = dataset[~dataset['best_response_first_line_class'].isnull()]

dataset.iloc[:10, :10]

Unnamed: 0_level_0,best_response_first_line_class,lgh,lgl,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MMRF1011,0.0,Not Recorded,Not Recorded,0.0,0.9,0.0,123.76,3,8.28,170.0
MMRF1013,1.0,Unknown,Unknown,0.0,1.3,0.2,186.524,3,4.33,245.0
MMRF1016,0.0,IgG,Lambda,0.0,2.0,0.0,86.632,1,5.8,177.0
MMRF1017,0.0,IgG,Lambda,6.9,2.1,0.0,79.56,1,3.69,191.0
MMRF1018,0.0,IgA,Kappa,0.0,2.1,0.0,133.484,3,5.6,271.0
MMRF1029,0.0,Unknown,Kappa,0.0,8.4,0.0,106.08,1,2.6,219.0
MMRF1030,1.0,IgG,Kappa,15.4,9.6,0.0,55.692,1,2.5,215.0
MMRF1031,0.0,IgA,Unknown,18.3,10.1,0.0,81.328,1,10.29,385.0
MMRF1032,0.0,IgG,Lambda,20.7,11.1,0.0,70.72,2,1.3,166.0
MMRF1033,0.0,IgG,Kappa,18.5,12.0,0.0,79.56,1,3.99,307.0


In [3]:
len(therapy_class)

711

In [4]:
def generate_metric(t, auc, tn, fp, fn, tp, title='THERAPY'):
    
    sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

    precision =  (tp / float(tp + fp)) if tp + fp > 0 else 1

    specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

    ks = abs(sensitivity + specificity - 1.)

    ifp = (float(tp + fp) / tp) if tp > 0 else -np.inf

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    row = pd.DataFrame({title: [t], 'AUC': auc, 'Overall Accuracy': accuracy, 
                        'Precision': precision, 'Sensitivity': sensitivity, 'Specificity': specificity,
                        'KS': ks, 'IFP': ifp})
    
    return row

In [5]:
def optimize_threshold(y_true, y_):

    t, max_metric = None, -np.inf

    for i in np.arange(0.00, max(y_), 0.01):

        y_hat = np.copy(y_)

        filter__ = y_hat >= i

        y_hat[filter__], y_hat[~filter__] = 1, 0

        tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()

        sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

        specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

        ks = abs(sensitivity + specificity - 1.)
        
        auc = roc_auc_score(y_true, y_hat)
        
        metric = ks

        if metric > max_metric and metric is not np.inf:

            max_metric = metric

            t = i

    return t

In [18]:
# loading gene counts
gene_fpkm = pd.read_csv('data/gene_fpkm.txt', sep='\t', index_col='GENE_ID')

# removing data not collected at the first trail
for col in gene_fpkm.columns:
    if '_1_' not in col:
        del gene_fpkm[col]

# transpose matrix, delete patients and gene with all nan, and replace remainder missing by zero
gene_fpkm = gene_fpkm.T.dropna(how='all', axis=0).dropna(how='all', axis=1).fillna(0)

# replace id column name
gene_fpkm.index.name = 'ID'

# normalize index value transforming mmrf ids to integers
gene_fpkm.index = [int(col.split('_')[1]) for col in gene_fpkm.index]

# selected class
gene_details = pd.read_table('data/gene_details.tsv', sep='\t')

gene_selected_class = pd.read_table('data/gene_selected_class.tsv', sep='\t')

gene_selected_class = gene_details.merge(gene_selected_class, on='gene_biotype').set_index('ensembl_gene_id')

gene_selected_class = [gen for gen in gene_selected_class.index if gen in gene_fpkm.columns]

gene_fpkm = gene_fpkm[gene_selected_class]

# removing genes with zero sum
gene_fpkm = gene_fpkm[list(gene_fpkm.sum(axis=0).index[(gene_fpkm.sum(axis=0) > 0).tolist()])]

gene_fpkm.index = ['MMRF' + str(m) for m in gene_fpkm.index]

gene_fpkm.shape

import pickle as pkl

file_path = 'data/output/selected_genes_g2.pkl'.format(col.split('_')[-1].lower())
    
with open(file_path, 'rb') as file:
    selected_genes = pkl.load(file)
    
gene_fpkm_selected = gene_fpkm[[gen for gen in selected_genes if gen in gene_fpkm.columns]]

del gene_fpkm

gene_fpkm_selected = gene_fpkm_selected.loc[:, ~gene_fpkm_selected.columns.duplicated(keep='first')]

gene_fpkm_selected.shape



(779, 1696)

In [7]:
from scipy.special import erfinv

class GaussRankScaler():

    def __init__( self ):
        self.epsilon = 0.001
        self.lower = -1 + self.epsilon
        self.upper = 1 - self.epsilon
        self.range = self.upper - self.lower
        self.mean = None

    def fit_transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )
        
        self.mean = np.mean(X, axis=0)

        return transformed - self.mean
    
    def transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )

        return transformed - self.mean

In [29]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

index = []

result, detailed_result = None, None

for ic, col in enumerate(dataset.columns[1:]):
    
    all_ = None
    
    dat = dataset[[dataset.columns[0], col]].copy().join(therapy.dropna(), how='inner')
    
    for column in dat:
        
        if dat[column].dtype == 'object':

            values = pd.get_dummies(dat[column])

            values.columns = [column + '_' + str(col) for col in values.columns]
        else:
            values = dat[column].fillna(0)

        all_ = values if all_ is None else pd.concat([all_, values], axis=1)
    
    all_ = all_.join(gene_fpkm_selected).fillna(0)
    
    x = all_.values[:,1:]

    y = all_.values[:,0]

    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        s = GaussRankScaler()

        x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

        x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': 4,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t

        result = row if result is None else pd.concat([result, row])
        
        detailed_row = pd.DataFrame({'fold': i + 1, 'variable': col, 'y_opt': [int(y >= t) for y in y_], 
                                     'y_hat': [max(min(1, yy), 0) for yy in y_ * .5 / t], 'y': list(y_valid), 
                                     'class': list(therapy_class[valid_index])}, index=list(all_.index[valid_index]))
        
        detailed_result = detailed_row if detailed_result is None else pd.concat([detailed_row, detailed_result])
        
        index.append(col)
        
result.index = index

result

Unnamed: 0,Fold,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
lgh,1,0.504808,0.750000,0.428571,0.187500,0.923077,0.110577,2.333333,0.36
lgh,2,0.532452,0.794118,0.750000,0.187500,0.980769,0.168269,1.333333,0.37
lgh,3,0.675481,0.705882,0.250000,0.125000,0.884615,0.009615,4.000000,0.36
lgh,4,0.588942,0.661765,0.181818,0.125000,0.826923,0.048077,5.500000,0.35
lgh,5,0.759615,0.808824,0.800000,0.250000,0.980769,0.230769,1.250000,0.45
lgh,6,0.627404,0.764706,0.500000,0.250000,0.923077,0.173077,2.000000,0.38
lgh,7,0.760817,0.779412,0.533333,0.500000,0.865385,0.365385,1.875000,0.30
lgh,8,0.806410,0.776119,0.500000,0.200000,0.942308,0.142308,2.000000,0.44
lgh,9,0.627451,0.681818,0.333333,0.400000,0.764706,0.164706,3.000000,0.28
lgh,10,0.679739,0.712121,0.333333,0.266667,0.843137,0.109804,3.000000,0.29


In [30]:
result.index = [str(i) + " + genexp" for i in result.index]

In [31]:
result_mean = result.groupby(level=0).mean()

del result_mean['Fold']

result_mean.to_csv('output/gen_overall.tsv', sep='\t', index=True)

result_mean

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
11p15 + genexp,0.655196,0.739154,0.443414,0.256667,0.885747,0.151067,2.6,0.357
13q14 + genexp,0.654929,0.749693,0.450895,0.2625,0.897549,0.168703,2.505,0.356
13q34 + genexp,0.652905,0.760053,0.478651,0.244583,0.91678,0.163286,2.376667,0.367
15q15 + genexp,0.653728,0.749559,0.44873,0.250417,0.901244,0.151661,2.508333,0.358
17p13 + genexp,0.651816,0.737772,0.417389,0.262917,0.881976,0.157393,2.727381,0.344
19q13 + genexp,0.65003,0.751141,0.461896,0.269583,0.897474,0.171865,2.482857,0.359
1q21 + genexp,0.649024,0.743655,0.446005,0.25625,0.891629,0.156533,2.618333,0.356
20q13 + genexp,0.654009,0.749582,0.449544,0.256667,0.899321,0.155988,2.492262,0.367
21q22 + genexp,0.650344,0.749514,0.450259,0.275833,0.893477,0.16931,2.454167,0.355
3q21 + genexp,0.651572,0.749493,0.436172,0.25625,0.899284,0.157457,2.625,0.357


In [32]:
def compute_all(x):
    
    a = None
    
    try:
        a = roc_auc_score(x['y_opt'], x['y'])
    except:
        a = np.inf
    
    try:
        tn, fp, fn, tp = confusion_matrix(x['y'], x['y_opt']).ravel()
    except:
        tn, fp, fn, tp = np.inf,np.inf,np.inf,np.inf

    result = {'auc': a, 'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}
    
    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')
    
    for c in row:
        result[c] = row[c][0]
    
    return pd.Series(result)    
    
detailed_result.groupby(['class', 'variable']).apply(compute_all).unstack(level='class')

Unnamed: 0_level_0,auc,auc,auc,auc,tn,tn,tn,tn,fp,fp,...,Specificity,Specificity,KS,KS,KS,KS,IFP,IFP,IFP,IFP
class,Bortezomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,IMIDs-based,Bortezomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,IMIDs-based,Bortezomib-based,Combined IMIDs/carfilzomib-based,...,Combined bortezomib/IMIDs-based,IMIDs-based,Bortezomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,IMIDs-based,Bortezomib-based,Combined IMIDs/carfilzomib-based,Combined bortezomib/IMIDs-based,IMIDs-based
variable,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11p15,0.601709,inf,0.58701,0.743333,94.0,inf,323.0,41.0,15.0,inf,...,0.884932,0.953488,0.165416,,0.119166,0.261181,2.5,,2.615385,1.5
13q14,0.620753,inf,0.597748,0.758865,96.0,inf,328.0,40.0,13.0,inf,...,0.89863,0.930233,0.183764,,0.123855,0.391771,2.3,,2.48,1.5
13q34,0.643054,inf,0.612962,0.77551,98.0,inf,335.0,41.0,11.0,inf,...,0.917808,0.953488,0.202113,,0.125015,0.338104,2.1,,2.304348,1.4
15q15,0.646693,inf,0.5894,0.729167,97.0,inf,329.0,40.0,12.0,inf,...,0.90137,0.930233,0.223242,,0.108577,0.314848,2.090909,,2.565217,1.6
17p13,0.616711,inf,0.581433,0.692671,94.0,inf,323.0,39.0,15.0,inf,...,0.884932,0.906977,0.195719,,0.110157,0.291592,2.363636,,2.68,1.8
19q13,0.631439,inf,0.594296,0.784783,97.0,inf,327.0,40.0,12.0,inf,...,0.89589,0.930233,0.192939,,0.121116,0.468694,2.2,,2.52,1.428571
1q21,0.635946,inf,0.584516,0.743333,96.0,inf,324.0,41.0,13.0,inf,...,0.887671,0.953488,0.214067,,0.112896,0.261181,2.181818,,2.64,1.5
20q13,0.620753,inf,0.597748,0.77551,96.0,inf,328.0,41.0,13.0,inf,...,0.89863,0.953488,0.183764,,0.123855,0.338104,2.3,,2.48,1.4
21q22,0.646693,inf,0.596594,0.723913,97.0,inf,326.0,39.0,12.0,inf,...,0.893151,0.906977,0.223242,,0.127385,0.368515,2.090909,,2.5,1.666667
3q21,0.635946,inf,0.591902,0.77551,96.0,inf,328.0,41.0,13.0,inf,...,0.89863,0.953488,0.214067,,0.114846,0.338104,2.181818,,2.541667,1.4


In [36]:
detailed = detailed_result.groupby(['class', 'variable']).apply(compute_all)

detailed = detailed.reset_index().set_index('variable')

for class_ in detailed['class'].unique():
    
    current_class = detailed.loc[detailed['class'] == class_].iloc[:,1:]
    
    current_class.to_csv('output/gen_{}.tsv'.format(class_.lower().replace(' ', '_').replace('/', '_')))