In [26]:
from IPython.display import HTML
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

# 1. Load and Format Data

This block aims at loading and formatting our iss and fish variables. Our id corresponde to the MMRF identifier.

In [1]:
import pandas as pd
import qgrid

# grid options
grid_options = {'forceFitColumns': False}

# loading iss and fish variables
iss_fish_vars = pd.read_csv('data/iss_fish_therapy_response.csv', sep='\t')

# all column name to upper case
iss_fish_vars.columns = [col.upper() for col in iss_fish_vars.columns]

# Renaming ID columns
iss_fish_vars = iss_fish_vars.rename(columns={'MMRF': 'ID'})

# Renaming ISS to Stage
iss_fish_vars = iss_fish_vars.rename(columns={'ISS': 'STAGE'})

# transforming mmrf ids to integers
iss_fish_vars['ID'] = iss_fish_vars['ID'].str.replace('MMRF', '').astype(int)

# setting index
iss_fish_vars = iss_fish_vars.set_index('ID')

# stage string to int
iss_fish_vars['STAGE'] = iss_fish_vars['STAGE'].map({'I': 1, 'II': 2, 'III': 3})

# count initial sample
initial_sample = iss_fish_vars.shape[0]

# drop invalid patients
iss_fish_vars = iss_fish_vars.dropna(subset=['BEST-RESPONSE-FIRSTLINE', 'FIRST-LINE-THERAPY'])

# count sample after dropping patients without therapy or response
therapy_and_response_sample = iss_fish_vars.shape

# removing unused variables
for col in ['DAYS-TO-OVERALL-SURVIVAL', 'DAYS-TO-PROGRESSION', 
            'PATIENT-FIRSTRESPONSE', 'FIRST-LINE-THERAPY-CLASS', 'FIRST-LINE-STARTING-TREATMENT-REGIMEN']:
    del iss_fish_vars[col]

qgrid.show_grid(iss_fish_vars, grid_options=grid_options)

QgridWidget(grid_options={'enableColumnReorder': False, 'defaultColumnWidth': 150, 'editable': True, 'rowHeigh…

# 2. Format and Group the Response Variable

We group our response variable, a description of the patient's theray response, based on clinical protocols described by doctors specialized in Oncology.

In [2]:
%matplotlib inline

groups = {1: (['SCR'], ['CR', 'VGPR', 'PR', 'SD', 'PD']),
          2: (['SCR', 'CR'], ['VGPR', 'PR', 'SD', 'PD']),
          3: (['SCR', 'CR', 'VGPR'], ['PR', 'SD', 'PD']),
          4: (['SCR', 'CR', 'VGPR', 'PR'], ['SD', 'PD']),
          5: (['SCR', 'CR', 'VGPR', 'PR', 'SD'], ['PD']),
          6: (['SCR'], ['CR', 'VGPR'], ['PR', 'SD', 'PD']),
          7: (['SCR'], ['CR', 'VGPR', 'PR'], ['SD', 'PD'])}

selected_group = 2

# split response variable and drop it from iss and fish variables
if 'BEST-RESPONSE-FIRSTLINE' in  iss_fish_vars.columns:
    
    response_var = iss_fish_vars['BEST-RESPONSE-FIRSTLINE']
    
    del iss_fish_vars['BEST-RESPONSE-FIRSTLINE'] 

    response_var = response_var.apply(lambda x: 1 if x in groups[selected_group][0] else 0)

# plot count per class
print('Count per class')
    
for i, j in pd.DataFrame(response_var).groupby(by='BEST-RESPONSE-FIRSTLINE').apply(lambda x: len(x)).iteritems():
    print('{}: {}'.format(i, j))


Count per class
0: 566
1: 175


# 3. Load and Join Gene Expressions

In [4]:
# loading gene counts
gene_counts = pd.read_table('data/gene_counts.txt', sep='\t', index_col='GENE_ID')

# removing data not collected at the first trail
for col in gene_counts.columns:
    if '_1_' not in col:
        del gene_counts[col]

# transpose matrix, delete patients and gene with all nan, and replace remainder missing by zero
gene_counts = gene_counts.T.dropna(how='all', axis=0).dropna(how='all', axis=1).fillna(0)

# replace id column name
gene_counts.index.name = 'ID'

# normalize index value transforming mmrf ids to integers
gene_counts.index = [int(col.split('_')[1]) for col in gene_counts.index]

# selected class
gene_details = pd.read_table('data/gene_details.tsv', sep='\t')

gene_selected_class = pd.read_table('data/gene_selected_class.tsv', sep='\t')

gene_selected_class = gene_details.merge(gene_selected_class, on='gene_biotype').set_index('ensembl_gene_id')

gene_selected_class = list(gene_selected_class.index)

gene_counts = gene_counts[gene_selected_class]

# removing genes with zero sum
gene_counts = gene_counts[list(gene_counts.sum(axis=0).index[(gene_counts.sum(axis=0) > 0).tolist()])]

# normalizing counts between 0 and 1
gene_relative = (gene_counts.T / gene_counts.sum(axis=1)).T

gene_relative.shape

(779, 27922)

# 4. Pairwise Linear Correlation


In [6]:
import pickle as pkl

file_path = 'data/output/selected_genes_g2.pkl'.format(col.split('_')[-1].lower())
    
with open(file_path, 'rb') as file:
    selected_genes = pkl.load(file)
    
gene_relative_selected = gene_relative[selected_genes]

# 5. Helpfull Functions

In [7]:
def generate_metric(t, auc, tn, fp, fn, tp, title='THERAPY'):
    
    sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

    precision =  (tp / float(tp + fp)) if tp + fp > 0 else 1

    specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

    ks = abs(sensitivity + specificity - 1.)

    ifp = (float(tp + fp) / tp) if tp > 0 else -np.inf

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    row = pd.DataFrame({title: [t], 'AUC': auc, 'Overall Accuracy': accuracy, 
                        'Precision': precision, 'Sensitivity': sensitivity, 'Specificity': specificity,
                        'KS': ks, 'IFP': ifp})
    
    return row    

In [8]:
from scipy.special import erfinv

class GaussRankScaler():

    def __init__( self ):
        self.epsilon = 0.001
        self.lower = -1 + self.epsilon
        self.upper = 1 - self.epsilon
        self.range = self.upper - self.lower
        self.mean = None

    def fit_transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )
        
        self.mean = np.mean(X, axis=0)

        return transformed - self.mean
    
    def transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )

        return transformed - self.mean

In [9]:
def optimize_threshold(y_true, y_):

    t, max_metric = None, -np.inf

    for i in np.arange(0.00, max(y_), 0.01):

        y_hat = np.copy(y_)

        filter__ = y_hat >= i

        y_hat[filter__], y_hat[~filter__] = 1, 0

        tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()

        sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

        specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

        ks = abs(sensitivity + specificity - 1.)
        
        auc = roc_auc_score(y_true, y_hat)
        
        metric = ks

        if metric > max_metric and metric is not np.inf:

            max_metric = metric

            t = i

    return t

In [10]:
if 'FIRST-LINE-THERAPY' in iss_fish_vars.columns:
    
    therapy = pd.get_dummies(iss_fish_vars['FIRST-LINE-THERAPY'])
    
    del iss_fish_vars['FIRST-LINE-THERAPY']
    
baseline_dataset = pd.DataFrame(response_var).join(therapy, how='inner').join(gene_relative_selected, how='inner')

baseline_dataset.shape

(495, 1828)

In [11]:
for col in therapy:
    print(col)
    

Bor
Bor-Cyc-Dex
Bor-Cyc-Dex+Bor-Dex
Bor-Dex
Bor-Dex+Bor
Bor-Dex+Bor-Cyc-Dex
Bor-Dex+Bor-Len-Dex
Bor-Dex+Bor-Len-Dex+Len
Bor-Len-Dex
Bor-Len-Dex+Bor-Dex
Bor-Len-Dex+Len
Len
Len-Dex
Len-Dex+Bor-Len-Dex


# ISS OR Fish Only

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result = None

for additional_column in iss_fish_vars:

    values = iss_fish_vars[additional_column]
    
    if values.dtype == 'object':
        values = pd.get_dummies(values)
    else:
        values = values.fillna(0)
    
    all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(values, how='inner').fillna(0).as_matrix()
    
    x = all_[:,1:]
    y = all_[:,0]
    
    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        s = GaussRankScaler()

        x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

        x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': -1,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t
        
        row['addc'] = additional_column

        result = row if result is None else pd.concat([result, row])
        
        break

del result['Fold']
        
result.set_index('addc')

Unnamed: 0_level_0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
addc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
STAGE,0.602339,3.25,0.128655,0.626667,0.307692,0.444444,0.684211,0.28
DEL13Q14,0.596491,2.875,0.181287,0.666667,0.347826,0.444444,0.736842,0.29
DEL13Q34,0.620858,3.222222,0.149123,0.613333,0.310345,0.5,0.649123,0.28
DEL17P13,0.540936,3.75,0.02924,0.666667,0.266667,0.222222,0.807018,0.33
AGE,0.538012,2.625,0.216374,0.693333,0.380952,0.444444,0.77193,0.32
RACE,0.564327,3.125,0.146199,0.64,0.32,0.444444,0.701754,0.28
GAIN1Q21,0.602339,3.5,0.070175,0.64,0.285714,0.333333,0.736842,0.3
T11-14_CCND1,0.548733,4.2,0.002924,0.613333,0.238095,0.277778,0.719298,0.28
T12-14_CCND2,0.641326,3.0,0.143275,0.666667,0.333333,0.388889,0.754386,0.29
T14-16_MAF,0.576023,2.6,0.274854,0.68,0.384615,0.555556,0.719298,0.25


# Gene Expressions Only

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

model_id = datetime.now().strftime('%Y%m%d%H%M%S')

result, y_hat, y_true, index = None, [], [], []

all_ = baseline_dataset.as_matrix()

x = all_[:,1:]
y = all_[:,0]


for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    
    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    y_ = [int(y >= t) for y in y_]
    
    tn, fp, fn, tp = confusion_matrix(y_valid, y_).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    y_hat  += list(y_)

    y_true += list(y_valid)

    index  += list(valid_index)
    
    break

del result['Fold']
result.index = ['baseline']
result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
baseline,0.655702,3.0,0.214912,0.62,0.333333,0.583333,0.631579,0.11


# CLINICAL + GENE

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars[['STAGE', 'RACE', 'AGE', 'DPRT']]:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']

result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
all,0.67154,2.625,0.216374,0.693333,0.380952,0.444444,0.77193,0.29


# CLINICAL + GENE - AGE

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars[['STAGE', 'RACE', 'DPRT']]:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']
result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
all,0.712476,2.2,0.345029,0.733333,0.454545,0.555556,0.789474,0.29


# CLINICAL AND FISH

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']
result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
all,0.631579,2.166667,0.210526,0.746667,0.461538,0.333333,0.877193,0.35


# GENE + ISS OR FISH Only

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result = None

for additional_column in iss_fish_vars:

    values = iss_fish_vars[additional_column]
    
    if values.dtype == 'object':
        values = pd.get_dummies(values)
    else:
        values = values.fillna(0)
    
    all_ = baseline_dataset.join(values, how='inner').fillna(0).as_matrix()
    
    x = all_[:,1:]
    y = all_[:,0]
    
    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        s = GaussRankScaler()

        x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

        x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': -1,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t
        
        row['addc'] = additional_column

        result = row if result is None else pd.concat([result, row])
        
        break

del result['Fold']
        
result.set_index('addc')

Unnamed: 0_level_0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
addc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
STAGE,0.664474,2.571429,0.29386,0.68,0.388889,0.583333,0.710526,0.12
DEL13Q14,0.653509,3.0,0.214912,0.62,0.333333,0.583333,0.631579,0.11
DEL13Q34,0.655702,2.714286,0.267544,0.66,0.368421,0.583333,0.684211,0.12
DEL17P13,0.677632,2.625,0.324561,0.66,0.380952,0.666667,0.657895,0.12
AGE,0.688596,2.857143,0.241228,0.64,0.35,0.583333,0.657895,0.11
RACE,0.651316,2.875,0.27193,0.62,0.347826,0.666667,0.605263,0.11
GAIN1Q21,0.655702,2.714286,0.267544,0.66,0.368421,0.583333,0.684211,0.12
T11-14_CCND1,0.657895,3.0,0.214912,0.62,0.333333,0.583333,0.631579,0.11
T12-14_CCND2,0.662281,3.142857,0.188596,0.6,0.318182,0.583333,0.605263,0.11
T14-16_MAF,0.662281,3.142857,0.188596,0.6,0.318182,0.583333,0.605263,0.11


# GENE + CLINICAL + FISH

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']
result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
all,0.765351,2.111111,0.486842,0.74,0.473684,0.75,0.736842,0.13


# GENE + CLINICAL + FISH - AGE

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']

result

Unnamed: 0,AUC,IFP,KS,Overall Accuracy,Precision,Sensitivity,Specificity,Threshold
all,0.723684,2.75,0.298246,0.64,0.363636,0.666667,0.631579,0.13


# 5. Simulation

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

simulation, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).as_matrix()

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    s = GaussRankScaler()

    x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

    x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    row = therapy.iloc[valid_index,:].copy()
    
    row['THERAPY'] = therapy.iloc[valid_index,:].idxmax(axis=1)
    
    for lll in range(0, 14):
        #print(therapy.columns[lll-1])
        
        for kkk in range(0, 14):
            x_valid[:,kkk] = int(kkk == lll)
        y_ = gbm.predict(x_valid)
        
        row.iloc[:,lll] = y_
        
    row['NEW_THERAPY'] = row.iloc[:,0:14].idxmax(axis=1)
    
    row['y'] = y_valid
        
    
    #
    #
    #
    #auc = roc_auc_score(y_valid, y_)

    #t = optimize_threshold(y_train, gbm.predict(x_train))

    #tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    #row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    #row['Threshold'] = t

    simulation = row if simulation is None else pd.concat([simulation, row])

#del result['Fold']
        
#result.index = ['all']

qgrid.show_grid(simulation, grid_options=grid_options)

QgridWidget(grid_options={'enableColumnReorder': False, 'defaultColumnWidth': 150, 'editable': True, 'rowHeigh…

In [20]:
x = round((simulation['THERAPY'] != simulation['NEW_THERAPY']).sum() / simulation.shape[0] * 100, 2)

print('{}%'.format(x))

90.3%


In [21]:
simulation.to_csv('data/output/simulation.csv', sep=',', index=True)