# 1. Load and Format Data

This block loads and formats our ISS and FISH variables. Our ID corresponds to the MMRF identifier.

In [1]:
import pandas as pd
import qgrid

# qgrid display options
grid_options = {'forceFitColumns': False}

# Load the ISS / FISH / therapy / response table (tab-separated despite the .csv extension).
iss_fish_vars = pd.read_csv('data/iss_fish_therapy_response.csv', sep='\t')

# Upper-case every column name, then rename the identifier and ISS columns.
iss_fish_vars.columns = [name.upper() for name in iss_fish_vars.columns]
iss_fish_vars = iss_fish_vars.rename(columns={'MMRF': 'ID', 'ISS': 'STAGE'})

# Strip the 'MMRF' prefix so the identifier becomes an integer, and use it as the index.
iss_fish_vars['ID'] = iss_fish_vars['ID'].str.replace('MMRF', '').astype(int)
iss_fish_vars = iss_fish_vars.set_index('ID')

# Roman-numeral stage -> integer; any value outside the mapping becomes NaN.
iss_fish_vars['STAGE'] = iss_fish_vars['STAGE'].map({'I': 1, 'II': 2, 'III': 3})

# Number of patients before any filtering.
initial_sample = iss_fish_vars.shape[0]

# Drop patients that lack either the first-line therapy or its best response.
iss_fish_vars = iss_fish_vars.dropna(subset=['BEST-RESPONSE-FIRSTLINE', 'FIRST-LINE-THERAPY'])

# Number of patients left after the filter. This is a row count like `initial_sample`;
# the previous code stored the whole (rows, columns) shape tuple here.
therapy_and_response_sample = iss_fish_vars.shape[0]

# Remove the variables that are not used in the rest of the notebook.
unused_columns = ['DAYS-TO-OVERALL-SURVIVAL', 'DAYS-TO-PROGRESSION',
                  'PATIENT-FIRSTRESPONSE', 'FIRST-LINE-THERAPY-CLASS',
                  'FIRST-LINE-STARTING-TREATMENT-REGIMEN']
iss_fish_vars = iss_fish_vars.drop(columns=unused_columns)

qgrid.show_grid(iss_fish_vars, grid_options=grid_options)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

# 2. Format and Group the Response Variable

We group our response variable, a description of the patient's therapy response, based on clinical protocols described by doctors specialized in oncology.

In [2]:
%matplotlib inline

# Candidate groupings of the response categories. Each entry is a tuple of lists;
# only the FIRST list is used below (it defines the positive class), so the
# three-way groupings 6 and 7 are currently reduced to "first list vs. the rest".
groups = {1: (['SCR'], ['CR', 'VGPR', 'PR', 'SD', 'PD']),
          2: (['SCR', 'CR'], ['VGPR', 'PR', 'SD', 'PD']),
          3: (['SCR', 'CR', 'VGPR'], ['PR', 'SD', 'PD']),
          4: (['SCR', 'CR', 'VGPR', 'PR'], ['SD', 'PD']),
          5: (['SCR', 'CR', 'VGPR', 'PR', 'SD'], ['PD']),
          6: (['SCR'], ['CR', 'VGPR'], ['PR', 'SD', 'PD']),
          7: (['SCR'], ['CR', 'VGPR', 'PR'], ['SD', 'PD'])}

selected_group = 2

# Split the response variable off the ISS/FISH table and binarise it:
# 1 when the response is in the first list of the selected grouping, 0 otherwise.
# The guard keeps the cell re-runnable once the column has already been removed.
if 'BEST-RESPONSE-FIRSTLINE' in iss_fish_vars.columns:

    positive_responses = groups[selected_group][0]

    response_var = iss_fish_vars.pop('BEST-RESPONSE-FIRSTLINE').isin(positive_responses).astype(int)

# Print the number of patients per class, ordered by class label.
# `Series.iteritems()` no longer exists in pandas 2.x, so `items()` is used instead.
print('Count per class')

for label, count in response_var.value_counts().sort_index().items():
    print('{}: {}'.format(label, count))


Count per class
0: 566
1: 175


# 3. Load and Join Gene Expressions

In [3]:
# Load the gene expression table (FPKM): one row per gene, one column per sample.
gene_fpkm = pd.read_csv('data/gene_fpkm.txt', sep='\t', index_col='GENE_ID')

# Keep only the sample columns whose label contains '_1_'.
# NOTE: the loop variable `col` stays defined at notebook level after this cell,
# and a later cell references it, so the explicit loop is kept on purpose.
for col in gene_fpkm.columns:
    if '_1_' not in col:
        del gene_fpkm[col]

# Transpose to samples x genes, drop samples and genes that are entirely missing,
# and treat the remaining missing values as zero expression.
gene_fpkm = gene_fpkm.T.dropna(how='all', axis=0).dropna(how='all', axis=1).fillna(0)

# Turn the sample labels into integer patient ids (second '_'-separated token).
# The index name is set together with the new index; previously it was assigned
# first and then discarded when the index was replaced.
gene_fpkm.index = pd.Index([int(sample.split('_')[1]) for sample in gene_fpkm.index], name='ID')

# Restrict to the genes whose biotype is listed in the selected-class file.
gene_details = pd.read_csv('data/gene_details.tsv', sep='\t')

gene_selected_class = pd.read_csv('data/gene_selected_class.tsv', sep='\t')

gene_selected_class = gene_details.merge(gene_selected_class, on='gene_biotype').set_index('ensembl_gene_id')

gene_selected_class = [gen for gen in gene_selected_class.index if gen in gene_fpkm.columns]

gene_fpkm = gene_fpkm[gene_selected_class]

# Remove genes whose expression sums to zero across all samples (sum computed once).
gene_totals = gene_fpkm.sum(axis=0)

gene_fpkm = gene_fpkm[list(gene_totals.index[(gene_totals > 0).tolist()])]

gene_fpkm.shape

(779, 26069)

In [4]:
# Preview the first 10 rows and first 10 columns of the expression matrix.
gene_fpkm.iloc[:10,:10]

GENE_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167
2438,0.645325,0.0,32.1035,7.6691,7.0602,0.719675,15.3955,50.1926,11.3271,18.6944
1786,3.51915,0.0,29.3108,4.96608,1.50599,0.705555,1.28339,22.9092,7.74184,12.5095
1332,6.87306,0.0,41.2908,8.25159,4.64106,0.927851,0.072551,46.1312,13.6712,18.6234
2562,5.4415,0.0,27.8386,8.23226,2.71763,0.415857,0.064742,14.3873,15.6407,17.7199
1797,0.0,0.0,36.8206,7.72755,3.28094,0.220043,0.0,17.0595,17.2859,13.6723
1861,0.404031,0.0,41.0652,5.0433,1.54744,49.9608,1.26737,9.27582,8.13927,15.7142
1823,3.68256,0.0,60.5635,2.57942,1.71766,0.5457,0.165141,38.004,9.23173,15.483
2018,0.795386,0.0,29.0764,4.87317,2.25814,0.311901,0.685952,14.9834,11.3156,13.9395
2268,2.71476,0.491686,31.3695,2.17873,2.26555,172.304,1.23557,21.4735,11.2765,10.5836
2570,0.124405,0.0,39.5649,1.60298,1.19724,0.925557,0.222275,17.3529,5.64112,6.85792


# 4. Pairwise Linear Correlation


In [5]:
import pickle as pkl

# Path of the pickled list of pre-selected genes. The previous `.format(...)`
# call had no placeholder to fill and only worked because a loop variable from
# an earlier cell happened to be defined, so it is dropped.
file_path = 'data/output/selected_genes_g2.pkl'

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open(file_path, 'rb') as file:
    selected_genes = pkl.load(file)

# Keep the selected genes that are actually present in the expression matrix.
gene_fpkm_selected = gene_fpkm[[gen for gen in selected_genes if gen in gene_fpkm.columns]]

gene_fpkm_selected.shape

(779, 1711)

In [43]:
# Preview the first rows of the expression matrix restricted to the selected genes.
gene_fpkm_selected.head()

GENE_ID,ENSG00000000005,ENSG00000000419,ENSG00000001561,ENSG00000001617,ENSG00000002586,ENSG00000002745,ENSG00000003393,ENSG00000005007,ENSG00000005379,ENSG00000005448,...,ENSG00000264329,ENSG00000264771,ENSG00000264810,ENSG00000265225,ENSG00000265345,ENSG00000265390,ENSG00000265530,ENSG00000266143,ENSG00000266440,ENSG00000266547
2438,0.0,32.1035,14.9621,0.135902,1.22782,0.979523,5.30996,25.6926,1.46663,14.4932,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1786,0.0,29.3108,11.2517,0.01547,1.52024,0.240436,6.30008,23.1781,3.10792,10.7491,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332,0.0,41.2908,20.2348,0.004601,8.3797,0.316897,7.70273,17.8,12.9284,14.9347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2562,0.0,27.8386,20.7619,0.036075,0.615088,1.03483,3.8166,28.8874,1.8297,13.0608,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1797,0.0,36.8206,5.62598,0.014171,76.2685,0.533943,14.2224,27.657,3.7765,21.59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 5. Helpful Functions

In [6]:
def generate_metric(t, auc, tn, fp, fn, tp, title='THERAPY'):
    """Build a one-row DataFrame of classification metrics from confusion-matrix counts.

    Parameters
    ----------
    t : value stored in the first column (a fold number or a therapy label).
    auc : AUC value, stored unchanged.
    tn, fp, fn, tp : confusion-matrix counts.
    title : name of the first column.

    Returns
    -------
    pandas.DataFrame with one row and the columns
    [title, 'AUC', 'Overall Accuracy', 'Precision', 'Sensitivity',
     'Specificity', 'KS', 'IFP'].

    Notes
    -----
    * Sensitivity, precision and specificity default to 1 when their
      denominator is zero.
    * KS is |sensitivity + specificity - 1|.
    * IFP is (tp + fp) / tp, or -inf when there are no true positives.
    * Accuracy is NaN when all four counts are zero.
    """
    sensitivity = tp / (tp + fn) if tp + fn > 0 else 1

    precision = tp / (tp + fp) if tp + fp > 0 else 1

    specificity = tn / (tn + fp) if tn + fp > 0 else 1

    ks = abs(sensitivity + specificity - 1.)

    # float('-inf') equals -np.inf, but does not need numpy to be imported first.
    ifp = (tp + fp) / tp if tp > 0 else float('-inf')

    total = tp + tn + fp + fn

    accuracy = (tp + tn) / total if total > 0 else float('nan')

    row = pd.DataFrame({title: [t], 'AUC': auc, 'Overall Accuracy': accuracy,
                        'Precision': precision, 'Sensitivity': sensitivity, 'Specificity': specificity,
                        'KS': ks, 'IFP': ifp})

    return row

In [7]:
def optimize_threshold(y_true, y_):
    """Pick the decision threshold that maximises |sensitivity + specificity - 1|.

    Candidate thresholds are 0.00, 0.01, 0.02, ... strictly below ``max(y_)``;
    a sample is predicted positive when its score is >= the threshold. When
    several thresholds tie, the smallest one is returned.

    Parameters
    ----------
    y_true : array-like of binary labels; entries equal to 1 are positives,
        everything else is treated as negative.
    y_ : array-like of predicted scores.

    Returns
    -------
    The selected threshold, or None when ``max(y_) <= 0`` (no candidates).

    Notes
    -----
    The counts are computed directly with numpy. The earlier version unpacked
    ``confusion_matrix(...).ravel()``, which fails when only one class is
    present, and also computed an AUC per threshold that was never used and
    raises on single-class labels. As in `generate_metric`, sensitivity or
    specificity defaults to 1 when its class is absent.
    """
    scores = np.asarray(y_, dtype=float)

    positives = np.asarray(y_true) == 1

    n_pos = int(np.count_nonzero(positives))

    n_neg = int(positives.size - n_pos)

    t, max_metric = None, -np.inf

    for i in np.arange(0.00, scores.max(), 0.01):

        predicted = scores >= i

        tp = int(np.count_nonzero(predicted & positives))

        tn = int(np.count_nonzero(~predicted & ~positives))

        sensitivity = tp / n_pos if n_pos > 0 else 1

        specificity = tn / n_neg if n_neg > 0 else 1

        metric = abs(sensitivity + specificity - 1.)

        # Strict '>' keeps the first (smallest) threshold on ties.
        if metric > max_metric and np.isfinite(metric):

            max_metric = metric

            t = i

    return t

In [8]:
# One-hot encode the first-line therapy and take it out of the ISS/FISH table.
# The membership test lets the cell be re-run after the column is already gone.
if 'FIRST-LINE-THERAPY' in iss_fish_vars.columns:

    therapy = pd.get_dummies(iss_fish_vars.pop('FIRST-LINE-THERAPY'))

# Baseline dataset: response, then therapy dummies, then the selected genes,
# restricted to the patients present in all three (inner joins on the index).
baseline_dataset = (
    response_var.to_frame()
    .join(therapy, how='inner')
    .join(gene_fpkm_selected, how='inner')
)

baseline_dataset.shape

(495, 1726)

In [9]:
# List the one-hot therapy columns, one name per line.
for therapy_name in therapy.columns:
    print(therapy_name)
    

Bor
Bor-Cyc-Dex
Bor-Cyc-Dex+Bor-Dex
Bor-Dex
Bor-Dex+Bor
Bor-Dex+Bor-Cyc-Dex
Bor-Dex+Bor-Len-Dex
Bor-Dex+Bor-Len-Dex+Len
Bor-Len-Dex
Bor-Len-Dex+Bor-Dex
Bor-Len-Dex+Len
Len
Len-Dex
Len-Dex+Bor-Len-Dex


In [21]:
# FISH columns (deletions, gain and translocations) used as prediction targets.
fish_columns = ['DEL13Q14', 'DEL13Q34', 'DEL17P13', 'GAIN1Q21',
                'T11-14_CCND1', 'T12-14_CCND2', 'T14-16_MAF', 'T14-20_MAFB',
                'T4-14_WHSC1', 'T6-14_CCND3', 'T8-14_MAFA', 'T8-14_MYC']

# Take an explicit copy: these columns are reassigned in a later cell, and
# assigning into a slice of `iss_fish_vars` triggers SettingWithCopyWarning.
translocation = iss_fish_vars.loc[:, fish_columns].copy()

translocation.head()

Unnamed: 0_level_0,DEL13Q14,DEL13Q34,DEL17P13,GAIN1Q21,T11-14_CCND1,T12-14_CCND2,T14-16_MAF,T14-20_MAFB,T4-14_WHSC1,T6-14_CCND3,T8-14_MAFA,T8-14_MYC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1011,,,,,,,,,,,,
1013,,,,,,,,,,,,
1014,,,,,,,,,,,,
1016,B,B,ND,R,ND,ND,ND,ND,R,ND,ND,ND
1017,,,,,,,,,,,,


# GENE + CLINICAL + FISH - AGE

In [25]:
import numpy as np

# Recode the FISH calls to numbers: 'B' and 'R' become 1, 'ND' becomes 0.
# Missing values are left untouched.
fish_codes = {'B': 1, 'ND': 0, 'R': 1}

for fish_column in translocation.columns:
    recoded = translocation[fish_column].replace(fish_codes)
    translocation[fish_column] = recoded

translocation.iloc[:20, :]

Unnamed: 0_level_0,DEL13Q14,DEL13Q34,DEL17P13,GAIN1Q21,T11-14_CCND1,T12-14_CCND2,T14-16_MAF,T14-20_MAFB,T4-14_WHSC1,T6-14_CCND3,T8-14_MAFA,T8-14_MYC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1011,,,,,,,,,,,,
1013,,,,,,,,,,,,
1014,,,,,,,,,,,,
1016,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1017,,,,,,,,,,,,
1018,,,,,,,,,,,,
1020,,,,,,,,,,,,
1021,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1029,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
# Preview the baseline dataset (response, therapy dummies, selected genes).
baseline_dataset.head()

Unnamed: 0,BEST-RESPONSE-FIRSTLINE,Bor,Bor-Cyc-Dex,Bor-Cyc-Dex+Bor-Dex,Bor-Dex,Bor-Dex+Bor,Bor-Dex+Bor-Cyc-Dex,Bor-Dex+Bor-Len-Dex,Bor-Dex+Bor-Len-Dex+Len,Bor-Len-Dex,...,ENSG00000264329,ENSG00000264771,ENSG00000264810,ENSG00000265225,ENSG00000265345,ENSG00000265390,ENSG00000265530,ENSG00000266143,ENSG00000266440,ENSG00000266547
1021,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1030,1,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1031,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1032,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Distinct values of the current FISH target.
# NOTE(review): `trans` is only assigned inside the cross-validation loop of a
# later cell, so this cell fails on a fresh kernel when the notebook is run top to bottom.
trans.unique()

array([1, 0], dtype=object)

In [44]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

# 5-fold stratified CV. `random_state` is dropped: without shuffle=True it has no
# effect on the splits, and recent scikit-learn versions reject the combination.
kfold = StratifiedKFold(n_splits=5)

# LightGBM hyper-parameters. They do not change between folds, so they are
# defined once; a later cell also reads this notebook-level `params`.
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'num_class': 1,
          'metric': 'logloss',
          'learning_rate': 0.01,
          'num_leaves': 31,
          'max_depth': -1,
          'min_child_samples': 20,
          'max_bin': 255,
          'subsample': 0.8,
          'subsample_freq': 0,
          'colsample_bytree': 0.3,
          'min_child_weight': 5,
          'subsample_for_bin': 200000,
          'min_split_gain': 0,
          'reg_alpha': 0,
          'reg_lambda': 0,
          'nthread': 6,
          'verbose': 0}

# Per-fold metric rows and per-sample scores, concatenated once after the loop.
metric_rows, score_frames = [], []

for fish in translocation.columns:

    try:

        # Target: the FISH call of the patients that have one.
        trans = translocation[fish].dropna().astype(int)
        print(trans.shape)

        # First column is the target, the remaining columns are gene expressions.
        all_ = pd.DataFrame(trans).join(gene_fpkm_selected, how='inner').fillna(0).values

        x, y = all_[:, 1:], all_[:, 0]

        for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

            x_train, y_train = x[train_index, :], y[train_index].ravel()

            x_valid, y_valid = x[valid_index, :], y[valid_index].ravel()

            lgb_train = lgb.Dataset(x_train, y_train)

            gbm = lgb.train(params, lgb_train, num_boost_round=1000)

            y_ = gbm.predict(x_valid)

            # Fold metrics: AUC on the raw scores; the remaining metrics use the
            # threshold tuned on the training part of the fold.
            auc = roc_auc_score(y_valid, y_)

            t = optimize_threshold(y_train, gbm.predict(x_train))

            y_opt = [int(score >= t) for score in y_]

            tn, fp, fn, tp = confusion_matrix(y_valid, y_opt).ravel()

            row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

            row['Threshold'] = t

            row['RESP'] = fish

            metric_rows.append(row)

            # Per-sample predictions of this fold.
            scores = pd.DataFrame({'fold': i + 1, 'y': y_valid, 'y_hat': y_, 'y_opt': y_opt})

            score_frames.append(scores)

    except Exception as exc:
        # A failing target is still skipped, but no longer silently: report which
        # one failed and why, so missing rows in the summary can be explained.
        print('Skipping {}: {}: {}'.format(fish, type(exc).__name__, exc))

detailed_result = pd.concat(score_frames)

detailed_result.to_csv('data/output/best_model_estimations.csv', sep=',', index=False)

result = pd.concat(metric_rows).set_index('Fold')

result

(547,)
(547,)
(547,)
(547,)
(538,)
(538,)




(538,)
(538,)
(538,)
(538,)
(538,)
(538,)


Unnamed: 0_level_0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold,RESP
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.982305,0.921348,0.933333,0.913043,0.930233,0.843276,1.071429,0.31,DEL13Q14
2,0.973016,0.91954,0.913043,0.933333,0.904762,0.838095,1.095238,0.32,DEL13Q14
3,0.967725,0.885057,0.87234,0.911111,0.857143,0.768254,1.146341,0.29,DEL13Q14
4,0.949206,0.91954,0.931818,0.911111,0.928571,0.839683,1.073171,0.31,DEL13Q14
5,0.980423,0.942529,0.916667,0.977778,0.904762,0.88254,1.090909,0.19,DEL13Q14
1,0.97561,0.920455,0.904762,0.926829,0.914894,0.841723,1.105263,0.26,DEL13Q34
2,0.939803,0.875,0.840909,0.902439,0.851064,0.753503,1.189189,0.33,DEL13Q34
3,0.932662,0.862069,0.853659,0.853659,0.869565,0.723224,1.171429,0.32,DEL13Q34
4,0.983033,0.931034,0.888889,0.97561,0.891304,0.866914,1.125,0.27,DEL13Q34
5,0.946448,0.908046,0.851064,0.97561,0.847826,0.823436,1.175,0.34,DEL13Q34


In [47]:
# Join the last FISH target processed above with the selected genes and list
# the resulting column names.
all_ = pd.DataFrame(trans).join(gene_fpkm_selected, how='inner').fillna(0)

# The loop variable is no longer called `x`, which would overwrite the feature
# matrix `x` built by the cross-validation cell.
for column_name in all_.columns:
    print(column_name)

T8-14_MYC
ENSG00000000005
ENSG00000000419
ENSG00000001561
ENSG00000001617
ENSG00000002586
ENSG00000002745
ENSG00000003393
ENSG00000005007
ENSG00000005379
ENSG00000005448
ENSG00000005700
ENSG00000006128
ENSG00000006611
ENSG00000007255
ENSG00000007541
ENSG00000008086
ENSG00000008226
ENSG00000008283
ENSG00000010030
ENSG00000010244
ENSG00000010361
ENSG00000011677
ENSG00000013375
ENSG00000013392
ENSG00000014216
ENSG00000015133
ENSG00000019186
ENSG00000019582
ENSG00000019991
ENSG00000021762
ENSG00000023445
ENSG00000025156
ENSG00000025796
ENSG00000026025
ENSG00000027847
ENSG00000028277
ENSG00000029363
ENSG00000033011
ENSG00000033327
ENSG00000039537
ENSG00000040608
ENSG00000042493
ENSG00000043143
ENSG00000044574
ENSG00000046647
ENSG00000046653
ENSG00000048649
ENSG00000050327
ENSG00000051108
ENSG00000052749
ENSG00000054967
ENSG00000058453
ENSG00000058799
ENSG00000060491
ENSG00000060762
ENSG00000063127
ENSG00000064199
ENSG00000064201
ENSG00000064300
ENSG00000065320
ENSG00000065457
ENSG0000006561

ENSG00000203327
ENSG00000203446
ENSG00000204623
ENSG00000206417
ENSG00000223343
ENSG00000223356
ENSG00000223374
ENSG00000223635
ENSG00000223669
ENSG00000223821
ENSG00000223923
ENSG00000223960
ENSG00000223991
ENSG00000224066
ENSG00000224086
ENSG00000224220
ENSG00000224298
ENSG00000224301
ENSG00000224384
ENSG00000224505
ENSG00000224666
ENSG00000224731
ENSG00000225203
ENSG00000225302
ENSG00000225437
ENSG00000225439
ENSG00000225602
ENSG00000225721
ENSG00000225762
ENSG00000225792
ENSG00000225979
ENSG00000226009
ENSG00000226252
ENSG00000226310
ENSG00000226352
ENSG00000226416
ENSG00000226455
ENSG00000226496
ENSG00000226746
ENSG00000226819
ENSG00000226919
ENSG00000227014
ENSG00000227053
ENSG00000227145
ENSG00000227215
ENSG00000227415
ENSG00000228010
ENSG00000228084
ENSG00000228288
ENSG00000228417
ENSG00000228439
ENSG00000228486
ENSG00000228775
ENSG00000228802
ENSG00000228988
ENSG00000229036
ENSG00000229043
ENSG00000229154
ENSG00000229258
ENSG00000229278
ENSG00000229418
ENSG00000229525
ENSG0000

In [45]:
# Average every fold metric per FISH target (the 'RESP' column).
result.groupby('RESP').mean()

Unnamed: 0_level_0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
RESP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DEL13Q14,0.970535,0.917603,0.91344,0.929275,0.905094,0.834369,1.095418,0.284
DEL13Q34,0.955511,0.899321,0.867856,0.926829,0.874931,0.80176,1.153176,0.304
DEL17P13,0.852509,0.876484,0.4675,0.608889,0.909757,0.518646,2.205556,0.13
GAIN1Q21,0.976563,0.928914,0.878306,0.928602,0.929135,0.857737,1.14092,0.286
T11-14_CCND1,0.976558,0.958003,0.908039,0.853333,0.980241,0.833575,1.106094,0.224
T12-14_CCND2,0.5,0.011494,0.011494,1.0,0.0,0.0,87.0,0.0
T14-16_MAF,0.96687,0.962709,0.586667,0.716667,0.973171,0.689837,1.933333,0.184
T14-20_MAFB,0.5,0.018606,0.018606,1.0,0.0,0.0,59.9,0.0
T4-14_WHSC1,0.998034,0.981339,0.923333,0.945455,0.986595,0.932049,1.090404,0.3
T6-14_CCND3,0.5,0.013955,0.013955,1.0,0.0,0.0,77.1,0.0


In [48]:
# Number of selected gene columns.
gene_fpkm_selected.shape[1]

1711

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Baseline classifiers, evaluated with the same features as the LightGBM model.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]


def _positive_class_scores(clf, features):
    """Return a continuous score for the positive class of a fitted binary classifier."""
    if hasattr(clf, 'predict_proba'):
        return clf.predict_proba(features)[:, 1]
    return clf.decision_function(features)


# 10-fold stratified CV. `random_state` is dropped: without shuffle=True it has no
# effect on the splits, and recent scikit-learn versions reject the combination.
kfold = StratifiedKFold(n_splits=10)

# Additional clinical / FISH covariates (everything but AGE); object columns are
# one-hot encoded with the original column name as a prefix.
encoded_columns = []

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':

        values = pd.get_dummies(iss_fish_vars[column])

        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    encoded_columns.append(values)

addc = pd.concat(encoded_columns, axis=1)

addc.index = iss_fish_vars.index

# First column is the response, the remaining columns are the features.
all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x, y = all_[:, 1:], all_[:, 0]

result = {name: [] for name in names}

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index, :], y[train_index].ravel()

    x_valid, y_valid = x[valid_index, :], y[valid_index].ravel()

    for name, clf in zip(names, classifiers):

        clf.fit(x_train, y_train)

        # AUC needs a ranking score. Hard `predict` labels collapse the ROC curve
        # to a single point and are not comparable with the LightGBM AUC, which is
        # computed on predicted probabilities.
        y_ = _positive_class_scores(clf, x_valid)

        auc = roc_auc_score(y_valid, y_)

        result[name].append(auc)

# Long format: one row per (algorithm, fold).
result = pd.DataFrame(result)

result = result.unstack().reset_index()

result.columns = ['algorithm', 'fold', 'auc']

result.to_csv('data/output/baseline.csv', sep=',', index=False)

result

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

# 10-fold stratified CV. `random_state` is dropped: without shuffle=True it has no
# effect on the splits, and recent scikit-learn versions reject the combination.
kfold = StratifiedKFold(n_splits=10)

# LightGBM hyper-parameters, defined here so that the cell does not depend on a
# `params` variable left behind by the loop of another cell.
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'num_class': 1,
          'metric': 'logloss',
          'learning_rate': 0.01,
          'num_leaves': 31,
          'max_depth': -1,
          'min_child_samples': 20,
          'max_bin': 255,
          'subsample': 0.8,
          'subsample_freq': 0,
          'colsample_bytree': 0.3,
          'min_child_weight': 5,
          'subsample_for_bin': 200000,
          'min_split_gain': 0,
          'reg_alpha': 0,
          'reg_lambda': 0,
          'nthread': 6,
          'verbose': 0}

# Additional clinical / FISH covariates (everything but AGE); object columns are
# one-hot encoded with the original column name as a prefix.
encoded_columns = []

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':

        values = pd.get_dummies(iss_fish_vars[column])

        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    encoded_columns.append(values)

addc = pd.concat(encoded_columns, axis=1)

addc.index = iss_fish_vars.index

# First column is the response, the remaining columns are the features.
all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x = all_[:, 1:]
y = all_[:, 0]

# Per-fold metric rows and per-sample scores, concatenated once after the loop.
metric_rows, score_frames = [], []

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index, :], y[train_index].ravel()

    x_valid, y_valid = x[valid_index, :], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)

    y_ = gbm.predict(x_valid)

    # Fold metrics: AUC on the raw scores; the remaining metrics use the
    # threshold tuned on the training part of the fold.
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    y_opt = [int(score >= t) for score in y_]

    tn, fp, fn, tp = confusion_matrix(y_valid, y_opt).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    metric_rows.append(row)

    # Per-sample predictions of this fold.
    scores = pd.DataFrame({'fold': i + 1, 'y': y_valid, 'y_hat': y_, 'y_opt': y_opt})

    score_frames.append(scores)

detailed_result = pd.concat(score_frames)

# NOTE: the per-FISH cross-validation cell writes to this same path.
detailed_result.to_csv('data/output/best_model_estimations.csv', sep=',', index=False)

result = pd.concat(metric_rows).set_index('Fold')

result

In [None]:
# Mean and standard deviation of every metric across folds, shown as two rows.
fold_mean = result.mean(axis=0)
fold_std = result.std(axis=0)

a = pd.concat([fold_mean, fold_std], axis=1, keys=['mean', 'std'])
a.T

# 6. Simulation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

# 10-fold stratified CV. `random_state` is dropped: without shuffle=True it has no
# effect on the splits, and recent scikit-learn versions reject the combination.
kfold = StratifiedKFold(n_splits=10)

# LightGBM hyper-parameters (identical for every fold, so defined once).
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'num_class': 1,
          'metric': 'logloss',
          'learning_rate': 0.01,
          'num_leaves': 31,
          'max_depth': -1,
          'min_child_samples': 20,
          'max_bin': 255,
          'subsample': 0.8,
          'subsample_freq': 0,
          'colsample_bytree': 0.3,
          'min_child_weight': 5,
          'subsample_for_bin': 200000,
          'min_split_gain': 0,
          'reg_alpha': 0,
          'reg_lambda': 0,
          'nthread': 6,
          'verbose': 0}

# Additional clinical / FISH covariates (all remaining columns); object columns
# are one-hot encoded with the original column name as a prefix.
encoded_columns = []

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':

        values = pd.get_dummies(iss_fish_vars[column])

        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    encoded_columns.append(values)

addc = pd.concat(encoded_columns, axis=1)

addc.index = iss_fish_vars.index

# Modelling frame: the first column is the response, the rest are features.
# The DataFrame is kept so that fold rows can be mapped back to patients.
dataset = baseline_dataset.join(addc, how='inner').fillna(0)

all_ = dataset.values

x = all_[:, 1:]
y = all_[:, 0]

# Therapy dummy columns and their positions inside `x`
# (`x` drops the first column of `dataset`, hence the -1).
therapy_names = list(therapy.columns)

therapy_positions = [dataset.columns.get_loc(name) - 1 for name in therapy_names]

fold_frames = []

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index, :], y[train_index].ravel()

    x_valid, y_valid = x[valid_index, :], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)

    # Therapy actually received by the validation patients. The rows are taken
    # from `dataset`, because `valid_index` holds positions in that frame.
    # Indexing `therapy` with them selected the wrong patients, since `therapy`
    # contains every patient, not only those left after the inner joins.
    # Cast to float so that predicted scores can be stored in these columns.
    row = dataset.iloc[valid_index][therapy_names].astype(float)

    row['THERAPY'] = row[therapy_names].idxmax(axis=1)

    # For each candidate therapy, pretend every validation patient received it
    # (one-hot: that dummy set to 1, all other therapy dummies set to 0) and
    # store the predicted score in the column of that therapy.
    for lll, simulated_therapy in enumerate(therapy_names):

        for kkk, position in enumerate(therapy_positions):
            x_valid[:, position] = int(kkk == lll)

        row[simulated_therapy] = gbm.predict(x_valid)

    # Therapy with the highest predicted score for each patient.
    row['NEW_THERAPY'] = row[therapy_names].idxmax(axis=1)

    row['y'] = y_valid

    fold_frames.append(row)

simulation = pd.concat(fold_frames)

simulation

In [None]:
# Percentage of simulated patients whose best-scoring therapy differs from the
# therapy they actually received.
n_switched = (simulation['THERAPY'] != simulation['NEW_THERAPY']).sum()
n_patients = simulation.shape[0]

x = round(n_switched / n_patients * 100, 2)

print('{}%'.format(x))

In [None]:
# Persist the simulation table, including its index, as a comma-separated file.
simulation.to_csv('data/output/simulation.csv', sep=',', index=True)