In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from pathlib import Path

DATA = Path("/content/drive/My Drive/data/mini")

In [4]:
!pip install lifelines

Collecting lifelines
[?25l  Downloading https://files.pythonhosted.org/packages/f2/d9/b1cbebdd7ae6434ff1cc6c57e2f469bd6870d684f690409644f7f802686c/lifelines-0.24.6-py3-none-any.whl (326kB)
[K     |█                               | 10kB 17.3MB/s eta 0:00:01[K     |██                              | 20kB 3.2MB/s eta 0:00:01[K     |███                             | 30kB 4.2MB/s eta 0:00:01[K     |████                            | 40kB 4.4MB/s eta 0:00:01[K     |█████                           | 51kB 3.6MB/s eta 0:00:01[K     |██████                          | 61kB 4.1MB/s eta 0:00:01[K     |███████                         | 71kB 4.3MB/s eta 0:00:01[K     |████████                        | 81kB 4.7MB/s eta 0:00:01[K     |█████████                       | 92kB 5.0MB/s eta 0:00:01[K     |██████████                      | 102kB 4.8MB/s eta 0:00:01[K     |███████████                     | 112kB 4.8MB/s eta 0:00:01[K     |████████████                    | 122kB 4.8MB/s e

In [5]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/b5/26/9842333adbb8f17bcb3d699400a8b1ccde0af0b6de8d07224e183728acdf/bayesian_optimization-1.1.0-py3-none-any.whl
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.1.0


In [6]:
!pip install scikit-learn



### Training data

In [7]:
import pandas as pd

pfs_train = pd.read_csv(DATA / "pfs_train.txt", sep="\t", index_col="patient_id")
pfs_train.head()

Unnamed: 0_level_0,progression,time_to_progression_or_censor
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TRAIN_0000,1.0,334.0
TRAIN_0001,0.0,1523.0
TRAIN_0002,0.0,121.0
TRAIN_0003,1.0,334.0
TRAIN_0004,1.0,183.0


In [8]:
pfs_train.shape

(816, 2)

### Clinical data

m-metasthesis

n-lymph nodes

t-tumor size

pathologic stage combines the three above.

In [9]:
clin_train = pd.read_csv(DATA / "clinical_train.txt", sep="\t", index_col="patient_id")
clin_test = pd.read_csv(DATA / "clinical_test.txt", sep="\t", index_col="patient_id")

clin_train.head()

Unnamed: 0_level_0,age_at_initial_pathologic_diagnosis,anatomic_neoplasm_subdivision,diagnosis,eastern_cancer_oncology_group,ethnicity,gender,karnofsky_performance_score,laterality,location_in_lung_parenchyma,number_pack_years_smoked,pathologic_M,pathologic_N,pathologic_T,pathologic_stage,race,radiation_therapy,residual_tumor,tobacco_smoking_history
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TRAIN_0000,81,L-Upper,Lung Adenocarcinoma,[Not Available],[Not Available],MALE,[Not Available],[Not Available],[Not Available],32,M0,N2,T2,Stage IIIA,[Not Available],[Not Available],R2,4
TRAIN_0001,67,R-Lower,Lung Adenocarcinoma,[Not Available],[Not Available],MALE,[Not Available],[Not Available],Peripheral Lung,52,M0,N0,T2,Stage IB,[Not Available],[Not Available],R0,3
TRAIN_0002,79,R-Lower,Lung Adenocarcinoma,[Not Available],[Not Available],FEMALE,[Not Available],[Not Available],[Not Available],47,M0,N1,T3,Stage IIIA,[Not Available],[Not Available],R2,4
TRAIN_0003,68,L-Upper,Lung Adenocarcinoma,[Not Available],[Not Available],MALE,[Not Available],[Not Available],[Not Available],62,M0,N0,T2,Stage IB,[Not Available],[Not Available],R0,4
TRAIN_0004,66,R-Lower,Lung Adenocarcinoma,[Not Available],[Not Available],MALE,[Not Available],[Not Available],[Not Available],20,M0,N2,T2,Stage IIIA,[Not Available],[Not Available],RX,3


In [10]:
clin_train.shape

(816, 18)

In [11]:
clin_test.shape

(191, 18)

In [12]:
# convert `[Not Available]`, `[Unknown]`, `[Not Evaluable]` to `np.NaN`

import numpy as np

def replace_by_na(col: pd.Series, char: str = "["):
    idx = [char in str(x) for x in col]
    _col = col.copy()
    _col.loc[idx] = np.NaN
    return _col

for col in clin_train.columns:
    clin_train[col] = replace_by_na(clin_train[col])
    clin_test[col] = replace_by_na(clin_test[col])

clin_train.head()

Unnamed: 0_level_0,age_at_initial_pathologic_diagnosis,anatomic_neoplasm_subdivision,diagnosis,eastern_cancer_oncology_group,ethnicity,gender,karnofsky_performance_score,laterality,location_in_lung_parenchyma,number_pack_years_smoked,pathologic_M,pathologic_N,pathologic_T,pathologic_stage,race,radiation_therapy,residual_tumor,tobacco_smoking_history
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TRAIN_0000,81,L-Upper,Lung Adenocarcinoma,,,MALE,,,,32,M0,N2,T2,Stage IIIA,,,R2,4
TRAIN_0001,67,R-Lower,Lung Adenocarcinoma,,,MALE,,,Peripheral Lung,52,M0,N0,T2,Stage IB,,,R0,3
TRAIN_0002,79,R-Lower,Lung Adenocarcinoma,,,FEMALE,,,,47,M0,N1,T3,Stage IIIA,,,R2,4
TRAIN_0003,68,L-Upper,Lung Adenocarcinoma,,,MALE,,,,62,M0,N0,T2,Stage IB,,,R0,4
TRAIN_0004,66,R-Lower,Lung Adenocarcinoma,,,MALE,,,,20,M0,N2,T2,Stage IIIA,,,RX,3


In [51]:
# feature selection based on the literature and later machine-learning analysis

# # original selection
# cat_cols = ["diagnosis", "location_in_lung_parenchyma", "pathologic_M",
#             "pathologic_N", "pathologic_T", "radiation_therapy",
#             "residual_tumor"]
# cont_cols = ["age_at_initial_pathologic_diagnosis", "number_pack_years_smoked"]

# new selection
cat_cols = ["diagnosis", "gender", "pathologic_M", "pathologic_T", 
            "radiation_therapy", "residual_tumor"]
cont_cols = []

# One-hot encoding
x_clin_train = pd.concat([pd.get_dummies(clin_train[cat_cols]),
                          clin_train[cont_cols]], axis=1, sort=True)

x_clin_test = pd.concat([pd.get_dummies(clin_test[cat_cols]),
                         clin_test[cont_cols]], axis=1, sort=True)

x_clin_train.head()

Unnamed: 0_level_0,diagnosis_Lung Adenocarcinoma,diagnosis_Lung Squamous Cell Carcinoma,gender_FEMALE,gender_MALE,pathologic_M_M0,pathologic_M_M1,pathologic_M_M1a,pathologic_M_M1b,pathologic_M_MX,pathologic_T_T1,pathologic_T_T1a,pathologic_T_T1b,pathologic_T_T2,pathologic_T_T2a,pathologic_T_T2b,pathologic_T_T3,pathologic_T_T4,pathologic_T_TX,radiation_therapy_NO,radiation_therapy_YES,residual_tumor_R0,residual_tumor_R1,residual_tumor_R2,residual_tumor_RX
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
TRAIN_0000,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
TRAIN_0001,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
TRAIN_0002,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
TRAIN_0003,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
TRAIN_0004,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


### RNASeq data

These are already log2 transformed. 

In [14]:
ge_train = pd.read_csv(DATA / "rnaseq_train.txt", sep="\t", index_col=0)
ge_test = pd.read_csv(DATA / "rnaseq_test.txt", sep="\t", index_col=0)

ge_train.head()

Unnamed: 0,TRAIN_0001,TRAIN_0002,TRAIN_0003,TRAIN_0004,TRAIN_0005,TRAIN_0006,TRAIN_0007,TRAIN_0008,TRAIN_0009,TRAIN_0010,TRAIN_0011,TRAIN_0012,TRAIN_0013,TRAIN_0014,TRAIN_0015,TRAIN_0016,TRAIN_0017,TRAIN_0018,TRAIN_0019,TRAIN_0020,TRAIN_0021,TRAIN_0022,TRAIN_0023,TRAIN_0024,TRAIN_0025,TRAIN_0026,TRAIN_0027,TRAIN_0028,TRAIN_0029,TRAIN_0030,TRAIN_0031,TRAIN_0032,TRAIN_0033,TRAIN_0034,TRAIN_0035,TRAIN_0036,TRAIN_0037,TRAIN_0038,TRAIN_0039,TRAIN_0040,...,TRAIN_0776,TRAIN_0777,TRAIN_0778,TRAIN_0779,TRAIN_0780,TRAIN_0781,TRAIN_0782,TRAIN_0783,TRAIN_0784,TRAIN_0785,TRAIN_0786,TRAIN_0787,TRAIN_0788,TRAIN_0789,TRAIN_0790,TRAIN_0791,TRAIN_0792,TRAIN_0793,TRAIN_0794,TRAIN_0795,TRAIN_0796,TRAIN_0797,TRAIN_0798,TRAIN_0799,TRAIN_0800,TRAIN_0801,TRAIN_0802,TRAIN_0803,TRAIN_0804,TRAIN_0805,TRAIN_0806,TRAIN_0807,TRAIN_0808,TRAIN_0809,TRAIN_0810,TRAIN_0811,TRAIN_0812,TRAIN_0813,TRAIN_0814,TRAIN_0815
A1BG,-1.3364,-1.7668,-1.0198,-0.8983,-5.9363,0.3527,-3.0055,-2.3069,-2.1904,-1.2656,-1.3568,-0.7001,-2.8087,-1.079,-1.5845,-1.2057,-1.6855,-1.161,0.4233,-3.0186,-1.7536,-1.3744,-0.7374,-4.0285,-3.9233,-3.0484,-1.3048,-2.3104,-3.3131,-4.6724,-2.4712,-2.6891,-4.0212,-3.2784,-1.4477,-3.3057,-1.9225,-3.4339,-2.0238,-1.0988,...,-1.5283,-1.3231,-0.9713,-0.9121,-0.9251,-1.0919,0.2249,-0.6211,-3.3718,-2.0011,-0.9321,-1.0119,-1.3776,-1.4497,-2.0486,-3.0911,0.6557,0.4914,-0.1059,1.1786,-1.9261,-0.409,-0.3578,-1.9184,0.4643,0.5011,-3.6151,-2.0062,-1.6154,-1.8095,-0.3868,0.465,0.6863,-1.3412,-0.3093,-0.5301,-1.9589,-3.5359,-1.4197,-1.5915
A1BG-AS1,1.7801,0.7311,1.379,1.4652,-1.1814,2.4569,-0.3547,0.1567,-0.416,0.1432,1.0081,1.3865,0.2076,0.8685,-0.485,1.6472,1.0725,2.1088,1.9816,-0.6165,-0.1687,1.0956,2.0803,-0.5321,-0.7533,0.3515,0.6253,1.5405,0.9846,-1.128,0.6834,0.9465,-1.7841,0.5763,1.3927,-1.0927,0.3995,-1.5358,0.9184,2.0496,...,1.1689,0.9989,1.554,1.3656,1.4299,0.8419,2.1225,1.1096,-0.4218,1.5061,1.0749,1.8351,0.4968,1.0799,0.6248,-0.1076,2.4359,2.0764,2.0007,3.6502,0.1443,2.3077,1.121,0.4837,2.2172,2.0787,0.0854,0.3697,0.9461,0.9194,1.0042,2.7258,3.2015,1.5483,2.0472,1.6936,-0.0731,-1.6099,0.9022,0.3975
A1CF,-4.3104,-6.9762,-6.0047,-2.7964,1.4124,-6.1761,-7.6493,-1.7065,-4.1164,-6.1236,-5.4997,-5.0364,-6.2681,-2.1363,0.8936,-3.2891,4.6606,-3.7933,-3.4446,3.381,-5.8411,-4.262,-4.7193,-6.8359,-5.5082,-6.2184,-4.9292,-5.1178,-6.483,-6.2573,-6.8635,-5.4965,-3.6586,-5.6003,-6.6572,-5.1801,-6.9669,-6.8933,-5.5681,-6.5907,...,-3.8503,-6.5325,-4.5958,-6.64,-5.4486,-6.3773,-3.6819,-2.6421,-5.2462,-6.756,-3.7972,-6.3695,-5.7699,-4.0819,-5.3216,-6.5505,-5.4528,-6.1668,-2.2473,-1.6288,-3.4595,-4.7548,-4.3275,-3.9888,2.9479,-4.2104,-5.937,-4.8136,-4.9846,-6.5644,-6.1147,-2.3639,-4.3047,-6.6266,-1.3841,-2.4416,-4.8975,-3.7423,-2.3708,-5.051
A2M,10.283,9.78,10.0746,11.1037,10.1339,9.2526,9.9767,10.8297,7.7219,10.6659,8.22,9.3291,8.6852,9.4124,10.7804,10.0078,9.149,9.0152,7.9316,8.7049,10.1049,10.1162,7.5333,7.9038,7.5614,7.7608,7.6639,8.8712,7.0693,7.6023,8.22,8.7249,7.1395,8.7063,8.2883,7.4633,7.1652,7.1328,10.0292,8.1975,...,10.7359,10.1068,9.7267,9.8094,9.0229,6.3728,7.9693,7.1119,6.3637,10.8858,7.7719,6.7333,6.4983,7.5106,7.8337,6.9946,7.7285,8.4274,9.0582,9.8243,9.179,9.2028,11.1617,10.0291,10.1678,7.7649,6.6188,6.4436,8.488,7.796,5.2123,10.2018,8.2129,8.3069,7.2597,7.9215,7.8997,9.5837,11.1142,9.226
A2M-AS1,-0.4035,-0.7095,1.208,-0.2786,-0.3216,-0.6842,-0.4894,-0.6612,1.3232,0.0663,-0.2903,0.249,0.1413,0.0721,-1.333,-0.5834,0.8852,0.1136,0.1161,-0.4684,-1.3175,1.4217,-0.1464,0.4215,0.6279,0.7474,-0.0713,1.8011,-0.0736,0.1177,0.8712,1.4693,0.5024,-0.9954,-0.9292,0.6012,0.8973,0.2462,0.3467,-1.2331,...,-0.0878,0.2619,0.5058,0.458,0.1856,1.3709,0.4428,1.3093,-0.9486,0.8513,1.222,0.4507,0.7999,0.6074,0.4772,0.6879,-0.3513,-0.1444,-0.9085,-1.1982,-0.3238,0.0353,0.0649,0.1971,0.4249,-1.4879,-0.2091,-1.1131,0.2249,1.2494,0.7054,-0.308,-0.9542,-1.6724,0.6538,-0.1474,0.6365,0.3054,-0.3004,-0.6586


In [15]:
ge_train.shape

(36926, 807)

In [16]:
ge_test.shape

(36926, 190)

In [17]:
mat = pd.concat([pfs_train, ge_train.transpose()], axis=1, sort=True).dropna()

# get concordance index and only use the top 100 genes

from tqdm import tqdm
from lifelines.utils import concordance_index

out = []

for i in tqdm(range(2, mat.shape[1]), total=mat.shape[1] - 2):
    out.append((mat.columns[i],
                concordance_index(mat.time_to_progression_or_censor,
                                  mat.iloc[:, i],
                                  mat.progression)))
    
cidx_df = pd.DataFrame(out)
cidx_df.columns = ["gene", "cidx"]
cidx_df["dev_from_random"] = (cidx_df.cidx - 0.5).abs()
cidx_df.sort_values("dev_from_random", ascending=False, inplace=True)

top_genes = cidx_df.iloc[:100].gene.values

100%|██████████| 36926/36926 [06:36<00:00, 93.08it/s]


In [18]:
x_ge_train = ge_train.loc[top_genes].transpose()
x_ge_test = ge_test.loc[top_genes].transpose()
x_ge_train.head() 

Unnamed: 0,SESN3,TESK2,NTRK2,SMARCA1,FMO4,FGFR2,HLF,KRT42P,RGMA,KRT15,CYP2S1,AKAP12,LINC00862,SEMA4A,CD9,TUBB3,DAPL1,UGGT2,DBP,GOLM1,GSTM2,STRIP2,ATP13A5,ITGA9-AS1,SDR42E1,OR7E28P,STAR,FMO6P,NKILA,PRSS12,GRHL3,TEF,ST6GALNAC2,PCDHA13,HTR1D,ITGB1-DT,CSTA,STARD5,PNMA1,ADAMTS6,...,JPH1,WSCD2,SLC52A1,APOBEC3C,RHOF,ABL2,CBX7,BCL11A,ASTE1,SUSD4,TRPV4,SERPINB13,COLCA1,MIR205HG,INKA1,LINC00885,ADH7,APOBEC3F,ADAMTS17,TM9SF1,LRP10,HARS2,CYB5R2,SLC9A9,COBLL1,FBXO4,PCDHA2,FAM222B,IRX6,SLC15A2,SYNPR-AS1,LNCTAM34A,PVR,PCDHA3,SLC1A4,NTF4,KLRG2,PCDHA5,LINC00240,BZW1
TRAIN_0001,7.6986,2.7103,1.1814,4.8155,2.5635,4.7445,5.8091,-1.7602,3.6904,6.9574,4.4667,3.2496,-4.3104,5.2575,7.3454,3.7533,-7.1178,4.3585,4.3726,9.3852,3.7394,0.6701,0.1396,0.8189,5.0105,-7.1178,-2.8699,-2.7255,0.3821,6.1695,1.9402,5.3249,2.6685,-0.4884,-3.0303,-3.4173,1.8393,1.6537,4.4611,-0.928,...,2.5529,-0.4596,1.7981,5.4814,1.433,5.9072,4.739,4.1425,3.023,4.2708,2.906,-4.7959,3.8616,1.2354,0.6438,-1.9885,-4.3104,1.8566,1.1062,3.267,8.2852,4.6599,3.6195,2.6152,5.4221,3.0205,-0.4596,6.0084,-0.4596,4.4667,0.1583,-1.6259,5.2532,-0.968,5.6315,0.0012,3.2756,-2.2598,0.2831,7.3543
TRAIN_0002,5.7339,1.9924,-0.4684,6.6989,1.6926,2.4437,6.1519,-4.1689,0.9721,4.6508,5.292,4.8696,-2.8888,5.0178,7.8213,5.2529,-2.1183,4.1211,3.0025,8.3084,2.4934,4.1639,-4.1689,-0.4684,4.1665,-6.9762,-3.2758,-3.5168,4.252,2.0817,1.1987,4.5081,5.0664,-1.3615,3.7845,2.9823,2.2012,0.6311,6.1302,0.9003,...,2.2012,-3.0694,0.6458,6.3836,3.4677,5.9632,3.9026,1.2086,3.232,1.1582,1.7753,-2.8888,-0.6364,3.3378,0.1633,-2.022,-1.9318,3.4985,0.6891,3.4719,8.467,5.1365,2.823,2.4729,4.8775,3.7961,-2.7283,6.4135,0.863,2.1086,-0.7864,-1.6187,7.7549,-1.6908,4.499,0.6748,1.4873,-3.2758,-1.7668,7.8156
TRAIN_0003,4.6815,3.0889,-0.6472,5.5111,1.8261,5.1153,1.6026,-2.9458,4.3729,6.5928,5.6967,4.2841,1.3082,5.7528,9.4865,3.1064,-2.6355,5.3453,4.2412,8.9082,3.8794,0.882,-2.3043,1.0139,4.1198,-5.2678,-4.7823,-5.2678,0.6911,3.4513,1.6748,5.1188,3.5024,-3.1974,1.9436,-0.9603,2.5201,1.8133,6.0168,1.0721,...,1.0933,2.4938,2.8355,6.35,2.8972,5.4903,4.7353,1.0863,3.3492,3.973,4.3385,-7.5897,2.9446,-0.2588,0.8738,-1.2146,-1.975,2.9754,-1.3999,3.2472,8.1029,5.1192,3.624,4.331,5.1363,3.2233,0.7897,5.776,-2.1634,4.5572,-7.5897,-0.6472,6.3064,1.8048,5.6714,1.3847,0.2241,2.2605,-0.1385,7.0397
TRAIN_0004,5.6656,2.448,3.5271,7.171,2.3102,3.9597,3.7342,-3.0859,1.8263,5.2524,3.4112,4.7049,1.8786,5.8313,8.8594,3.3272,-3.4485,3.7734,2.448,6.8181,4.1354,1.4095,0.5385,1.3216,4.2823,-6.2559,-3.9339,-3.9339,2.1148,4.5181,2.9511,4.2025,3.6037,1.5707,0.0479,-3.4485,1.9968,0.8422,4.3244,0.0109,...,4.3598,-0.423,-0.8296,5.1033,2.9705,5.5462,5.1509,1.9873,3.1386,4.435,1.6689,-6.2559,3.0365,-3.9339,1.3952,0.1535,1.2598,1.5579,0.1871,2.7581,8.3404,5.3739,3.0318,2.9215,6.2674,3.2618,-3.4485,5.4407,0.1535,5.2041,0.924,1.057,5.1751,-0.8983,4.4506,1.0203,1.4788,-1.8636,0.4584,7.5407
TRAIN_0005,6.8155,2.6298,-1.6884,6.396,2.8998,3.4409,3.5129,-5.9363,-0.1034,0.6029,1.3398,4.8654,-2.0294,6.668,8.1921,2.9796,-3.6144,5.0157,3.2411,10.3767,0.6029,4.5364,-1.4127,0.4731,-0.1549,-5.9363,-4.3513,-4.3513,-0.9821,4.4805,-0.9821,3.8823,0.7502,-3.6144,-2.0294,-2.7664,-0.807,1.2829,4.2695,0.8839,...,1.119,-1.0783,-3.6144,3.7899,2.2877,4.7103,4.3526,3.7449,3.0091,4.5056,-0.1549,-5.9363,4.6458,-3.1289,-0.1034,0.6336,-3.6144,0.8581,-1.6884,3.2949,7.947,4.2424,1.3021,2.4517,6.606,4.1709,-1.2924,5.7304,-1.8488,4.6345,2.0003,-2.0294,4.4742,-0.2084,4.8094,-0.4444,-4.3513,-3.6144,0.041,7.9028


#### Weighted correlation network analysis

Do a coexpression analysis on the RNAseq data to reduce colinearity in the modeling.

This has to be done in R. Then, I imported the processed RNAseq data back.

In [0]:
# load processed data back
x_ge_train_ica = pd.read_csv(DATA / "x_ge_train_new.csv", index_col=0)
x_ge_test_ica = pd.read_csv(DATA / "x_ge_test_new.csv", index_col=0)

In [20]:
x_ge_train_ica

Unnamed: 0,ME2,ME1,ME3,SMARCA1,FMO4,HLF,AKAP12,LINC00862,SEMA4A,TUBB3,UGGT2,DBP,GSTM2,STRIP2,ATP13A5,ITGA9.AS1,SDR42E1,NKILA,PRSS12,TEF,PCDHA13,HTR1D,ITGB1.DT,PNMA1,ADAMTS6,LINC00346,FAM83A.AS1,BLOC1S4,TP53I3,VAV2,INSYN1.AS1,PLEKHG2,LINC02598,GVQW2,DNAJC10,CTTNBP2,IBTK,FAM117A,FUBP1,JPH1,WSCD2,SLC52A1,RHOF,ABL2,CBX7,ASTE1,COLCA1,TM9SF1,LRP10,HARS2,SLC9A9,COBLL1,FBXO4,FAM222B,IRX6,SLC15A2,SYNPR.AS1,PVR,SLC1A4,KLRG2,LINC00240,BZW1
TRAIN_0001,0.024299,-0.027388,-0.028964,4.8155,2.5635,5.8091,3.2496,-4.3104,5.2575,3.7533,4.3585,4.3726,3.7394,0.6701,0.1396,0.8189,5.0105,0.3821,6.1695,5.3249,-0.4884,-3.0303,-3.4173,4.4611,-0.9280,2.0696,2.0294,3.2365,3.9191,6.1564,-7.1178,4.8163,-7.1178,-2.3629,6.8538,0.2131,6.4015,4.8476,6.6967,2.5529,-0.4596,1.7981,1.4330,5.9072,4.7390,3.0230,3.8616,3.2670,8.2852,4.6599,2.6152,5.4221,3.0205,6.0084,-0.4596,4.4667,0.1583,5.2532,5.6315,3.2756,0.2831,7.3543
TRAIN_0002,-0.003778,-0.037041,0.024696,6.6989,1.6926,6.1519,4.8696,-2.8888,5.0178,5.2529,4.1211,3.0025,2.4934,4.1639,-4.1689,-0.4684,4.1665,4.2520,2.0817,4.5081,-1.3615,3.7845,2.9823,6.1302,0.9003,3.9314,3.0336,4.7940,6.2048,5.9857,-6.9762,6.8538,-4.6543,-2.5839,7.4083,1.2185,5.8321,4.2447,6.6836,2.2012,-3.0694,0.6458,3.4677,5.9632,3.9026,3.2320,-0.6364,3.4719,8.4670,5.1365,2.4729,4.8775,3.7961,6.4135,0.8630,2.1086,-0.7864,7.7549,4.4990,1.4873,-1.7668,7.8156
TRAIN_0003,0.083562,-0.026094,0.012784,5.5111,1.8261,1.6026,4.2841,1.3082,5.7528,3.1064,5.3453,4.2412,3.8794,0.8820,-2.3043,1.0139,4.1198,0.6911,3.4513,5.1188,-3.1974,1.9436,-0.9603,6.0168,1.0721,2.5588,0.0898,4.1446,4.5887,6.1373,-6.0047,5.0616,-0.4298,-1.0819,7.9030,-0.2410,5.6456,3.9027,6.7921,1.0933,2.4938,2.8355,2.8972,5.4903,4.7353,3.3492,2.9446,3.2472,8.1029,5.1192,4.3310,5.1363,3.2233,5.7760,-2.1634,4.5572,-7.5897,6.3064,5.6714,0.2241,-0.1385,7.0397
TRAIN_0004,0.006362,-0.025608,-0.043096,7.1710,2.3102,3.7342,4.7049,1.8786,5.8313,3.3272,3.7734,2.4480,4.1354,1.4095,0.5385,1.3216,4.2823,2.1148,4.5181,4.2025,1.5707,0.0479,-3.4485,4.3244,0.0109,1.5057,0.8837,3.9789,5.8293,5.3676,-6.2559,6.7112,-4.6709,-2.0079,6.2497,1.4095,6.4795,4.1461,6.6350,4.3598,-0.4230,-0.8296,2.9705,5.5462,5.1509,3.1386,3.0365,2.7581,8.3404,5.3739,2.9215,6.2674,3.2618,5.4407,0.1535,5.2041,0.9240,5.1751,4.4506,1.4788,0.4584,7.5407
TRAIN_0005,0.014890,-0.052556,-0.084936,6.3960,2.8998,3.5129,4.8654,-2.0294,6.6680,2.9796,5.0157,3.2411,0.6029,4.5364,-1.4127,0.4731,-0.1549,-0.9821,4.4805,3.8823,-3.6144,-2.0294,-2.7664,4.2695,0.8839,2.2386,1.7572,4.2671,4.4994,7.0487,-5.9363,6.6016,-4.3513,-2.4769,7.8462,2.4858,7.5481,3.2511,6.6079,1.1190,-1.0783,-3.6144,2.2877,4.7103,4.3526,3.0091,4.6458,3.2949,7.9470,4.2424,2.4517,6.6060,4.1709,5.7304,-1.8488,4.6345,2.0003,4.4742,4.8094,-4.3513,0.0410,7.9028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRAIN_0811,-0.014569,0.052727,0.009399,5.9841,4.2622,5.0415,4.0602,-2.9270,5.7861,2.7417,4.7336,5.1742,4.7959,2.7604,1.8279,0.4045,5.6747,1.3108,5.2088,4.9536,-4.5119,-3.1334,-4.0265,4.8408,1.3108,0.5149,1.7548,2.9686,4.7402,4.9520,-5.2489,5.0190,-0.9031,0.0365,7.3418,4.6474,5.9258,3.8949,6.8740,5.2386,3.2037,3.0069,-0.0395,5.4883,3.7539,3.1675,2.0085,2.1057,7.4119,5.0112,4.9859,7.1226,3.7121,5.0291,2.3829,5.9506,-1.7046,6.0657,7.5995,2.7111,0.4422,7.2166
TRAIN_0812,0.006618,0.054555,0.051956,3.8740,5.0639,5.7669,4.1239,-1.7276,5.5588,-1.1249,3.9343,4.0381,5.3437,1.9055,2.9912,1.3817,-1.1971,1.3567,4.8646,5.8307,-4.8975,-3.6751,-4.1605,4.6843,0.0253,-0.8678,1.4423,2.8190,4.0654,4.1460,-6.4825,4.8490,-0.2537,1.5890,6.8818,4.0003,5.7460,4.3887,5.5333,5.4523,-1.5283,2.4274,-0.9279,4.3208,5.5101,3.8980,2.6949,2.6647,6.5968,4.7080,4.2343,6.0310,3.5876,4.7755,4.5721,5.9219,0.4833,4.5112,6.8744,2.1935,2.3087,6.8536
TRAIN_0813,0.010843,-0.017435,-0.050986,3.5914,4.2354,1.4850,6.4769,-2.6879,5.1800,3.3798,5.2527,3.0177,2.4261,1.9730,2.1848,0.9538,5.9405,1.5431,2.4074,3.5409,-3.0504,-0.5002,-2.4886,5.8830,1.1157,1.6839,-2.1574,3.4528,3.6334,5.4232,-5.8578,5.2000,-7.4428,0.6710,7.3892,-1.4204,6.1050,5.4139,7.6187,3.7514,-2.9192,-2.7989,0.5629,5.4216,2.9613,4.6569,5.3010,3.1930,6.7622,5.8509,0.4581,3.8069,2.0906,5.6472,1.7044,6.2508,-2.2333,4.6733,6.3938,1.5317,0.3054,7.3754
TRAIN_0814,0.026022,-0.018171,0.006374,5.8002,2.5329,3.4280,4.7523,-2.8182,6.1583,0.4638,4.4847,3.4349,4.4528,1.1735,-0.0109,1.6707,3.6307,2.1017,3.2400,4.6240,-0.7231,-1.5228,-3.9557,4.7958,1.7391,0.1653,-6.2777,3.4789,4.0386,5.9569,-4.6927,5.3621,-6.2777,-1.8853,7.3060,2.0666,5.8762,5.0616,6.2134,1.2846,-0.0109,0.0622,1.9367,5.0095,5.8141,3.2439,6.2314,3.2518,8.5221,4.5795,3.6575,5.5517,3.9500,4.7398,-2.5772,3.6924,-0.2116,4.9152,5.7149,-3.9557,-0.3950,6.2204


In [21]:
x_ge_test_ica

Unnamed: 0,ME2,ME1,ME3,SMARCA1,FMO4,HLF,AKAP12,LINC00862,SEMA4A,TUBB3,UGGT2,DBP,GSTM2,STRIP2,ATP13A5,ITGA9.AS1,SDR42E1,NKILA,PRSS12,TEF,PCDHA13,HTR1D,ITGB1.DT,PNMA1,ADAMTS6,LINC00346,FAM83A.AS1,BLOC1S4,TP53I3,VAV2,INSYN1.AS1,PLEKHG2,LINC02598,GVQW2,DNAJC10,CTTNBP2,IBTK,FAM117A,FUBP1,JPH1,WSCD2,SLC52A1,RHOF,ABL2,CBX7,ASTE1,COLCA1,TM9SF1,LRP10,HARS2,SLC9A9,COBLL1,FBXO4,FAM222B,IRX6,SLC15A2,SYNPR.AS1,PVR,SLC1A4,KLRG2,LINC00240,BZW1
TEST_0000,0.039042,0.001807,-0.027294,5.9157,2.4955,4.8642,2.5352,0.0431,6.2411,3.2963,3.5323,4.5753,3.3418,2.2457,-2.5980,-0.3321,4.3722,-0.6865,3.1884,4.4480,1.1946,0.1896,-3.8204,5.8460,-0.7615,1.3717,0.2289,4.5484,6.4411,5.3161,-6.9903,4.4207,-4.6684,-1.1076,7.5322,0.8615,5.7925,5.5535,6.8320,3.6454,-0.9679,0.3584,2.4794,5.1756,4.3678,3.4412,4.0216,3.9174,9.1013,4.6817,3.7436,7.3420,3.7058,5.8753,-1.7808,3.2683,1.0704,5.9915,6.2034,2.2361,-2.3464,7.5023
TEST_0001,0.084654,0.016846,-0.086417,7.2655,0.2529,4.4238,4.7097,-1.9265,6.0246,3.7625,4.2328,4.1697,2.7902,4.8150,1.6081,-0.2860,2.2101,2.4165,-0.3992,4.7418,-1.1560,-6.0139,-6.0139,6.3856,1.6514,3.0972,2.9432,4.7780,5.1301,5.4002,-4.4290,6.8446,-3.6920,-4.4290,6.8792,0.1358,5.7355,4.3535,6.3727,3.6917,-6.0139,1.7343,0.9973,5.1937,3.7426,2.5213,3.3012,4.8276,9.7910,5.0874,2.2004,4.5301,2.4819,5.7954,5.0952,0.6726,-0.6564,6.7399,5.9976,2.7037,0.0946,7.4083
TEST_0002,0.039175,0.003275,-0.000054,6.8537,3.5549,2.3625,5.9026,0.5532,3.6047,-0.0434,6.0232,4.1254,2.7620,3.9066,2.9418,1.3533,4.5871,2.1351,5.3152,4.7923,1.7040,2.4616,-0.1295,5.8085,0.1024,6.2932,-2.2450,3.0222,4.0972,5.8965,-6.1519,4.1802,-7.7368,0.1519,6.6429,-1.2290,6.0329,3.4854,7.0396,3.0568,-3.0930,0.0247,1.5963,4.5552,4.6865,3.9588,3.1738,3.3905,7.8584,5.1275,3.4173,3.2035,4.1599,4.0773,5.1702,5.1128,-2.1221,4.9781,6.7562,1.3048,0.4775,6.7271
TEST_0003,0.000632,0.012019,-0.022480,6.6299,2.7688,4.3905,4.3655,-3.3634,5.2285,4.0520,4.0760,4.0807,2.4329,1.4067,-2.2639,2.0042,3.4085,1.6301,1.1421,5.0478,-3.8488,-0.0210,-3.8488,4.9948,-0.1484,1.8008,1.4217,3.3547,5.2317,6.1452,-6.1708,5.5669,-2.4703,-1.9228,7.0627,1.7776,5.7275,4.4170,6.2127,3.7195,-1.3128,0.6236,2.8872,6.0667,4.5166,3.3151,3.6794,3.0795,8.3770,4.4937,2.4476,5.6821,3.8363,5.6984,-2.7113,4.4919,0.5974,5.5720,5.7015,0.6494,0.6747,6.9619
TEST_0004,0.051199,-0.007716,-0.045187,5.3040,1.7610,4.1097,2.2649,-5.0334,7.3613,2.2126,4.6880,3.8523,3.7020,4.9350,-3.8959,0.8878,4.7991,0.0455,1.3760,3.7707,-0.5352,-4.1854,-3.8959,4.7443,-0.1557,1.5242,-4.5480,4.6065,5.1802,6.2949,-0.6971,6.8002,-1.0155,-0.3441,7.1409,1.8949,5.9924,4.4330,6.5862,7.7854,-2.8318,-0.4850,1.8369,5.2683,5.1665,3.0593,4.5383,2.6291,7.7031,5.4478,4.7009,5.9334,3.8547,4.6555,-4.5480,4.9652,0.8096,5.8287,5.8603,0.8196,-0.0607,6.5933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TEST_0186,0.012503,0.011345,-0.059421,5.5936,2.2292,0.1957,4.4332,0.9178,4.4280,4.3663,5.9044,4.3288,3.9701,4.3359,1.2382,0.6382,3.7714,3.8885,1.7558,3.9336,-2.7678,2.9746,-0.6086,5.3341,0.6916,2.7326,3.5015,4.6801,6.1870,6.1048,-6.6747,5.6064,-6.6747,-1.6303,7.2204,-0.1989,5.5406,4.3594,6.7436,2.6359,-0.4848,1.7724,4.5595,5.1704,4.3975,2.6938,-1.2484,3.0344,7.7220,4.8425,1.6518,4.8494,4.3690,5.8667,-3.5047,5.3728,-6.6747,6.4033,4.7246,1.8606,0.4234,7.5070
TEST_0187,0.059970,-0.002144,-0.048656,5.0164,3.7751,1.0539,4.0363,-3.2483,5.2041,2.5521,2.8411,3.9988,7.8944,2.5846,-2.4598,0.7435,4.6416,1.5066,3.1204,4.2179,-3.9003,-2.6202,-1.3501,5.2205,-0.5992,0.6932,3.3462,3.4534,3.9568,6.5648,-6.7077,5.9449,0.6932,0.4522,5.9201,1.3312,5.7345,3.4957,6.3034,3.7892,4.1634,1.9180,0.4722,5.0443,4.2817,3.3811,4.5852,2.7332,7.6557,4.7817,2.3557,5.7642,3.9953,3.5579,-0.3327,4.2501,-1.4982,5.5148,5.6638,1.3202,2.4087,7.5606
TEST_0188,0.075967,-0.023680,-0.085571,6.3943,2.6133,0.7741,0.3053,-4.3386,5.9266,3.1126,2.8010,3.9606,3.6849,5.1204,-2.0166,1.8193,4.2183,0.0260,4.6185,3.6028,-6.6605,-3.8531,-4.3386,4.6255,-1.1059,0.5194,3.9403,4.2901,4.8056,4.6620,-6.6605,6.3960,-3.4906,-2.0166,6.2246,-4.3386,4.5605,4.7767,7.0855,3.9366,-5.0755,2.8689,1.5441,5.0464,2.3310,3.8122,-3.2011,3.3688,7.5955,5.1759,2.8806,6.2774,3.7113,4.8508,-3.8531,3.0451,-6.6605,5.2911,3.0927,3.6670,2.2613,7.2280
TEST_0189,0.032087,0.004350,0.008015,4.9423,2.1085,-2.3692,6.7733,-0.7093,6.1158,3.1656,3.6726,1.8666,4.1563,0.9353,-7.0130,2.2930,3.9989,1.4667,3.5095,1.7385,-0.0942,1.9324,-1.3983,4.9781,-0.5701,2.5511,1.6344,4.0510,5.6134,6.3107,-5.4281,5.5073,-4.6911,0.6090,7.5898,-5.4281,5.5083,3.5878,6.5296,2.4195,-4.2057,3.2170,3.8871,5.2221,4.6818,3.6585,1.5682,2.8151,7.4611,5.3293,0.2998,4.4127,4.8242,4.8091,0.1468,3.1854,-7.0130,6.4909,5.2441,1.5682,1.1619,7.4597


### Mutation data

In [22]:
mut_train = pd.read_csv(DATA / "mut_train.txt", sep="\t", low_memory=False)
mut_test = pd.read_csv(DATA / "mut_test.txt", sep="\t", low_memory=False)

mut_train.head()

Unnamed: 0,AA_MAF,AFR_MAF,ALLELE_NUM,AMR_MAF,ASN_MAF,Allele,Amino_acids,BAM_File,BIOTYPE,CANONICAL,CCDS,CDS_position,CLIN_SIG,CONTEXT,COSMIC,Center,Chromosome,Codons,Consequence,DISTANCE,DOMAINS,EAS_MAF,EA_MAF,ENSP,EUR_MAF,EXON,End_Position,Entrez_Gene_Id,ExAC_AF,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_Adj,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,Existing_variation,Exon_Number,FILTER,...,RefSeq,Reference_Allele,SAS_MAF,SIFT,SOMATIC,SWISSPROT,SYMBOL,SYMBOL_SOURCE,Score,Sequence_Source,Sequencer,Sequencing_Phase,Start_Position,Strand,TRANSCRIPT_STRAND,TREMBL,TSL,Transcript_ID,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,UNIPARC,VARIANT_CLASS,Validation_Method,Validation_Status,Variant_Classification,Variant_Type,Verification_Status,all_effects,cDNA_position,dbSNP_RS,dbSNP_Val_Status,n_alt_count,n_depth,n_ref_count,t_alt_count,t_depth,t_ref_count,patient_id
0,,,1,,,A,,,protein_coding,YES,CCDS37.1,-/1779,,CAGAAGCTAAG,,BI,chr1,,intron_variant,,,,,ENSP00000367830,,,2156152,5590,,,,,,,,,,,,PASS,...,NM_002744.4,G,,,,Q05513,PRKCZ,HGNC,,,Illumina HiSeq 2000,,2156152,+,1.0,,1.0,ENST00000378567,G,A,,,UPI0000169EB7,SNV,,,Intron,SNP,,"PRKCZ,intron_variant,,ENST00000378567,NM_00274...",-/2326,novel,,,24,,4,12,8,TRAIN_0693
1,,,1,,,A,F/L,,protein_coding,YES,CCDS122.1,678/1245,,CCATTCAGGGC,,BI,chr1,ttC/ttA,missense_variant,,Pfam_domain:PF00076;PROSITE_profiles:PS50102;S...,,,ENSP00000240185,,5/6,11020563,23435,,,,,,,,,,,5/6,PASS,...,NM_007375.3,C,,deleterious(0),,Q13148,TARDBP,HGNC,,,Illumina HiSeq 2000,,11020563,+,1.0,A0A024R4E2,1.0,ENST00000240185,C,A,,,UPI0000136B42,SNV,,,Missense_Mutation,SNP,,"TARDBP,missense_variant,p.F226L,ENST0000024018...",1034/5367,novel,,,136,,36,126,90,TRAIN_0693
2,,,1,,,T,S/I,,protein_coding,YES,CCDS30694.2,4157/10128,,GATCAGTCGTC,,BI,chr1,aGt/aTt,missense_variant,,,,,ENSP00000457168,,29/71,43430030,23334,8e-06,0.0,0.0,8e-06,0.0,0.0,1.5e-05,0.0,0.0,rs765781424,29/71,PASS,...,NM_015284.3,G,,tolerated(0.36),,Q5T011,SZT2,HGNC,,,Illumina HiSeq 2000,,43430030,+,1.0,,5.0,ENST00000562955,G,T,,,UPI0001E24F46,SNV,,,Missense_Mutation,SNP,,"SZT2,missense_variant,p.S1386I,ENST00000562955...",4157/12281,rs765781424,,,160,,43,179,136,TRAIN_0693
3,,,1,,,C,H,,protein_coding,YES,CCDS504.1,1287/1470,,GATCATGGAAG,,BI,chr1,caT/caC,synonymous_variant,,TIGRFAM_domain:TIGR00272,,,ENSP00000255108,,5/6,43972276,1802,,,,,,,,,,,5/6,PASS,...,NM_001384.4,T,,,,Q9BQC3,DPH2,HGNC,,,Illumina HiSeq 2000,,43972276,+,1.0,,1.0,ENST00000255108,T,C,,,UPI0000070CCB,SNV,,,Silent,SNP,,"DPH2,synonymous_variant,p.H429H,ENST0000025510...",1459/2472,novel,,,94,,31,107,76,TRAIN_0693
4,,0.0,1,0.0,,T,,,protein_coding,YES,CCDS551.1,-/1467,,TTTTTAAAAAA,,BI,chr1,,3_prime_UTR_variant,,,0.002,,ENSP00000360913,0.0,13/13,48298627,54558,,,,,,,,,,rs527462674,13/13,PASS,...,NM_019073.3,A,0.0,,,Q9NWH7,SPATA6,HGNC,,,Illumina HiSeq 2000,,48298627,+,-1.0,,2.0,ENST00000371847,A,T,,,UPI0000049C41,SNV,,,3'UTR,SNP,,"SPATA6,3_prime_UTR_variant,,ENST00000371847,NM...",1718/4973,rs527462674,by1000G;byFrequency,,31,,4,39,35,TRAIN_0693


In [0]:
# keep a record of the patients who exist in the mutation files
mut_pts_train = set(mut_train.patient_id)
mut_pts_test = set(mut_test.patient_id)

In [0]:
# pivot the mutation data
x_mut_train = mut_train[
    (mut_train.IMPACT.isin({"MODERATE", "HIGH"})) & (mut_train.FILTER == "PASS")
].pivot_table(index="patient_id", columns="Hugo_Symbol", values="t_alt_count")
# convert the values to 1 if there is a mutation, else fill in 0
x_mut_train[x_mut_train > 0] = 1
x_mut_train.fillna(0, inplace=True)

x_mut_test = mut_test[
    (mut_test.IMPACT.isin({"MODERATE", "HIGH"})) & (mut_test.FILTER == "PASS")
].pivot_table(index="patient_id", columns="Hugo_Symbol", values="t_alt_count")
# convert the values to 1 if there is a mutation, else fill in 0
x_mut_test[x_mut_test > 0] = 1
x_mut_test.fillna(0, inplace=True)

In [25]:
x_mut_train.head()

Hugo_Symbol,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,AADACL3,AADACL4,AADAT,AAED1,AAGAB,AAK1,AAMP,AANAT,AAR2,AARD,AARS,AARS2,AASDH,AASDHPPT,AASS,AATF,AATK,ABAT,ABCA1,ABCA10,ABCA12,ABCA13,ABCA2,ABCA3,ABCA4,ABCA5,ABCA6,ABCA7,ABCA8,...,ZRSR2,ZSCAN1,ZSCAN10,ZSCAN12,ZSCAN16,ZSCAN18,ZSCAN2,ZSCAN20,ZSCAN21,ZSCAN22,ZSCAN23,ZSCAN25,ZSCAN26,ZSCAN29,ZSCAN30,ZSCAN31,ZSCAN32,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSCAN9,ZSWIM1,ZSWIM2,ZSWIM3,ZSWIM4,ZSWIM5,ZSWIM6,ZSWIM8,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
TRAIN_0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAIN_0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
TRAIN_0003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAIN_0004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAIN_0005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
x_mut_train.shape

(731, 17586)

In [27]:
x_mut_test.shape

(175, 13008)

In [0]:
# keep the top 100 most frequently mutated genes

mut_freq = x_mut_train.sum(axis=0).sort_values(ascending=False)
top_mut_genes = mut_freq.index.values[:100]

In [0]:
x_mut_train_top = x_mut_train[top_mut_genes]
x_mut_test_top = x_mut_test[top_mut_genes]

### Combine all the data

In [0]:
# add suffix of `_mut` to all the genes in mut table to prevent duplicated gene names from mutation matrix and ge matrix

x_mut_train_top.columns = [x + "_mut" for x in x_mut_train_top.columns]
x_mut_test_top.columns = [x + "_mut" for x in x_mut_test_top.columns]

In [0]:
# combine all modalities
x_train = pd.concat([x_clin_train, x_ge_train_ica, x_mut_train_top], axis=1, sort=True)
x_test = pd.concat([x_clin_test, x_ge_test_ica, x_mut_test_top], axis=1, sort=True)

In [54]:
x_train.shape

(816, 186)

### Imputation

In [0]:
# simple median-value impute
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
x_train_imp = pd.DataFrame(imp_mean.fit_transform(x_train),
                           index=x_train.index, columns=x_train.columns)
x_test_imp = pd.DataFrame(imp_mean.fit_transform(x_test),
                          index=x_test.index, columns=x_test.columns)

In [0]:
# Use only common columns of the training and test set
common_columns = set(x_train_imp.columns) & set(x_test_imp.columns)
x_train_imp = x_train_imp[list(common_columns)]
x_test_imp = x_test_imp[list(common_columns)]

In [57]:
x_train_imp.head()

Unnamed: 0,COL12A1_mut_mut,NF1_mut_mut,PKHD1L1_mut_mut,MYH1_mut_mut,pathologic_T_T2,pathologic_T_TX,ABCA13_mut_mut,PEG3_mut_mut,FMO4,WSCD2,GSTM2,PCDH15_mut_mut,TEF,SLC9A9,DNAH8_mut_mut,FAT4_mut_mut,ITGA9.AS1,MUC16_mut_mut,FAM117A,ASPM_mut_mut,ZFHX4_mut_mut,pathologic_M_M1,CACNA1E_mut_mut,CNTNAP5_mut_mut,DNAJC10,ANKRD30A_mut_mut,BRINP3_mut_mut,CUBN_mut_mut,FAM135B_mut_mut,DNAH9_mut_mut,KMT2D_mut_mut,RP1L1_mut_mut,TM9SF1,COL6A3_mut_mut,LRP2_mut_mut,PXDNL_mut_mut,UGGT2,pathologic_T_T4,PAPPA2_mut_mut,PVR,...,ERICH3_mut_mut,ZNF804A_mut_mut,SSPO_mut_mut,CBX7,KLRG2,HMCN1_mut_mut,FBXO4,FAT1_mut_mut,SDR42E1,ASTN1_mut_mut,diagnosis_Lung Squamous Cell Carcinoma,APOB_mut_mut,MUC17_mut_mut,TUBB3,LRRC7_mut_mut,ZNF804B_mut_mut,TTN_mut_mut,SEMA4A,SI_mut_mut,DNAH11_mut_mut,pathologic_T_T2b,COL22A1_mut_mut,CRB1_mut_mut,AHNAK_mut_mut,RELN_mut_mut,TP53I3,CPS1_mut_mut,THSD7B_mut_mut,CNTNAP2_mut_mut,SMARCA1,ADGRB3_mut_mut,residual_tumor_R1,SYNPR.AS1,pathologic_T_T3,PKHD1_mut_mut,SORCS1_mut_mut,CTNNA2_mut_mut,ADAMTS6,LINC00240,FBN2_mut_mut
TRAIN_0000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.3281,-1.0485,3.7048,0.0,4.7846,3.2313,0.0,0.0,0.6779,0.0,4.2708,0.0,0.0,0.0,0.0,0.0,6.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.1285,0.0,0.0,0.0,4.1685,0.0,0.0,5.4583,...,0.0,0.0,0.0,4.4531,1.5588,0.0,3.6161,0.0,3.8576,0.0,0.0,0.0,0.0,2.0568,0.0,0.0,1.0,5.7897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.775,0.0,0.0,0.0,5.7323,0.0,0.0,-0.9309,0.0,0.0,0.0,0.0,0.4908,0.4716,0.0
TRAIN_0001,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.5635,-0.4596,3.7394,0.0,5.3249,2.6152,0.0,1.0,0.8189,1.0,4.8476,0.0,0.0,0.0,0.0,0.0,6.8538,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.267,0.0,0.0,0.0,4.3585,0.0,0.0,5.2532,...,0.0,0.0,0.0,4.739,3.2756,1.0,3.0205,0.0,5.0105,0.0,0.0,1.0,0.0,3.7533,0.0,0.0,1.0,5.2575,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.9191,0.0,0.0,0.0,4.8155,0.0,0.0,0.1583,0.0,0.0,0.0,0.0,-0.928,0.2831,0.0
TRAIN_0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6926,-3.0694,2.4934,1.0,4.5081,2.4729,0.0,0.0,-0.4684,1.0,4.2447,0.0,0.0,0.0,0.0,0.0,7.4083,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.4719,0.0,0.0,0.0,4.1211,0.0,0.0,7.7549,...,0.0,0.0,0.0,3.9026,1.4873,0.0,3.7961,0.0,4.1665,0.0,0.0,0.0,0.0,5.2529,0.0,1.0,0.0,5.0178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2048,0.0,0.0,0.0,6.6989,0.0,0.0,-0.7864,1.0,0.0,0.0,1.0,0.9003,-1.7668,1.0
TRAIN_0003,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.8261,2.4938,3.8794,1.0,5.1188,4.331,1.0,1.0,1.0139,1.0,3.9027,1.0,1.0,0.0,0.0,1.0,7.903,1.0,0.0,0.0,1.0,1.0,0.0,1.0,3.2472,1.0,0.0,1.0,5.3453,0.0,1.0,6.3064,...,1.0,1.0,0.0,4.7353,0.2241,1.0,3.2233,0.0,4.1198,1.0,0.0,1.0,1.0,3.1064,1.0,0.0,1.0,5.7528,1.0,1.0,0.0,1.0,1.0,0.0,1.0,4.5887,1.0,0.0,1.0,5.5111,1.0,0.0,-7.5897,0.0,1.0,1.0,0.0,1.0721,-0.1385,0.0
TRAIN_0004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.3102,-0.423,4.1354,0.0,4.2025,2.9215,0.0,0.0,1.3216,0.0,4.1461,0.0,0.0,0.0,0.0,0.0,6.2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7581,0.0,0.0,0.0,3.7734,0.0,1.0,5.1751,...,0.0,1.0,1.0,5.1509,1.4788,0.0,3.2618,0.0,4.2823,0.0,0.0,0.0,0.0,3.3272,0.0,0.0,0.0,5.8313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.8293,0.0,0.0,0.0,7.171,0.0,0.0,0.924,0.0,0.0,0.0,0.0,0.0109,0.4584,0.0


##Random Forest

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
def score_submission(df, pfs_test):

    df_ = df.merge(pfs_test, how="right")
    
    from lifelines.utils import concordance_index
    return concordance_index(df_.time_to_progression_or_censor,
                             df_.risk_score, df_.progression)

###Optimization of model parameters

In [0]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold

In [0]:
# kfold validation

def rf_cv_out(max_depth, min_samples_split, min_samples_leaf, max_features_ratio):

    if min_samples_split < min_samples_leaf:
        return -1

    rf_params = {
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split), 
        'min_samples_leaf': int(min_samples_leaf),
        'max_features': max_features_ratio
    }

    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    cv_scores = []
    for idx_train, idx_test in kf.split(x_train_imp, pfs_train):
        # train model
        rf = RandomForestRegressor(n_estimators=300, random_state=1, n_jobs=-1,
                                   **rf_params)
        rfmodel = rf.fit(x_train_imp.iloc[idx_train], 
                         pfs_train.time_to_progression_or_censor.values[idx_train],
                         sample_weight[idx_train])

        # test model
        fold_test_pred = -rfmodel.predict(x_train_imp.iloc[idx_test])
        fold_test_pred = pd.DataFrame({
            "patient_id": x_train_imp.iloc[idx_test].index.values,
            "risk_score": fold_test_pred
        })
        fold_test_target = pfs_train.loc[x_train_imp.iloc[idx_test].index.values]
        fold_test_target.reset_index(level=0, inplace=True)
        cv_scores.append(score_submission(fold_test_pred, fold_test_target))

    return -np.max(cv_scores)

In [62]:
# Bayesian optimization

weight = 0.5
sample_weight = np.where(pfs_train.progression == 1, 1, weight)

bo_param_bounds = {
    "max_depth": (1, 18),
    "min_samples_split": (2, 10),
    "min_samples_leaf": (1, 10),
    "max_features_ratio": (0.2, 1)}
    
BO = BayesianOptimization(rf_cv_out, bo_param_bounds)
BO.maximize(init_points=20, n_iter=80)

print(BO.max)

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.484   [0m | [0m 11.23   [0m | [0m 0.265   [0m | [0m 1.082   [0m | [0m 4.58    [0m |
| [95m 2       [0m | [95m-0.4797  [0m | [95m 5.174   [0m | [95m 0.917   [0m | [95m 2.541   [0m | [95m 8.295   [0m |
| [0m 3       [0m | [0m-1.0     [0m | [0m 4.497   [0m | [0m 0.4994  [0m | [0m 3.193   [0m | [0m 2.798   [0m |
| [95m 4       [0m | [95m-0.4756  [0m | [95m 11.49   [0m | [95m 0.7983  [0m | [95m 4.126   [0m | [95m 5.849   [0m |
| [0m 5       [0m | [0m-1.0     [0m | [0m 4.455   [0m | [0m 0.9295  [0m | [0m 8.279   [0m | [0m 2.372   [0m |
| [0m 6       [0m | [0m-1.0     [0m | [0m 8.994   [0m | [0m 0.8085  [0m | [0m 3.566   [0m | [0m 3.042   [0m |
| [0m 7       [0m | [0m-0.4781  [0m | [0m 4.559   [0m | [0m 0.7876  [0m | [0m 2.738   [0m | [0m 7

In [63]:
# choose weight

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, random_state=1, shuffle=True)

for weight in np.linspace(0.1, 1, 10):
    print("weight:", weight)
    sample_weight = np.where(pfs_train.progression == 1, 1, weight)
    fold_scores = []
    for idx_train, idx_test in kf.split(x_train_imp, pfs_train):
        # train model for this fold
        rf = RandomForestRegressor(n_estimators=300, max_depth=2, 
                                min_samples_split=10, min_samples_leaf=6, 
                                max_features=0.2,
                                random_state=2, oob_score=True, n_jobs=-1)
        rfmodel = rf.fit(x_train_imp.iloc[idx_train], 
                        pfs_train.time_to_progression_or_censor.values[idx_train], 
                        sample_weight[idx_train])

        # test model
        fold_test_pred = -rfmodel.predict(x_train_imp.iloc[idx_test])
        fold_test_pred = pd.DataFrame({
            "patient_id": x_train_imp.iloc[idx_test].index.values,
            "risk_score": fold_test_pred
        })
        fold_test_target = pfs_train.loc[x_train_imp.iloc[idx_test].index.values]
        fold_test_target.reset_index(level=0, inplace=True)
        fold_scores.append(score_submission(fold_test_pred, fold_test_target))

    print('fold scores:', fold_scores)
    print('  => CV final score =', np.mean(fold_scores), '+/-', np.std(fold_scores))

weight: 0.1
fold scores: [0.3322422258592471, 0.3201951219512195, 0.4467052454732126, 0.37249231881438644, 0.3568513119533528]
  => CV final score = 0.36569724481028365 +/- 0.04444412493891907
weight: 0.2
fold scores: [0.32533187852336787, 0.31102439024390244, 0.4420384543587829, 0.36363636363636365, 0.3505344995140913]
  => CV final score = 0.35851311725530166 +/- 0.04565646006807883
weight: 0.30000000000000004
fold scores: [0.32860520094562645, 0.31258536585365854, 0.4403584095575882, 0.35532260979577085, 0.36239067055393587]
  => CV final score = 0.35985245134131605 +/- 0.044087956081695745
weight: 0.4
fold scores: [0.3347881432987816, 0.31102439024390244, 0.4405450812021654, 0.35875655159949393, 0.367444120505345]
  => CV final score = 0.3625116573699377 +/- 0.043697087215215515
weight: 0.5
fold scores: [0.3393344244408074, 0.3147317073170732, 0.4384916931118163, 0.36110609072835714, 0.36472303206997086]
  => CV final score = 0.363677389533605 +/- 0.04145634987044414
weight: 0.6
fo

In [0]:
del rf
del rfmodel

In [0]:
# use selected parameters to train the model

weight = 0.3
sample_weight = np.where(pfs_train.progression == 1, 1, weight)

rf_params = {
    'max_depth': 2,
    'min_samples_split': 10, 
    'min_samples_leaf': 6,
    'max_features': 0.2
}
rf = RandomForestRegressor(n_estimators=300,
                           random_state=1, oob_score=True, n_jobs=-1,
                           **rf_params)

rfmodel = rf.fit(x_train_imp, pfs_train.time_to_progression_or_censor.values, sample_weight)

In [66]:
rfmodel.feature_importances_

array([0.        , 0.00079443, 0.        , 0.        , 0.00295843,
       0.        , 0.        , 0.        , 0.01767519, 0.02375813,
       0.00456397, 0.00069825, 0.01138065, 0.01475188, 0.        ,
       0.        , 0.01381711, 0.        , 0.00800993, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00628933,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.0091871 , 0.0008206 , 0.        ,
       0.        , 0.01944004, 0.        , 0.        , 0.01746703,
       0.00042821, 0.        , 0.02418615, 0.00273806, 0.01547957,
       0.00121644, 0.        , 0.        , 0.        , 0.01566409,
       0.        , 0.04435244, 0.        , 0.04792015, 0.        ,
       0.00750811, 0.        , 0.        , 0.01504576, 0.        ,
       0.03314603, 0.00100759, 0.00381052, 0.        , 0.        ,
       0.00711649, 0.00957548, 0.04061573, 0.        , 0.00445907,
       0.00699041, 0.        , 0.02107452, 0.        , 0.     

In [67]:
# Use the forest's predict method on the test data
pred = -rfmodel.predict(x_test_imp)
print(pred[:10])

[-795.36710433 -634.36800865 -734.61150457 -695.66593854 -770.6791034
 -649.67888865 -695.74220434 -617.54619222 -637.76481039 -631.34114941]


In [68]:
pred_df = pd.DataFrame({
    "patient_id": x_test_imp.index.values,
    "risk_score": pred
})

pred_df.head(10)

Unnamed: 0,patient_id,risk_score
0,TEST_0000,-795.367104
1,TEST_0001,-634.368009
2,TEST_0002,-734.611505
3,TEST_0003,-695.665939
4,TEST_0004,-770.679103
5,TEST_0005,-649.678889
6,TEST_0006,-695.742204
7,TEST_0007,-617.546192
8,TEST_0008,-637.76481
9,TEST_0009,-631.341149
