# Read in the Encodings to a Pandas DataFrame

In [1]:
import pandas as pd

encodings = pd.read_csv("binary_encodings.csv")

In [2]:
encodings.head(20)

Unnamed: 0.1,Unnamed: 0,DonorIDs,CancerType,MU1899169,MU1957569,MU1957631,MU1957694,MU1957756,MU1957895,MU1957974,...,MU130696800,MU122201,MU129795540,MU129540995,MU4885648,MU4468,MU866,MU62030,MU131898417,MU131867962
0,0,DO48566BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,DO223588BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,DO51948BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,DO514BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,DO474BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,DO481BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,DO48576BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,DO219068BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,DO51906BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,DO48611BLCA-US,BLCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Construct a Dictionary for Mapping CancerTypes to Integer Labels

In [3]:
cancer_types = set(list(encodings['CancerType']))

In [4]:
cancer_types

{'BLCA-US',
 'BRCA-US',
 'COAD-US',
 'GBM-US',
 'KIRC-US',
 'LGG-US',
 'LUSC-US',
 'OV-US',
 'PRAD-US',
 'SKCM-US',
 'THCA-US',
 'UCEC-US'}

In [5]:
label_dict = {}
for i, ct in enumerate(cancer_types):
    label_dict[ct] = i
label_dict

{'KIRC-US': 0,
 'BRCA-US': 1,
 'PRAD-US': 2,
 'BLCA-US': 3,
 'COAD-US': 4,
 'LGG-US': 5,
 'THCA-US': 6,
 'SKCM-US': 7,
 'LUSC-US': 8,
 'GBM-US': 9,
 'UCEC-US': 10,
 'OV-US': 11}

In [6]:
mutation_labels = [label for label in list(encodings.columns) if label.startswith("MU")]

In [7]:
mutations = encodings[mutation_labels].to_numpy().tolist()

In [8]:
len(mutations[0])

20264

# Helper Function for Converting Labels to One-hot Encoding

In [9]:
import numpy as np
def to_onehot(label, nclass=12):
    result = np.zeros(nclass)
    result[label] = 1
    return result

# Construct a Labelled Dataset

In [10]:
label_list = [to_onehot(label_dict[cancer_type]) for cancer_type in list(encodings["CancerType"])]
data = list(zip(mutations, label_list))

In [12]:
np.random.shuffle(data)

# Define a Simple Dense Classifier

In [24]:
from tensorflow.keras.layers import Dense, Activation, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

X = Input((20264,))
H1 = Dense(500, name="Dense1")(X)
Z1 = Activation("relu", name="Activation1")(H1)
H2 = Dense(50, name="Dense2")(Z1)
Z2 = Activation("relu", name="Activation2")(H2)
Y = Dense(12, activation="softmax")(Z2)

classifier = Model(X, Y)
classifier.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 20264)             0         
_________________________________________________________________
Dense1 (Dense)               (None, 500)               10132500  
_________________________________________________________________
Activation1 (Activation)     (None, 500)               0         
_________________________________________________________________
Dense2 (Dense)               (None, 50)                25050     
_________________________________________________________________
Activation2 (Activation)     (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 12)                612       
Total params: 10,158,162
Trainable params: 10,158,162
Non-trainable params: 0
________________________________________________________________

# Separate Data into Training and Test Sets and Fit the Model

In [15]:
test = data[:500]
train = data[500:]

In [19]:
datax = np.array([datum for (datum, _) in train])
datay = np.array([label for (_, label) in train])
print(datax.shape)
print(datay.shape)

(2864, 20264)
(2864, 12)


In [28]:
classifier.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.00003), metrics=["accuracy"])
classifier.fit(x=datax, y=datay, batch_size=32, epochs=30, validation_split=0.2)

Train on 2291 samples, validate on 573 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f78d43d0050>

# The Model Fits Poorly, and Isn't Generalizing
## This is Likely due to the Sparsity of the Inputs
## To Remedy this, Attempt Dimensionality Reduction

In [29]:
from sklearn.decomposition import TruncatedSVD

In [36]:
mutation_counts = ([sum(mutations) for (mutations,_) in data])
mutation_counts.sort(reverse=True)
mutation_counts

[3011.0,
 2371.0,
 2302.0,
 1826.0,
 1608.0,
 1486.0,
 1354.0,
 1316.0,
 1212.0,
 1094.0,
 1058.0,
 912.0,
 858.0,
 857.0,
 847.0,
 833.0,
 805.0,
 797.0,
 794.0,
 793.0,
 781.0,
 739.0,
 724.0,
 707.0,
 701.0,
 636.0,
 616.0,
 612.0,
 595.0,
 507.0,
 504.0,
 500.0,
 493.0,
 467.0,
 424.0,
 405.0,
 379.0,
 377.0,
 373.0,
 360.0,
 335.0,
 334.0,
 325.0,
 320.0,
 315.0,
 309.0,
 309.0,
 301.0,
 286.0,
 268.0,
 255.0,
 249.0,
 238.0,
 232.0,
 228.0,
 225.0,
 222.0,
 219.0,
 215.0,
 201.0,
 200.0,
 196.0,
 193.0,
 193.0,
 192.0,
 192.0,
 191.0,
 183.0,
 180.0,
 172.0,
 161.0,
 150.0,
 149.0,
 147.0,
 133.0,
 132.0,
 131.0,
 127.0,
 118.0,
 115.0,
 115.0,
 114.0,
 113.0,
 112.0,
 111.0,
 110.0,
 99.0,
 97.0,
 95.0,
 88.0,
 82.0,
 79.0,
 79.0,
 78.0,
 78.0,
 78.0,
 77.0,
 77.0,
 76.0,
 76.0,
 74.0,
 72.0,
 72.0,
 70.0,
 69.0,
 69.0,
 69.0,
 68.0,
 68.0,
 67.0,
 67.0,
 66.0,
 66.0,
 66.0,
 65.0,
 65.0,
 65.0,
 65.0,
 64.0,
 64.0,
 64.0,
 63.0,
 63.0,
 63.0,
 63.0,
 63.0,
 63.0,
 62.0,
 61.0,


In [50]:
len([count for count in mutation_counts if count <= 5])

2736

In [37]:
reducer = TruncatedSVD(n_components=600, n_iter=20)
datax_reduced = reducer.fit_transform(datax)

In [38]:
datax_reduced.shape

(2864, 600)

array([-1.06321907e-16,  7.08110458e-15, -1.37715983e-14, -5.77786077e-15,
        8.50694020e-17, -6.34750459e-15,  6.49787539e-15, -5.18748666e-16,
        7.10284538e-16, -3.63236293e-15, -3.58220698e-17, -4.67911555e-15,
        2.91682588e-15, -8.68601461e-15, -7.80696517e-16, -2.16019127e-15,
        3.04772482e-16, -3.75592762e-15, -6.98088233e-16,  6.21993337e-15,
        1.23318062e-15,  3.40893183e-15, -2.99815846e-15, -2.23574201e-15,
        1.09566995e-15, -1.10762702e-15,  1.15703131e-15,  8.34078571e-17,
       -1.84128801e-15,  1.25641925e-15, -4.95288763e-17, -7.35997545e-16,
        2.53295538e-16,  4.18781484e-16, -1.06461956e-16, -3.79948705e-16,
       -8.39293917e-17, -3.84444637e-16,  1.08984548e-16,  1.42754163e-16,
       -6.25866551e-17,  1.18342096e-17, -2.65929669e-16, -1.55000128e-16,
        1.72385067e-17,  8.71517736e-17, -7.90493852e-17,  2.95088835e-17,
       -7.04557606e-18,  1.61771629e-18, -1.47622129e-16, -7.12296566e-17,
        8.66383020e-17, -

In [54]:
X = Input((600,))
H1 = Dense(300, name="Dense1")(X)
Z1 = Activation("relu", name="Activation1")(H1)
H2 = Dense(50, name="Dense2")(Z1)
Z2 = Activation("relu", name="Activation2")(H2)
Y = Dense(12, activation="softmax")(Z2)

classifier2 = Model(X, Y)
classifier2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 600)               0         
_________________________________________________________________
Dense1 (Dense)               (None, 300)               180300    
_________________________________________________________________
Activation1 (Activation)     (None, 300)               0         
_________________________________________________________________
Dense2 (Dense)               (None, 50)                15050     
_________________________________________________________________
Activation2 (Activation)     (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 12)                612       
Total params: 195,962
Trainable params: 195,962
Non-trainable params: 0
_________________________________________________________________


In [55]:
classifier2.compile(optimizer=Adam(lr=0.0003), loss="categorical_crossentropy", metrics=["accuracy"])
classifier2.fit(datax_reduced, datay, batch_size=32, epochs=100, validation_split=0.2)

Train on 2291 samples, validate on 573 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f765c334e90>

# The Model Trained on Reduced Inputs is Actually Worse...
## We Can Attempt a Different Encoding Scheme

In [76]:
def transform_encoding(mutations, d=3011):
    result = []
    for j, b in enumerate(mutations):
        if b == 1:
            result.append(j/len(mutations))
    while len(result) < 3011:
        result.append(-1)
    return result

In [77]:
data_transformed = [(transform_encoding(m),l) for (m,l) in data]

In [80]:
X = Input((3011,))
H1 = Dense(300, name="Dense1")(X)
Z1 = Activation("relu", name="Activation1")(H1)
H2 = Dense(50, name="Dense2")(Z1)
Z2 = Activation("relu", name="Activation2")(H2)
Y = Dense(12, activation="softmax")(Z2)

classifier3 = Model(X, Y)
classifier3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 3011)              0         
_________________________________________________________________
Dense1 (Dense)               (None, 300)               903600    
_________________________________________________________________
Activation1 (Activation)     (None, 300)               0         
_________________________________________________________________
Dense2 (Dense)               (None, 50)                15050     
_________________________________________________________________
Activation2 (Activation)     (None, 50)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 12)                612       
Total params: 919,262
Trainable params: 919,262
Non-trainable params: 0
_________________________________________________________________


In [81]:
trans_test = data_transformed[:500]
trans_train = data_transformed[500:]

xtrain = np.array([np.array(x) for (x,y) in trans_train])
ytrain = np.array([y for (x,y) in trans_train])
classifier3.compile(optimizer=Adam(lr=0.0003), loss="categorical_crossentropy", metrics=["accuracy"])
classifier3.fit(xtrain, ytrain, batch_size=32, epochs=150, validation_split=0.2)

Train on 2291 samples, validate on 573 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150


Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150


Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x7f765b125910>