In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.7.3


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib, matplotlib.pyplot as plt

In [None]:
def discretize(df, bins=200):
    """Return a copy of ``df`` with every column cut into equal-width bins.

    Each column is binned independently with ``pd.cut`` over that column's
    own value range, and each value is replaced by the integer label
    (``0 .. bins - 1``) of the bin it falls into. The input frame is not
    modified. A progress message is printed before binning starts.
    """
    print(f"Discretizing features into {bins} bins")
    bin_labels = range(bins)
    binned = df.copy()
    for column in df.columns:
        # include_lowest=True keeps the column minimum inside the first bin.
        binned[column] = pd.cut(
            df[column],
            bins=bins,
            labels=bin_labels,
            include_lowest=True,
        )
    return binned


def normalize(df):
    """Return a copy of ``df`` with each column min-max scaled to [0, 1].

    Columns whose minimum equals their maximum (constant columns) are left
    exactly as they are, which avoids a division by zero. The input frame is
    not modified.
    """
    scaled = df.copy()
    for column in df.columns:
        values = df[column]
        upper = values.max()
        lower = values.min()
        if upper == lower:
            # Constant column: nothing to rescale.
            continue
        scaled[column] = (values - lower) / (upper - lower)
    return scaled

def load_data(task = 'PAM50', subset='all', discrete=False, bins=200):
    """Read one of the MBdata CSV files and return ``(features, target)``.

    Rows whose target column holds the placeholder ``'?'`` are dropped, the
    target column is removed from the frame, and every column whose name
    starts with ``GE`` is kept as a float64 feature.

    Parameters
    ----------
    task : {'PAM50', 'DR', 'ER', 'iC10'}
        Selects the target column and its label encoding:

        * ``'DR'``    -> column ``DR``, returned as read (no mapping applied).
        * ``'ER'``    -> column ``ER_Status``; ``'pos'`` -> 0, ``'neg'`` -> 1.
        * ``'iC10'``  -> column ``iC10``; ``'4ER+'`` -> 0, ``'4ER-'`` -> 4,
          every other value is converted with ``int``.
        * ``'PAM50'`` -> column ``Pam50Subtype``; Normal, LumA, LumB, Basal,
          Her2 -> 0, 1, 2, 3, 4.
    subset : {'all', 'original'}
        Which CSV file to read (paths are hard-coded below).
    discrete : bool
        If true, features are binned with ``discretize``; otherwise they are
        min-max scaled with ``normalize`` and any inf/NaN is replaced by 0.
    bins : int
        Number of bins per feature; only used when ``discrete`` is true.

    Returns
    -------
    (features, target)
        ``features`` is a DataFrame of the ``GE*`` columns, ``target`` the
        Series of (encoded) labels for the same rows.

    Raises
    ------
    ValueError
        If ``task`` or ``subset`` is not one of the supported values. This is
        checked before the file is read, so a typo fails immediately instead
        of surfacing later as an unbound local variable.
    KeyError
        If an ``ER`` or ``PAM50`` target holds a label missing from its map.
    """
    csv_paths = {
        'all': "/datasets/mbdata/MBdata_all.csv",
        'original': "./MBdata_original.csv",
    }
    target_columns = {
        'DR': 'DR',
        'ER': 'ER_Status',
        'iC10': 'iC10',
        'PAM50': 'Pam50Subtype',
    }
    if subset not in csv_paths:
        raise ValueError(
            f"Unknown subset {subset!r}; expected one of {sorted(csv_paths)}"
        )
    if task not in target_columns:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {sorted(target_columns)}"
        )

    df = pd.read_csv(csv_paths[subset])

    # Drop rows with an unknown ('?') target, then split the target off.
    target_column = target_columns[task]
    df = df[df[target_column] != '?']
    target = df.pop(target_column)

    if task == 'ER':
        er_labels = {
            'pos': 0,
            'neg': 1
        }
        target = target.apply(lambda x: er_labels[x])
    elif task == 'iC10':
        # Only the two '4ER' sub-clusters need renaming; the remaining
        # cluster names are converted with int().
        ic10_labels = {
            '4ER-': 4,
            '4ER+': 0
        }
        target = target.apply(lambda x: ic10_labels[x] if x in ic10_labels else int(x))
    elif task == 'PAM50':
        pam50_labels = {
            'Normal': 0,
            'LumA': 1,
            'LumB': 2,
            'Basal': 3,
            'Her2': 4
        }
        target = target.apply(lambda x: pam50_labels[x])
    # task == 'DR': the column is used as-is.
    # NOTE(review): no int conversion is applied to DR here; confirm the
    # column is already numeric for callers that index with it.

    features = df.filter(regex='^GE.*')
    features = features.astype('float64')

    if discrete:
        features = discretize(features, bins)
    else:
        features = normalize(features)
        features = features.replace([np.inf, -np.inf, np.nan], 0)
    return features, target

## Experiments

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn import metrics
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold, train_test_split
import random

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 'DR' task from the 'original' subset with every feature cut into 200 bins.
features, target = load_data(task='DR', subset='original', discrete=True, bins=200)

Discretizing features into 200 bins


In [None]:
import sklearn.svm

# RBF-SVM on the discretized 'DR' task: for every bin count and every C,
# run 5-fold stratified cross-validation and report per-fold accuracy plus
# mean +- std over the five folds of that C.
for bins in [200,100,10,20]:
    features, target = load_data(task='DR', subset='original', discrete=True, bins=bins)
    # A fixed seed makes kf.split() return the same folds on every call, so
    # all C values are compared on identical splits and the run is repeatable.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    x_train = features.to_numpy()
    y_train = target.to_numpy()

    for C in [0.01, 0.1, 1, 100]:
        # Reset per C: otherwise the summary line below would also average
        # the folds of every previously tried C.
        acc = []
        for train, valid in kf.split(x_train, y_train):
            svm = sklearn.svm.SVC(kernel='rbf', C=C)
            svm.fit(x_train[train], y_train[train])
            y_pred = svm.predict(x_train[valid])
            acc.append(metrics.accuracy_score(y_train[valid], y_pred))
            print(f"C={C}: {acc[-1]:.3f}")

        print(f"C={C}: {np.mean(acc):.3f}+-{np.std(acc):.3f}")

Discretizing features into 200 bins
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.694
C=0.01: 0.694
C=0.01: 0.696+-0.001
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.694
C=0.1: 0.694
C=0.1: 0.696+-0.001
C=1: 0.699
C=1: 0.692
C=1: 0.697
C=1: 0.694
C=1: 0.692
C=1: 0.696+-0.002
C=100: 0.677
C=100: 0.672
C=100: 0.692
C=100: 0.679
C=100: 0.659
C=100: 0.691+-0.010
Discretizing features into 100 bins
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.694
C=0.01: 0.694
C=0.01: 0.696+-0.001
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.694
C=0.1: 0.694
C=0.1: 0.696+-0.001
C=1: 0.702
C=1: 0.694
C=1: 0.694
C=1: 0.697
C=1: 0.697
C=1: 0.696+-0.002
C=100: 0.664
C=100: 0.634
C=100: 0.682
C=100: 0.689
C=100: 0.657
C=100: 0.689+-0.017
Discretizing features into 10 bins
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.697
C=0.01: 0.694
C=0.01: 0.694
C=0.01: 0.696+-0.001
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.697
C=0.1: 0.694
C=0.1: 0.694
C=0.1: 0.696+-0.001
C=1: 0.692
C=1: 0.697
C=1: 0.697
C=1: 0.689
C=1: 

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Nested cross-validation on the discretized 'DR' task: the outer 5 folds
# estimate accuracy, the inner GridSearchCV (5-fold) picks C on the outer
# training part only.

# Candidate C values for the inner search (loop-invariant, so built once).
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}

for bins in [200,100,10,20]:
    features, target = load_data(task='DR', subset='original', discrete=True, bins=bins)
    acc_all = []
    # Seeded so the outer folds, and therefore the reported numbers, are repeatable.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    x_train = features.to_numpy()
    y_train = target.to_numpy()

    for train, valid in kf.split(x_train, y_train):
        classifier = GridSearchCV(SVC(gamma="scale"), params, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

        classifier.fit(x_train[train], y_train[train])
        Y_pred = classifier.predict(x_train[valid])
        acc = metrics.accuracy_score(y_train[valid], Y_pred)
        acc_all.append(acc)
        print(f"{acc:.3f}")

    # Summarise over all outer folds. Using the scalar `acc` here would just
    # echo the last fold and always report a standard deviation of 0.
    print(f"{np.mean(acc_all):.3f}+-{np.std(acc_all):.3f}")

Discretizing features into 200 bins
0.689
0.697
0.697
0.697
0.694
0.694+-0.000
Discretizing features into 100 bins
0.697
0.697
0.697
0.694
0.694
0.694+-0.000
Discretizing features into 10 bins
0.699
0.697
0.697
0.679
0.694
0.694+-0.000
Discretizing features into 20 bins
0.702
0.697
0.692
0.694
0.699
0.699+-0.000


In [None]:
import sklearn.svm

# Random forest on the min-max-normalized 'PAM50' task, scored with 5-fold
# stratified cross-validation (per-fold accuracy, then mean +- std).
# `bins` is not passed: load_data only uses it when discrete=True.
features, target = load_data(task='PAM50', subset='original', discrete=False)
acc = []
# Seed both the fold shuffle and the forest so the reported accuracy can be
# reproduced on a re-run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

x_train = features.to_numpy()
y_train = target.to_numpy()

for train, valid in kf.split(x_train, y_train):
    rf = RandomForestClassifier(random_state=0)
    rf.fit(x_train[train], y_train[train])
    y_pred = rf.predict(x_train[valid])
    acc.append(metrics.accuracy_score(y_train[valid], y_pred))
    print(f" {acc[-1]:.3f}")

print(f"{np.mean(acc):.3f}+-{np.std(acc):.3f}")

 0.744
 0.754
 0.722
 0.734
 0.744
0.740+-0.011


In [None]:
# Random forest on the min-max-normalized 'PAM50' task, scored on five
# seeded stratified hold-out splits (1500 train / 400 test rows each).
# `bins` is deliberately not passed: load_data ignores it when
# discrete=False, and reading it here would depend on a variable left over
# from an earlier cell's loop.
features, target = load_data(task='PAM50', subset='original', discrete=False)
labels = target.to_numpy()
acc = []

for seed in range(5):
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        stratify=labels,
                                                        train_size=1500,
                                                        test_size=400,
                                                        shuffle=True,
                                                        random_state=seed
                                                        )
    # Seed the forest with the split seed so each repetition is reproducible.
    rf = RandomForestClassifier(random_state=seed)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    score = metrics.accuracy_score(y_test, y_pred)
    print(score)
    acc.append(score)

print(f"{np.mean(acc):.3f}+-{np.std(acc):.3f}")

0.69
0.6775
0.6975
0.7
0.675
0.688+-0.010
0.685
0.6925
0.6875
0.6975
0.6975
0.692+-0.005
0.7025
0.6925
0.6925
0.685
0.6925
0.693+-0.006
0.685
0.685
0.6875
0.7025
0.6925
0.691+-0.007


In [None]:
# Small dense network on the discretized 'DR' task. For every bin count,
# train on five seeded stratified hold-out splits (1000 train / 400 test
# rows) and report mean +- std test accuracy.
#
# Note: an earlier variant (removed here as dead code) restricted the
# features to the top-n genes ranked by embedding distance to 'FOXA1' via a
# `load_gene_embeddings` helper that is not defined in the visible notebook.

# Width of the one-hot targets and of the softmax output layer.
# NOTE(review): 11 looks inherited from a task with more classes than 'DR' - confirm.
N_OUTPUTS = 11

for bins in [200,100,10,20]:
    features, target = load_data(task='DR', subset='original', discrete=True, bins=bins)
    acc = []
    features_local = features.copy().astype('float64')

    for seed in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            features_local.to_numpy(),
            target.to_numpy(),
            stratify=target.to_numpy(),
            train_size=1000,
            test_size=400,
            shuffle=True,
            random_state=seed,
        )

        # One-hot encode the integer labels: row i of the identity matrix
        # is the encoding of class i.
        identity = np.eye(N_OUTPUTS)
        y_train = identity[y_train]
        y_test = identity[y_test]

        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(256, activation='relu'))
        model.add(tf.keras.layers.Dense(64, activation='relu'))
        model.add(tf.keras.layers.Dense(N_OUTPUTS, activation='softmax'))

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )

        model.fit(X_train, y_train, epochs=50,  verbose=0)
        # Prints the per-split loss/accuracy line seen in the cell output.
        model.evaluate(X_test, y_test, verbose=2)

        predicted_classes = np.argmax(model.predict(X_test), axis=1)
        true_classes = np.argmax(y_test, axis=1)
        acc.append(metrics.accuracy_score(true_classes, predicted_classes))

    print(f"{np.mean(acc):.3f}+-{np.std(acc):.3f}")

Discretizing features into 200 bins
400/400 - 0s - loss: 4.8633 - accuracy: 0.5925
400/400 - 0s - loss: 3.0973 - accuracy: 0.6100
400/400 - 0s - loss: 5.3467 - accuracy: 0.5900
400/400 - 0s - loss: 8.7673 - accuracy: 0.6000
400/400 - 0s - loss: 3.8607 - accuracy: 0.5950
0.597+-0.007
Discretizing features into 100 bins
400/400 - 0s - loss: 2.2859 - accuracy: 0.6100
400/400 - 0s - loss: 4.0681 - accuracy: 0.6225
400/400 - 0s - loss: 2.4775 - accuracy: 0.6300
400/400 - 0s - loss: 2.4045 - accuracy: 0.6325
400/400 - 0s - loss: 2.3962 - accuracy: 0.6150
0.622+-0.009
Discretizing features into 10 bins
400/400 - 0s - loss: 1.2557 - accuracy: 0.6400
400/400 - 0s - loss: 1.1428 - accuracy: 0.6275
400/400 - 0s - loss: 1.1503 - accuracy: 0.6700
400/400 - 0s - loss: 1.1213 - accuracy: 0.6825
400/400 - 0s - loss: 1.0796 - accuracy: 0.7075
0.665+-0.029
Discretizing features into 20 bins
400/400 - 0s - loss: 1.2637 - accuracy: 0.6625
400/400 - 0s - loss: 1.2577 - accuracy: 0.6400
400/400 - 0s - loss: