In [1]:
# Nicholas Nuechterlein
import sys
sys.path.append('../')

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook as tqdm
from sklearn.decomposition import PCA

from sklearn.metrics import roc_curve, auc, roc_auc_score

%load_ext autoreload
%autoreload 2

# Genomic Survival Predictions

## Data

In [2]:
metadata_df = pd.read_csv('../data/all_glioma_metadata_542x30.csv', index_col=0)
metadata_df = metadata_df.set_index('tciaID')
train_idx = metadata_df[metadata_df['phase'] == 'train'].index
val_idx = metadata_df[metadata_df['phase'] == 'val'].index

cna_full_df = pd.read_csv('../data/gistic-subtype-OS-1087-21546.csv', index_col=0)
survival_df = cna_full_df['OS']
cna_full_df = cna_full_df.drop(columns=['OS', 'subtype'])

# Survival predictions using extra/unlabeled data

In [3]:
from tqdm import tqdm_notebook as tqdm
from sklearn.decomposition import PCA

X_train = cna_full_df[~cna_full_df.index.isin(val_idx)]
X_val = cna_full_df[cna_full_df.index.isin(val_idx)]

print(X_train.shape[0], 'training samples and', X_val.shape[0], 'validation samples')

# loop through PCA projections with different numbers of components in [2, 50]
for n_components in tqdm([i for i in range(2, 50)]):
    # PCA on the entire dataset (1087 - 75 (val)) | of the 
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_val)
    
    # make X_train back into a dataframe to make sure the labels line up with the samples
    pca_columns = ['pca'+str(i) for i in range(n_components)]
    X_train_pca_df = pd.DataFrame(data=X_train_pca, columns=pca_columns, index=X_train.index)
    X_train_pca_df = X_train_pca_df[X_train_pca_df.index.isin(survival_df.index)]
    train_pca_data_df = pd.concat([survival_df, X_train_pca_df], axis=1, join='inner').dropna()
    
    # do the same with the validation data
    X_val_pca_df = pd.DataFrame(data=X_valid_pca, columns=pca_columns, index=X_val.index)
    val_pca_data_df = pd.concat([survival_df, X_val_pca_df], axis=1, join='inner')
    
    ###########################################
    ###### training and valiation data! #######
    ###########################################
    X_train_pca = train_pca_data_df[pca_columns]
    y_os_train = train_pca_data_df['OS']
    
    X_val_pca = val_pca_data_df[pca_columns]
    y_os_val = val_pca_data_df['OS']
    ############################################

1012 training samples and 75 validation samples


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




## Genomic predictions without unlabled data

In [4]:
X_train = cna_full_df[cna_full_df.index.isin(train_idx)]
X_val = cna_full_df[cna_full_df.index.isin(val_idx)]

print(X_train.shape[0], 'training samples and', X_val.shape[0], 'validation samples')

# loop through PCA projections with different numbers of components in [2, 50]
for n_components in tqdm([i for i in range(2, 50)]):
    # PCA on the entire dataset (1087 - 75 (val)) | of the 
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_val)
    
    # make X_train back into a dataframe to make sure the labels line up with the samples
    pca_columns = ['pca'+str(i) for i in range(n_components)]
    X_train_pca_df = pd.DataFrame(data=X_train_pca, columns=pca_columns, index=X_train.index)
    X_train_pca_df = X_train_pca_df[X_train_pca_df.index.isin(survival_df.index)]
    train_pca_data_df = pd.concat([survival_df, X_train_pca_df], axis=1, join='inner')
    
    # do the same with the validation data
    X_val_pca_df = pd.DataFrame(data=X_valid_pca, columns=pca_columns, index=X_val.index)
    val_pca_data_df = pd.concat([survival_df, X_val_pca_df], axis=1, join='inner')
    
    ###########################################
    ###### training and valiation data! #######
    ###########################################
    X_train_pca = train_pca_data_df[pca_columns]
    y_os_train = train_pca_data_df['OS']
    
    X_val_pca = val_pca_data_df[pca_columns]
    y_os_val = val_pca_data_df['OS']
    ############################################

160 training samples and 75 validation samples


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




# Genomic IDH predictions

## Functions

In [5]:
def class_accuracies(preds, labels, probs=None, classes=['wildtype', 'oligo', 'mutant'], verbose=True):
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    probs = np.asarray(probs)
    
    class_acc_list = []
    for i, class_name in enumerate(classes):
        class_labels = labels[labels == i]
        class_preds = preds[labels == i]
        class_acc = np.sum(class_labels == class_preds)/class_labels.shape[0]
        class_acc_list.append(class_acc)
        if verbose:
            print(class_name, 'acc:\t', class_acc)
    accuracy = np.sum(preds==labels)/labels.shape[0]
    average_acc = np.mean(class_acc_list)
    if verbose:
        print('Average acc:\t\t', np.mean(class_acc_list))
        print('Overall acc:\t\t', np.sum(preds==labels)/labels.shape[0])
    
    if len(classes) == 2 and probs is not None:
        auc_score = roc_auc_score(labels, probs)
        if verbose:
            print('AUC:\t', auc_score)
    else:
        auc_score = None
    return accuracy, average_acc, auc_score
        
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

def lasso_regression(X_train, y_train, X_valid):
    model = Lasso(alpha=0.01)
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_valid)
    # probs_valid = model.predict_proba(X_valid)  ## Lasso does not have a predict_proba method
    probs_valid = model.predict(X_valid)
    return preds_valid, probs_valid

def logistic_regression(X_train, y_train, X_valid):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_valid)
    probs_valid = model.predict_proba(X_valid)
    return preds_valid, probs_valid

## Data

In [6]:
metadata_df = pd.read_csv('../data/all_glioma_metadata_542x30.csv', index_col=0)
metadata_df = metadata_df.set_index('tciaID')
train_idx = metadata_df[metadata_df['phase'] == 'train'].index
val_idx = metadata_df[metadata_df['phase'] == 'val'].index

cna_full_df = pd.read_csv('../data/gistic-subtype-OS-1087-21546.csv', index_col=0)
survival_df = cna_full_df['OS']
cna_full_df = cna_full_df.drop(columns=['OS', 'subtype'])

idh_df = pd.read_csv('../data/idh_825x2.csv', index_col=0)
idh_df = idh_df.sum(axis=1)
idh_df[idh_df > 0] = 1

## Genomic predictions with unlabeled data

In [7]:
from tqdm import tqdm_notebook as tqdm
from sklearn.decomposition import PCA

X_train = cna_full_df[~cna_full_df.index.isin(val_idx)]
X_val = cna_full_df[cna_full_df.index.isin(val_idx)]
X_val = X_val[X_val.index.isin(idh_df.index)]

avg_acc_dict, auc_dict = {}, {}
best_avg_acc = 0

print(X_train.shape[0], 'samples')
for n_components in tqdm([i for i in range(2, 50)]):
    # PCA on the entire dataset (1087 - 75 (val)) | of the 
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_val)
    
    # make X_train back into a dataframe to make sure the labels line up with the samples
    pca_columns = ['pca'+str(i) for i in range(n_components)]
    X_train_pca_df = pd.DataFrame(data=X_train_pca, columns=pca_columns, index=X_train.index)
    X_train_pca_df = X_train_pca_df[X_train_pca_df.index.isin(idh_df.index)]
    
    # join the labels back with the training data (again, to make sure they line up)
    train_pca_data_df = pd.concat([idh_df, X_train_pca_df], axis=1, join='inner')
    
    # # logistic regression + L2 
    preds_lr, probs_lr = logistic_regression(X_train=train_pca_data_df[pca_columns], 
                                             y_train=train_pca_data_df[0], 
                                             X_valid=X_valid_pca)
    
    X_val_pca_df = pd.DataFrame(data=X_valid_pca, columns=pca_columns, index=X_val.index)
    X_val_pca_df = X_val_pca_df[X_val_pca_df.index.isin(idh_df.index)]
    val_pca_data_df = pd.concat([idh_df, X_val_pca_df], axis=1, join='inner')

    accuracy, average_acc, auc_score = class_accuracies(preds=preds_lr, 
                                                        labels=val_pca_data_df[0], 
                                                        probs=probs_lr[:,1], 
                                                        classes=['wildtype', 'mutant'],
                                                       verbose=False)
    if average_acc > best_avg_acc:
        print('best average_acc:', average_acc, '\t| dim:', n_components, '\t| AUC', auc_score)
        best_avg_acc = average_acc
    
    avg_acc_dict[n_components] = average_acc
    auc_dict[n_components] = auc_score    

1012 samples


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

best average_acc: 0.9131944444444444 	| dim: 2 	| AUC 0.9652777777777777
best average_acc: 0.931712962962963 	| dim: 3 	| AUC 0.9733796296296297
best average_acc: 0.947337962962963 	| dim: 8 	| AUC 0.9826388888888888


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
print('We pick the AUC with the highest accuracy')
print('AUC', auc_dict[25])
print('Average ACC', avg_acc_dict[25])

We pick the AUC with the highest accuracy
AUC 0.9606481481481481
Average ACC 0.947337962962963


## Genomic predictions without unlabled data

In [9]:
from tqdm import tqdm_notebook as tqdm

avg_acc_dict, auc_dict = {}, {}
best_avg_acc = 0
from sklearn.decomposition import PCA
X_train = cna_full_df[cna_full_df.index.isin(train_idx)]
X_val = cna_full_df[cna_full_df.index.isin(val_idx)]
X_val = X_val[X_val.index.isin(idh_df.index)]

print(X_train.shape[0], 'samples')
for n_components in tqdm([i for i in range(2, 50)]):
    # PCA on the train dataset (160) | of the 
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_valid_pca = pca.transform(X_val)
    
    # make X_train back into a dataframe to make sure the labels line up with the samples
    pca_columns = ['pca'+str(i) for i in range(n_components)]
    X_train_pca_df = pd.DataFrame(data=X_train_pca, columns=pca_columns, index=X_train.index)
    X_train_pca_df = X_train_pca_df[X_train_pca_df.index.isin(idh_df.index)]
    
    # join the labels back with the training data (again, to make sure they line up)
    train_pca_data_df = pd.concat([idh_df, X_train_pca_df], axis=1, join='inner')
    
    # logistic regression + L2
    preds_lr, probs_lr = logistic_regression(X_train=train_pca_data_df[pca_columns], 
                                             y_train=train_pca_data_df[0], 
                                             X_valid=X_valid_pca)
    
    X_val_pca_df = pd.DataFrame(data=X_valid_pca, columns=pca_columns, index=X_val.index)
    X_val_pca_df = X_val_pca_df[X_val_pca_df.index.isin(idh_df.index)]
    val_pca_data_df = pd.concat([idh_df, X_val_pca_df], axis=1, join='inner')

    accuracy, average_acc, auc_score = class_accuracies(preds=preds_lr, 
                                                        labels=val_pca_data_df[0], 
                                                        probs=probs_lr[:,1], 
                                                        classes=['wildtype', 'mutant'],
                                                       verbose=False)
    if average_acc > best_avg_acc:
        print('best average_acc:', average_acc, '\t| dim:', n_components, '\t| AUC', auc_score)
        best_avg_acc = average_acc
    
    avg_acc_dict[n_components] = average_acc
    auc_dict[n_components] = auc_score   

160 samples


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

best average_acc: 0.9131944444444444 	| dim: 2 	| AUC 0.96875
best average_acc: 0.916087962962963 	| dim: 3 	| AUC 0.9710648148148149


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


best average_acc: 0.9346064814814814 	| dim: 14 	| AUC 0.9699074074074074
best average_acc: 0.9658564814814814 	| dim: 15 	| AUC 0.9594907407407407


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
