In [1]:
from numpy.random import seed
seed(1)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *
from collections import namedtuple
import Metrics
from PatientSet import PatientSet
from Constants import Constants
from dependencies.Boruta import BorutaPy
from Clustering import *
import re
import copy

#sklearn dependencies
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, adjusted_rand_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import resample
from sklearn.cluster import FeatureAgglomeration

#the external libraries emit a very large number of deprecation warnings, so silence them
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)




Using TensorFlow backend.


In [2]:
# Build the PatientSet object that holds all of the patient info.
db = PatientSet()

# Attach extra derived fields to the object so that db.to_dataframe can hand
# them back later in a single dataframe (one-hot encoding and labels handled there).

# total GTV volume per patient, stored as a single-column 2D array
per_patient_gtv_volume = [
    np.sum([gtv.volume for gtv in patient_gtvs])
    for patient_gtvs in db.gtvs
]
db.t_volumes = np.array(per_patient_gtv_volume).reshape(-1, 1)

# output of tsim_prediction for this patient set (helper comes from a star import)
db.tsimdoses = tsim_prediction(db)

# True where the laterality code is 'B'
db.bilateral = (db.lateralities == 'B')

# row-wise sum of db.volumes
db.total_volumes = db.volumes.sum(axis=1)

# outcome flags built from feeding tube + aspiration:
#   toxicity  -> the sum is above 0
#   xerostima -> the sum is above 1
# NOTE(review): if feeding_tubes/aspiration are boolean arrays the sum stays
# boolean and "> 1" can never be True -- confirm their dtype. The attribute
# name "xerostima" is kept as-is because other code may read it.
complication_score = db.feeding_tubes + db.aspiration
db.toxicity = complication_score > 0
db.xerostima = complication_score > 1

  mean_tumor_distances /= tumor_volume
  tumor_position /= tumor_volume


error reading tumor volume for  10091
error reading tumor volume for  10148






patient data loaded...



In [68]:
# ---- experiment configuration ----

# names of PatientSet attributes to use as outcomes
toxicities_to_test = ['toxicity']

# Candidate features for feature selection; every name should be a field on the PatientSet.
# These are never feature-clustered:
unclusterable_features = ['t_volumes', 'bilateral', 'total_volumes']
# These are feature-clustered (each one on its own) when feature_clustering is set:
clusterable_features = ['tumor_distances', 'volumes', 'tsimdoses']

# Features used only for the selection step (train) versus the ones actually
# used afterwards (test). Meant to be some combination of actual and predicted dose.
train_features = [] #['doses']
test_features = [] #['tsimdoses']

# How many times to resample while doing feature selection.
# With a value of 1 only the first result is used.
n_samples = 500

# rescaling function applied to the train and test dataframes (None skips rescaling)
df_rescale = Metrics.normalize

# number of patient clusters; passed as both the minimum and the maximum of the search
n_clusters = 2

# number of cpus to use when running the feature selection
n_jobs = 8

# Set to None to leave the features unclustered; otherwise give the number of
# clusters to group them into. Should be < 33 as of writing, since that is the
# total number of included organs.
feature_clustering = None

# directory prefix for saved results; use None to disable saving
save_root = 'data/clustering_results/'


In [None]:
%load_ext autoreload
%autoreload 2
from Clustering import *
#our actual experiment, try to find correlations using the boruta method and such
feature_list = []
for tox_name in toxicities_to_test:
    print(tox_name)
    toxicity = getattr(db, tox_name) > 0
    
    #remove eyeball stuff from the candidates since those are missing in some patients
    #and it messe100s up the feature selection due to that noise
    organs = copy.copy(Constants.organ_list)
    for o in Constants.organ_list:
        if re.search('Eyeball', o) is not None:
            organs.remove(o)
    
    train, test = get_train_test_datasets(db, unclusterable_features, 
                                          clusterable_features, 
                                          train_features, 
                                          test_features,
                                         organs,
                                         feature_clustering)
    
    if df_rescale is not None:
        train = df_rescale(train)
        test = df_rescale(test)
    selection_clusterer = BestClusterer()
#     selection_clusterer  = FClusterer(n_clusters, dist_func = l2)
    feature_selector = FeatureClusterSelector(
        n_samples = n_samples,
        model = selection_clusterer,
        n_jobs = n_jobs).fit(train, toxicity)
    to_use = feature_selector.transform(test)
    labels = feature_selector.predict_labels(train, toxicity)
    print()
    print()
    print(get_contingency_table(labels, toxicity))
    print(fisher_exact_test(labels, toxicity))
    print('number of features: ', to_use.shape[1])

    #we're going to try a bunch of different clusterings and look at the best result
    clustering = get_optimal_clustering(to_use.values, 
                                        toxicity,
                                        metric = cluster_metric,
                                       min_clusters = n_clusters,
                                       max_clusters = n_clusters)
    print(clustering[1].method)
    print(get_contingency_table(clustering[0], toxicity))
    print('correlation: ', clustering[1].correlation)
    print('rand score: ', clustering[1].rand_score,'\n')

    to_use['cluster_labels'] = clustering[0]
#     print(best_features.columns)
    to_use.index.rename('Dummy.ID', inplace = True)
    feature_list.append(to_use)
    if save_root is not None:
        n_best_clusters = len(set(clustering[0]))
        to_use.to_csv(save_root
                     + 'boruta_features_k='
                     + str(n_best_clusters)
                     + '_p=' + '{:.3e}'.format(clustering[1].correlation)
                     + '_toxicity=' + tox_name + '.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
toxicity


In [None]:
lambda x: x #%load_ext autoreload
%autoreload 2
from Clustering import *

#get all the features found before and put them together
combined_df = feature_list[0]
if len(feature_list) > 1:
    for i in range(1, len(feature_list)):
        df2 = feature_list[i]
        to_drop = list(set(combined_df.columns).intersection(set(df2.columns)))
        if len(to_drop) == df2.shape[1]:
            continue
        df2 = df2.drop(to_drop, axis = 1)
        combined_df = pd.merge(combined_df, df2, on = 'Dummy.ID')
combined_df.drop('cluster_labels', axis = 1, inplace = True)
print(combined_df.columns)
combined_clusters = get_optimal_clustering(combined_df.values, db.toxicity,
                                    metric = cluster_metric,
                                    min_clusters = n_clusters,
                                    max_clusters = n_clusters)
print(combined_clusters[1].method)
print(get_contingency_table(combined_clusters[0], toxicity))
print('correlation: ', combined_clusters[1].correlation)
print('rand score: ', combined_clusters[1].rand_score, '\n')
combined_df['cluster_labels'] = combined_clusters[0]

In [None]:
# write the combined, cluster-labelled feature frame to disk unless saving is disabled
if save_root is not None:
    combined_csv_path = save_root + 'metaClusteringBootstrapped500MinmaxBest.csv'
    combined_df.to_csv(combined_csv_path)