In [1]:
import pickle
import sys
from os import environ

import numpy as np
import pandas as pd

from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, Normalizer, binarize
from sklearn.svm import LinearSVC

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns



First, we check that the expected environment variables were passed to the notebook. 

In [None]:
# Environment variables that carry the paths of the two input files. The
# notebook cannot do anything useful without them, so stop early if any is
# missing.
REQUIRED_ENVIRONMENT_VARIABLES = ('clinical_data', 'gene_expression_data')

missing_variables = [
    name for name in REQUIRED_ENVIRONMENT_VARIABLES if name not in environ
]

# Report every missing variable before exiting, so the user can fix them all
# in one go instead of discovering them one run at a time.
for name in missing_variables:
    print("Please set the environment variable {}".format(name))

if missing_variables:
    sys.exit(1)

Next, we read in the patient and gene expression data. We have to mash up the data a bit so that we can join the two datasets together.

In [6]:
# Load the patient data. Only the sample id and the Gleason score are read;
# the sample id is selected as the index by name, so the result does not
# depend on the order of the columns in the file.
patients_file = environ['clinical_data']
patients = pd.read_csv(patients_file,
    sep="\t",
    usecols=['sample', 'gleason_score'],
    index_col='sample')

# Read in gene expression data, indexed by gene_id (the remaining columns are
# the samples).
gene_expression_file = environ['gene_expression_data']
gene_expression = pd.read_csv(gene_expression_file,
    sep='\t',
    index_col='gene_id')

# Transpose the gene expression data so that each row is a sample and can be
# joined with the patients table.
gene_expression = gene_expression.T

# Use only the first 12 characters in the sample id -- the rest is unknown.
gene_expression.index = gene_expression.index.str.slice(0, 12)

# Group by the (now truncated) sample id and take the first record so that we
# remove any duplicate patients. This has to happen AFTER the truncation:
# ids that differ only beyond the 12th character collapse to the same value,
# and a duplicated id would duplicate that patient's row in the join below.
gene_expression = gene_expression.groupby(level=0).first()

# Name the index "sample". Now our patients and gene expression data frames
# have the same index names.
gene_expression.index.name = "sample"

# Join our tables on the sample index (keeps every row of `patients`).
patient_gene_expression = patients.join(gene_expression)

NameError: name 'config' is not defined

Define the feature matrix and response variables we will train on. 

In [7]:
# Feature matrix: every column except the response. `axis=1` is required --
# without it `drop` looks for a ROW labelled 'gleason_score' and fails.
X = patient_gene_expression.drop('gleason_score', axis=1).values

# Binary response: True when the Gleason score is 8 or higher. Stored as a
# plain array so that the positional fold indices used later (`Y[train]`)
# cannot be mistaken for index labels.
# NOTE(review): a missing gleason_score compares as False here, i.e. such a
# patient is labelled as "below 8" -- confirm that this is intended.
Y = (patient_gene_expression.gleason_score >= 8).values

NameError: name 'patient_gene_expression' is not defined

Define our pipeline that we're going to use for hyperparameter selection, cross-validation, and model building. Then specify the parameter distributions that we're going to search across. 

In [None]:
# Seed handed to every stochastic estimator below so that re-running the
# notebook reproduces the same models and scores.
RANDOM_SEED = 42

# Define our pipeline that we're going to use for hyperparameter selection,
# cross-validation, and model building. Steps run in order: impute missing
# values with the column mean, normalize, select features with a linear SVM,
# then classify with a random forest.
pipeline = Pipeline([
    ('imputer', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('normalizer', Normalizer()),
    ('feature_selection', SelectFromModel(LinearSVC(random_state=RANDOM_SEED))),
    ('random_forest', RandomForestClassifier(n_estimators=1000,
                                             random_state=RANDOM_SEED))
])

# Specify the parameter grid that we're going to search across. The
# "<step>__<parameter>" keys address parameters of the 'random_forest' step.
# NOTE(review): max_features goes up to 2**11 = 2048; this presumably relies
# on the feature_selection step keeping at least that many columns -- confirm.
parameter_grid = {
    "random_forest__max_depth": [5, 10, 15, 20, 25],
    "random_forest__max_features": [2**n for n in range(3, 12)]
}

# Define a grid search across the parameter grid for our pipeline.
grid_search = GridSearchCV(pipeline,
    param_grid=parameter_grid,
    n_jobs=8,
    verbose=2)

Perform K-fold nested cross-validation to select hyperparameters using the grid search method. 

In [None]:
# Number of outer folds. Note that the grid-search plot further down lays its
# panels out on a fixed 2x3 grid, i.e. one panel per fold for 6 folds.
N_FOLDS = 6

# Per-fold records; they are converted to data frames once the loop is done.
cross_validation_results_list = []
grid_search_results_list = []
support_list = []

# Outer loop of the nested cross-validation: the grid search picks the
# hyperparameters using the training part only, and the held-out part of the
# fold is used to evaluate the selected model.
for fold, (train, test) in enumerate(StratifiedKFold(Y, n_folds=N_FOLDS)):

    print("Iterating through fold #{} of {}.".format(fold+1, N_FOLDS))

    # Search for best parameters using training data.
    grid_search.fit(X[train], Y[train])

    # Save the score of every parameter combination tried in this fold.
    # 'fold' is zero-based here (and in `support` below); the per-fold
    # grid-search plot filters on this zero-based value.
    for grid_score in grid_search.grid_scores_:
        grid_search_result = pd.Series(grid_score.parameters)
        grid_search_result['score'] = grid_score.mean_validation_score
        grid_search_result['fold'] = fold
        grid_search_results_list.append(grid_search_result)

    # Select the best estimator.
    model = grid_search.best_estimator_

    # Record which features the feature_selection step kept, together with
    # the winning hyperparameters for this fold.
    support = {
        'fold': fold,
        'support': model.named_steps['feature_selection'].get_support()
    }
    support.update(grid_search.best_params_)
    support_list.append(support)

    # Predict class probabilities for the held-out samples; column 1 is used
    # as the score of the positive class.
    probabilities = model.predict_proba(X[test])
    positive_scores = probabilities[:, 1]

    # Calculate false/true positive rates and precision/recall.
    false_positive_rate, true_positive_rate, roc_thresholds = roc_curve(Y[test], positive_scores)
    precision, recall, pr_thresholds = precision_recall_curve(Y[test], positive_scores)

    # NOTE: unlike the two records above, 'fold' is one-based here.
    metrics = {
        'fold': fold+1,
        'false_positive_rate': false_positive_rate,
        'true_positive_rate': true_positive_rate,
        'area_under_curve': auc(false_positive_rate, true_positive_rate),
        'precision': precision,
        'recall': recall,
        'roc_thresholds': roc_thresholds,
        'precision_recall_thresholds': pr_thresholds
    }

    # Add our hyperparameters to our results.
    metrics.update(grid_search.best_params_)

    # Keep this fold's record so that we can track the selected parameters
    # and the evaluation metrics together.
    cross_validation_results_list.append(metrics)

# Convert our results to data frames for easy processing.
support_results = pd.DataFrame(support_list)
cross_validation_results = pd.DataFrame(cross_validation_results_list)
grid_search_results = pd.DataFrame(grid_search_results_list)

#### Model parameters per fold

In [None]:
# One panel per fold on a 2x3 grid; columns share the x axis and rows share
# the y axis.
fig, axes = plt.subplots(2, 3, sharex='col', sharey='row')

for fold, ax in enumerate(axes.flatten()):

    # Look at the search results for this fold, reshaped so that rows are
    # max_depth values and columns are max_features values.
    fold_grid_search_results = grid_search_results[grid_search_results.fold == fold].\
        drop('fold', axis=1).\
        pivot(index='random_forest__max_depth',
              columns='random_forest__max_features')

    # The pivoted columns are hierarchical; the second level carries the
    # max_features values. Build the coordinate grids for the contour plot.
    x, y = np.meshgrid(fold_grid_search_results.columns.levels[1].values,
            fold_grid_search_results.index.values)

    z = fold_grid_search_results.values

    ax.contourf(x, y, z)

    # max_features was searched over powers of two, hence the log2 axis.
    ax.set_xscale('log', basex=2)

fig.suptitle("Random Forest Grid Search Results Per Fold")
fig.text(0.5, 0.02, 'Feature Count', ha='center')
fig.text(0.04, 0.5, 'Depth', va='center', rotation='vertical')
fig.savefig("random-forest-parameters-per-fold.png")

#### Receiver operating characteristic curve per fold

In [None]:
# Draw one ROC curve per cross-validation fold on a single set of axes.
fig, ax = plt.subplots()

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')

for _, row in cross_validation_results.iterrows():
    ax.plot(row.false_positive_rate, row.true_positive_rate,
            label="Fold {} (AUC = {:.2f})".format(row.fold, row.area_under_curve))

ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.legend(loc="lower right")
fig.suptitle("Receiver operating characteristic curve per fold")

#### Precision and Recall vs Threshold per Fold

In [None]:
# Plot precision (red) and recall (blue) against the decision threshold, one
# pair of lines per cross-validation fold.
fig, ax = plt.subplots()

for position, (_, row) in enumerate(cross_validation_results.iterrows()):
    # Only the first fold's lines get a legend entry; a label starting with
    # an underscore keeps the remaining folds out of the legend.
    precision_label = "Precision" if position == 0 else "_nolegend_"
    recall_label = "Recall" if position == 0 else "_nolegend_"

    # The last precision/recall entry is dropped so that both arrays line up
    # with the thresholds array.
    ax.plot(row.precision_recall_thresholds, row.precision[:-1], 'r',
            label=precision_label)
    ax.plot(row.precision_recall_thresholds, row.recall[:-1], 'b',
            label=recall_label)

ax.set_xlabel("Threshold")
ax.set_ylabel("Precision / Recall")
ax.legend(loc="best")
fig.suptitle("Precision and Recall vs Threshold per Fold")

Finally, perform a grid search using all available data and save the resulting model. 

In [None]:
# File that the final, fitted pipeline is written to.
MODEL_OUTPUT_FILE = 'pipeline.pickle'

# Run the grid search once more, this time on all available data.
models = grid_search.fit(X, Y)

# Persist only the best pipeline found by the search. The file is opened in
# binary mode, as pickle requires.
with open(MODEL_OUTPUT_FILE, 'wb') as model_file:
    pickle.dump(models.best_estimator_, model_file)