In [42]:
import numpy as np
import itertools as it
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [12]:
def tuneHyperparameters(classifier, training_set_X, training_set_Y, hyperparameters, number_of_folds, type_of_score):
    """This function runs k-fold cross validation on given set of parameters and returns best combo of parameters (according to specified model performance metric)
    
    Parameters:
    ----------
    -classifier: base classifer, parameters will be set during this function
    -training_set_X: X data
    -training_set_Y: Y data
    -hyperparameters: dictionary of parameter name to options
    -number_of_folds: number of k-fold folds
    -type_of_score: performance metric used to compare hyperparameters, options are "recall", "precision", "f1", or "accuracy"
    
    Returns: 
    -------
    Dictionary of best parameter values, for example {'C': 2, 'penalty': 'l2'}
    
    Example:
    --------
    parameters = {'C': [3,2,5,6], 'penalty': ['l2']}
    tuneHyperparameters(LogisticRegression(), X, Y, parameters, 5, 'accuracy')
    """
    
    kf = KFold(n_splits= number_of_folds)
    kf.get_n_splits(training_set_X)
    
    allNames = sorted(hyperparameters)
    parameter_combos = it.product(*(hyperparameters[Name] for Name in allNames))
    
    metrics = []
    params = []
    for hyperparameter_combo in parameter_combos:
        params.append(hyperparameter_combo)
        for p in range(len(allNames)):
            classifier.set_params(**{allNames[p]: hyperparameter_combo[p]})
              
        metrics.append(kFoldCrossValidation(classifier, training_set_X, training_set_Y, number_of_folds, type_of_score, kf))
    print metrics
    best_params = params[metrics.index(max(metrics))]
    return {allNames[i]: best_params[i] for i in range(len(best_params))}
  

In [14]:
def kFoldCrossValidation(classifier, training_set_X, training_set_Y, number_of_folds, type_of_score = 'all', kf = None):
    """Function that returns cross-validated model performance score
    
    Parameters:
    ----------
    -classifer: supervised learning model
    -training_set_X: the X data 
    -training_set_Y: the Y data 
    -number_of_folds: number of cross validation folds
    -type_of_score: performance metric to return, options are "recall", "precision", "f1", "accuracy", or "all"
    -kf: folds to consider
    
    Return:
    ------
    Float or list depending on type_of_score argument representing the model's performance
    
    Example:
    --------
    kFoldCrossValidation(LogisticRegression(penalty='l2', tol=0.0001, C=1.0) , X, Y, 5, 'recall')
    """
    if kf == None:
        kf = KFold(n_splits= number_of_folds)
        kf.get_n_splits(training_set_X)

    precision = []
    recall = []
    f1_score = []
    accuracy = []

    for train_index, test_index in kf.split(training_set_X):
        X_train, X_validation = training_set_X[train_index], training_set_X[test_index]
        Y_train, Y_validation = training_set_Y[train_index], training_set_Y[test_index]

        classifier.fit(X_train, Y_train)

        y_pred = classifier.predict(X_validation)
        prec, rec, f1, sup = precision_recall_fscore_support(Y_validation, y_pred, average= "binary")
        acc = accuracy_score(Y_validation, y_pred)

        precision.append(prec)
        recall.append(rec)
        f1_score.append(f1)
        accuracy.append(acc)

    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1_score)
    mean_accuracy = np.mean(accuracy)
    
    if type_of_score == "accuracy":
        return mean_accuracy
    if type_of_score == "precision":
        return mean_precision
    if type_of_score == "f1":
        return mean_f1
    if type_of_score == "recall":
        return mean_recall
    else:
        return [mean_precision, mean_recall, mean_f1, mean_accuracy]



In [47]:
def test_model(training_set_X, test_set_X, training_set_Y, test_set_Y, model, hyperparameters):
    """Function that finds test set predictions 
    
    Parameters:
    ----------
    -model: base supervised learning model
    -hyperparameters: dictionary of parameter name to options, like {'C': 1.0, 'penalty': 'l2'}
    
    Return:
    ------
    Saves a csv called "results.csv"- first column is predicted, second is actual
    
    Example:
    -------
    test_model(trainX, testX, trainY, testY, LogisticRegression(), {'C': 1.0, 'penalty': 'l2'})
    """
    
    allNames = sorted(hyperparameters)
    for p in allNames:
        model.set_params(**{p: hyperparameters[p]})
    
    model.fit(training_set_X, training_set_Y)
    print model.feature_importances_
    y_pred = model.predict(test_set_X)
    probabilities = model.predict_proba(test_set_X)
    
    prec, rec, f1, sup = precision_recall_fscore_support(test_set_Y, y_pred, average= "binary")
    acc = accuracy_score(test_set_Y, y_pred)

    print "Accuracy is:", acc, "Recall is:", rec
    results = [y_pred, test_set_Y, probabilities[:,1]]
    numpy_results = np.transpose((results))
    np.savetxt("results_PET.csv", numpy_results, delimiter=",")



In [48]:
import pandas as pd
import numpy as np

data_train = pd.read_csv('ImputedMatrix_train.csv', header = 0)
data_test = pd.read_csv('ImputedMatrix_test.csv', header = 0)

data_train = data_train[['FDG_MaxTime', 'FDG_Delta', 'FDG_Mean','FDG_Std', 'AV45_MaxTime','AV45_Delta', 'AV45_Mean', 'AV45_Std','PIB_MaxTime','PIB_Delta', 'PIB_Mean', 'PIB_Std', 'Diagnostics']]
data_test = data_test[['FDG_MaxTime', 'FDG_Delta', 'FDG_Mean','FDG_Std', 'AV45_MaxTime','AV45_Delta', 'AV45_Mean', 'AV45_Std', 'PIB_MaxTime','PIB_Delta', 'PIB_Mean', 'PIB_Std','Diagnostics']]

training_set_Y = np.array(data_train['Diagnostics'].tolist())
test_set_Y = np.array(data_test['Diagnostics'].tolist())

del data_train['Diagnostics']
del data_test['Diagnostics']

training_set_X = np.matrix(data_train)
test_set_X = np.matrix(data_test)

best_params = tuneHyperparameters(RandomForestClassifier(), training_set_X, training_set_Y, {'n_estimators': [10,50,100,200,300,500]}, 5, 'recall')

test_model(training_set_X, test_set_X, training_set_Y, test_set_Y, RandomForestClassifier(), best_params)

[0.55352615717385145, 0.66295665420301897, 0.56083414116235064, 0.62564959344768234, 0.62318653926049028, 0.62323995489346551]
[ 0.01478254  0.1039612   0.38751827  0.07831245  0.01870226  0.05358777
  0.2422466   0.06382392  0.          0.00497629  0.01727738  0.01481132]
Accuracy is: 0.780487804878 Recall is: 0.764227642276
