In [10]:
%matplotlib inline
from data_tools import *
from algorithms import *
from plot_lib import *
from nets import *
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from sklearn.metrics import log_loss
import numpy as np
import tensorflow as tf
import code 

In [11]:
# Paths
path_train_data = "/Volumes/MoritzBertholdHD/CellData/Experiments/Ex1/PreparedData/all_channels_80_80_full_no_zeros_in_cells.npy"
path_train_labels = "/Volumes/MoritzBertholdHD/CellData/Experiments/Ex1/PreparedData/labels_80_80_full_no_zeros_in_cells.npy"
path_test_data = "/Volumes/MoritzBertholdHD/CellData/Experiments/Ex2/PreparedData/all_channels_80_80_full_no_zeros_in_cells.npy"
path_test_labels = "/Volumes/MoritzBertholdHD/CellData/Experiments/Ex2/PreparedData/labels_80_80_full_no_zeros_in_cells.npy"

print "Loading training and test data"
X_train = np.array(loadnumpy(path_train_data), dtype = np.uint8).astype('float32')[:,:,::2,::2]
y_train = np.load(path_train_labels)[:,0]
X_test = np.array(loadnumpy(path_test_data), dtype = np.uint8).astype('float32')[:,:,::2,::2]
y_test = np.load(path_test_labels)[:,0]
print "done"

Loading training and test data
done


In [12]:
# Remove class 5 from Data:
print "Removing the last class for comparison with cell profiler"
X_train = X_train[y_train!=4, :]
y_train = y_train[y_train!=4]
X_test = X_test[y_test!=4, :]
y_test = y_test[y_test!=4]
print "Distribution train classes", np.unique(y_train, return_counts=True)
print "Distribution test classes", np.unique(y_test, return_counts=True)
print "done"

Removing the last class for comparison with cell profiler
Distribution train classes (array([0, 1, 2, 3]), array([ 3376,  1433,  8270, 11112]))
Distribution test classes (array([0, 1, 2, 3]), array([ 4137,  1833, 11191, 14506]))
done


In [13]:
print "Normalizing data per channel"
max_ch1 = float(np.max(X_train[:, 0, : , :]))
max_ch2 = float(np.max(X_train[:, 1, : , :]))
max_ch3 = float(np.max(X_train[:, 2, : , :]))
max_ch4 = float(np.max(X_train[:, 3, : , :]))
X_train[:, 0, : , :] /= max_ch1
X_train[:, 1, : , :] /= max_ch2
X_train[:, 2, : , :] /= max_ch3
X_train[:, 3, : , :] /= max_ch4
X_test[:, 0, : , :] /= max_ch1
X_test[:, 1, : , :] /= max_ch2
X_test[:, 2, : , :] /= max_ch3
X_test[:, 3, : , :] /= max_ch4
print "------------ Check Data -------------"
print "Trainingdata shape = ", X_train.shape
print "Traininglabels shape = ", y_train.shape
print "Testdata shape = ", X_test.shape
print "Testlabels shape = ", y_test.shape
print "Max val: ", np.max(X_train[:,0,:,:])
print "Max val: ", np.max(X_train[:,1,:,:])
print "Max val: ", np.max(X_train[:,2,:,:])
print "Max val: ", np.max(X_train[:,3,:,:])
print "Max val: ", np.max(X_test[:,0,:,:])
print "Max val: ", np.max(X_test[:,1,:,:])
print "Max val: ", np.max(X_test[:,2,:,:])
print "Max val: ", np.max(X_test[:,3,:,:])
print "-------------------------------------"
print "done"

Normalizing data per channel
------------ Check Data -------------
Trainingdata shape =  (24191, 4, 40, 40)
Traininglabels shape =  (24191,)
Testdata shape =  (31667, 4, 40, 40)
Testlabels shape =  (31667,)
Max val:  1.0
Max val:  1.0
Max val:  1.0
Max val:  1.0
Max val:  1.0
Max val:  1.0
Max val:  1.0
Max val:  1.0
-------------------------------------
done


In [14]:
print "Reshaping data for convNet"
X_train = X_train.reshape(X_train.shape[0], X_train.shape[3], X_train.shape[2], X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], X_test.shape[3], X_test.shape[2], X_test.shape[1])
print "done"
print "Training shape", X_train.shape
print "Test shape", X_test.shape

Reshaping data for convNet
done
Training shape (24191, 40, 40, 4)
Test shape (31667, 40, 40, 4)


In [15]:
def merge_several_folds_mean(data, nfolds):
    """Return the element-wise mean of the first ``nfolds`` entries of ``data``.

    Args:
        data: sequence of equally shaped array-likes, one per fold.
        nfolds: number of leading entries of ``data`` to average (>= 1).

    Returns:
        A new numpy array with the shape of ``data[0]``. Floating-point (and
        complex) inputs keep their dtype; any other dtype is accumulated in
        float64 so the final division is neither truncated nor rejected by an
        integer accumulator. The inputs are never modified.

    Raises:
        ValueError: if ``nfolds`` is smaller than 1.
        IndexError: if ``data`` holds fewer than ``nfolds`` entries.
    """
    if nfolds < 1:
        raise ValueError("nfolds must be at least 1")

    first = np.asarray(data[0])
    if np.issubdtype(first.dtype, np.inexact):
        acc_dtype = first.dtype
    else:
        acc_dtype = np.float64
    # np.array always copies here, so the caller's first fold is left untouched.
    total = np.array(first, dtype=acc_dtype)
    for i in range(1, nfolds):
        total += np.asarray(data[i])
    total /= nfolds
    return total

def get_validation_predictions(train_data, predictions_valid):
    """Collect one validation prediction per training sample, in index order.

    Looks up ``predictions_valid[k]`` for every ``k`` in
    ``range(len(train_data))`` and returns the results as a new list.
    """
    return [predictions_valid[idx] for idx in range(len(train_data))]

def run_cross_validation_create_models(nfolds, X_train, X_test, y_train):
    """Train one ``covNetSimple()`` model per K-fold split of the training data.

    For every fold a fresh model is fitted on the training part (with early
    stopping on the validation loss), the held-out part is predicted once, and
    log-loss, a confusion-matrix plot and accuracy are reported for it.

    Args:
        nfolds: number of cross-validation folds.
        X_train: training samples; indexed along axis 0 with the fold indices.
        X_test: not used by this function; kept so existing calls keep working.
        y_train: labels aligned with ``X_train``.

    Returns:
        ``(info_string, models)``: a summary string containing the
        size-weighted average validation log-loss, the fold count and the
        epoch limit, and the list of fitted models (one per fold).
    """
    # Training hyper-parameters.
    batch_size = 8
    nb_epoch = 12
    random_state = 51
    class_names = ["0", "1", "2", "3"]

    train_data = X_train
    train_target = y_train

    kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=random_state)
    sum_score = 0
    accuracies = 0
    models = []
    for num_fold, (train_index, test_index) in enumerate(kf, 1):
        model = covNetSimple()
        # Fold-local names, so the function arguments are not shadowed.
        X_fold = train_data[train_index]
        Y_fold = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: {} {}'.format(len(X_fold), len(Y_fold)))
        print('Split valid: {} {}'.format(len(X_valid), len(Y_valid)))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=5, verbose=0),
        ]

        # Class weighting stays disabled (class_weight=None), as before.
        model.fit(X_fold, Y_fold, batch_size=batch_size, nb_epoch=nb_epoch, class_weight=None, shuffle=True, verbose=2, validation_data=(X_valid, Y_valid), callbacks=callbacks)

        # Predict the held-out fold once and reuse the result for every metric.
        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: {}'.format(score))
        # Weight by fold size so the final average is per sample, not per fold.
        sum_score += score * len(test_index)

        # Most probable class per validation sample.
        y_pred = np.argmax(predictions_valid, axis=1).astype(int)
        y_true = Y_valid.astype(int)
        plotNiceConfusionMatrix(y_true, y_pred, class_names)
        print(y_pred.shape)
        print(Y_valid.shape)
        acc = accuracy(y_true, y_pred)
        print('Accuracy is: {}'.format(acc))

        accuracies += acc
        models.append(model)

    score = sum_score / len(train_data)
    print('Log_loss train independent avg: {}'.format(score))

    # float() guards against integer division under Python 2.
    # NOTE(review): the "in percent" wording assumes accuracy() returns a
    # percentage; confirm against its definition.
    final_accuracy = accuracies / float(nfolds)
    print('Accuracy train independent avg in percent: {}'.format(final_accuracy))

    info_string = 'loss_' + str(score) + '_folds_' + str(nfolds) + '_ep_' + str(nb_epoch)
    return info_string, models

In [16]:
def process_test_with_cross_val(info_string, models, X_test):
    """Average the test-set predictions of all cross-validation models.

    Args:
        info_string: not used for the result; accepted so existing calls keep
            working.
        models: non-empty sequence of fitted models, each exposing
            ``predict(data, batch_size=..., verbose=...)``.
        X_test: test samples handed unchanged to every model's ``predict``.

    Returns:
        The element-wise mean of the per-model predictions, as computed by
        ``merge_several_folds_mean``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    batch_size = 16
    nfolds = len(models)
    if nfolds == 0:
        raise ValueError("models must contain at least one trained model")

    yfull_test = []
    for num_fold, model in enumerate(models, 1):
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        yfull_test.append(model.predict(X_test, batch_size=batch_size, verbose=2))

    test_res = merge_several_folds_mean(yfull_test, nfolds)

    print('Result on test data done: {}'.format(test_res.shape))
    return test_res

In [17]:
num_folds = 4
print "Training model with num_folds = ", num_folds
info_string, models = run_cross_validation_create_models(num_folds, X_train, X_test, y_train)
print "done"

Training model with num_folds =  4
Start KFold number 1 from 4
('Split train: ', 18143, 18143)
('Split valid: ', 6048, 6048)


ValueError: Error when checking model input: expected convolution2d_input_3 to have shape (None, 80, 80, 4) but got array with shape (18143, 40, 40, 4)

In [None]:
print "-------------------------------------"
print "Evaluation on test data:"
prediction = process_test_with_cross_val(info_string, models, X_test)
y_pred = np.argmax(prediction, axis=1)
print "The final accuracy on test data is " + str(accuracy(y_pred, y_test)) + "%."
class_names = ["0", "1", "2", "3"]
plotNiceConfusionMatrix(y_test, y_pred, class_names)