In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from cdwiener import array_fptd
import os
import pandas as pd
import time
from datetime import datetime
import pickle
import yaml
import keras_to_numpy as ktnp
import glob

from kde_training_utilities import kde_load_data
from kde_training_utilities import kde_make_train_test_split

In [None]:
# CHOOSE ---------
method = "weibull_cdf_ndt" # ddm, linear_collapse, ornstein, full, lba
machine = 'x7'
# ----------------

# INITIALIZATIONS ----------------------------------------------------------------
# Per-method settings (data / model folders) are read from a local pickle.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("kde_stats.pickle", "rb") as stats_file:
    stats = pickle.load(stats_file)[method]

# safe_load only builds plain Python objects; yaml.load without an explicit
# Loader is deprecated / rejected by recent PyYAML releases.
with open("hyperparameters.yaml") as hyperparameter_file:
    dnn_params = yaml.safe_load(hyperparameter_file)

# Folder layout differs between the 'x7' machine and every other machine
if machine == 'x7':
    data_folder = stats["data_folder_x7"]
    model_path = stats["model_folder_x7"]
else:
    data_folder = stats["data_folder"]
    model_path = stats["model_folder"]

# Every run gets its own timestamped sub-folder: <model_type>_<method>_<timestamp>/
model_path += dnn_params["model_type"] + "_{}_".format(method) + datetime.now().strftime('%m_%d_%y_%H_%M_%S') + "/"

print('if it does not exist, make model path')

if not os.path.exists(model_path):
    os.makedirs(model_path)

# Copy hyperparameter setup into model path
if machine == 'x7':
    hyperparameter_source = "/media/data_cifs/afengler/git_repos/nn_likelihoods/hyperparameters.yaml"
else:
    hyperparameter_source = "/users/afengler/git_repos/nn_likelihoods/hyperparameters.yaml"

# os.system returns the exit status of `cp`; report a failed copy instead of ignoring it
if os.system("cp {} {}".format(hyperparameter_source, model_path)) != 0:
    print('WARNING: could not copy {} into {}'.format(hyperparameter_source, model_path))

# set up gpu to use
if machine == 'x7':
    os.environ["CUDA_DEVICE_ORDER"]= "PCI_BUS_ID"   # see issue #152
    # os.environ only accepts strings; the YAML value may have been parsed as an int
    os.environ["CUDA_VISIBLE_DEVICES"] = str(dnn_params['gpu_x7'])

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Load the training data
print('loading data.... ')

# X, y, X_val, y_val = kde_load_data(folder = data_folder, 
#                                    return_log = True, # Dont take log if you want to train on actual likelihoods
#                                    prelog_cutoff = 1e-7 # cut out data with likelihood lower than 1e-7
#                                   )

# X = np.array(X)
# X_val = np.array(X_val)
# --------------------------------------------------------------------------------

In [None]:
# MAKE MODEL ---------------------------------------------------------------------
print('Setting up keras model')

input_shape = 8 #X.shape[1]
model = keras.Sequential()

# One Dense layer per entry of dnn_params['hidden_layers'], each with the
# activation stored at the same position in dnn_params['hidden_activations'].
for i, n_units in enumerate(dnn_params['hidden_layers']):
    # Only the first layer is told the input width
    first_layer_kwargs = {"input_dim": input_shape} if i == 0 else {}
    model.add(keras.layers.Dense(units = n_units,
                                 activation = dnn_params["hidden_activations"][i],
                                 **first_layer_kwargs))

# Write model specification to yaml file (handle is closed on leaving the block)
spec = model.to_yaml()
with open(model_path + "model_spec.yaml", "w") as spec_file:
    spec_file.write(spec)


print('STRUCTURE OF GENERATED MODEL: ....')
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# Pick the loss named in the hyperparameters. An unknown name is reported here
# instead of leaving the model uncompiled and failing later during fitting.
if dnn_params['loss'] == 'huber':
    loss_fn = tf.losses.huber_loss
elif dnn_params['loss'] == 'mse':
    loss_fn = 'mse'
else:
    raise ValueError("Unsupported loss '{}': expected 'huber' or 'mse'".format(dnn_params['loss']))

model.compile(loss = loss_fn,
              optimizer = "adam",
              metrics = ["mse"])
# ---------------------------------------------------------------------------

In [None]:
def generate_samples(path, batch_size = 200000):
    """Endlessly yield shuffled (features, labels) batches from pickled data files.

    Every file matching ``path + '/data_*'`` is expected to unpickle to a
    pandas DataFrame (it is indexed with ``.iloc``). The last column is used
    as the label, all other columns as features.

    Parameters
    ----------
    path : str
        Folder containing the ``data_*`` pickle files.
    batch_size : int, optional
        Number of rows per yielded batch (default 200000).

    Yields
    ------
    tuple of numpy.ndarray
        ``(features, labels)`` with shapes ``(batch_size, n_cols - 1)`` and
        ``(batch_size, 1)``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, or if a full pass over the files
        produced no batch (every file has fewer than ``batch_size`` rows).
    FileNotFoundError
        If no file matches ``path + '/data_*'``.

    Notes
    -----
    Files are visited in a new random order on every pass and rows are
    shuffled per file. Only full batches are yielded; a trailing remainder
    of fewer than ``batch_size`` rows is skipped.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got {}'.format(batch_size))

    while True:
        files_ = glob.glob(path + '/data_*')
        if len(files_) == 0:
            # Without this check the loop would spin forever without yielding
            raise FileNotFoundError('no files matching {}'.format(path + '/data_*'))

        yielded_any = False
        for file_ in np.random.permutation(files_):
            # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
            with open(file_, 'rb') as f:
                data = pickle.load(f)

            n = data.shape[0]
            n_cols = data.shape[1]

            # Shuffle rows through an explicit permutation instead of shuffling
            # `data.values` in place, which only works when .values is a writable view.
            shuffled = data.iloc[np.random.permutation(n)]

            # Convert once per file; batches below are plain array slices
            features = shuffled.iloc[:, :(n_cols - 1)].to_numpy()
            labels = np.expand_dims(shuffled.iloc[:, (n_cols - 1)].to_numpy(), axis = 1)

            # The stop value n - batch_size + 1 keeps the last full batch
            # (also when n is an exact multiple of batch_size).
            for start in range(0, n - batch_size + 1, batch_size):
                yielded_any = True
                yield (features[start:(start + batch_size)],
                       labels[start:(start + batch_size)])

        if not yielded_any:
            # Every file was smaller than one batch: fail instead of looping forever
            raise ValueError('no file under {} holds at least batch_size = {} rows'.format(path, batch_size))

In [None]:
# FIT MODEL -----------------------------------------------------------------
# Trains `model` on batches streamed from generate_samples and keeps the
# returned training history in `history`.
print('Starting to fit model.....')

# Define callbacks
# NOTE(review): the three callbacks below are constructed, but the `callbacks`
# argument of fit_generator further down is commented out, so none of them is
# active in this run. All three monitor 'val_loss', and no validation data is
# passed to fit_generator in this cell.
ckpt_filename = model_path + "model.h5"

# Writes the model to ckpt_filename; save_best_only = False means it is not
# restricted to the best-scoring checkpoint.
checkpoint = keras.callbacks.ModelCheckpoint(ckpt_filename, 
                                             monitor = 'val_loss', 
                                             verbose = 1, 
                                             save_best_only = False)
                               
# Stops training once the monitored value has not improved for `patience` epochs
earlystopping = keras.callbacks.EarlyStopping(monitor = 'val_loss', 
                                              min_delta = 0, 
                                              verbose = 1, 
                                              patience = 2)

# Multiplies the learning rate by `factor` when the monitored value plateaus,
# never going below `min_lr`.
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', 
                                              factor = 0.1,
                                              patience = 1, 
                                              verbose = 1,
                                              min_delta = 0.0001,
                                              min_lr = 0.0000001)

# The generator never terminates, so the length of an epoch is set by
# steps_per_epoch (number of generator batches drawn per epoch).
# The training folder is hard-coded here rather than taken from `data_folder`.
history = model.fit_generator(generate_samples('/media/data_cifs/afengler/data/kde/' + \
                              'weibull_cdf/train_test_data_ndt_20000/'),
                              steps_per_epoch = 1000,
                              epochs = 50,
                              #batch_size = dnn_params["batch_size"], 
                              #callbacks = [checkpoint, reduce_lr, earlystopping], 
                              verbose = 1,
                              max_queue_size = 25)
# ---------------------------------------------------------------------------

In [None]:
import numpy as np
import glob
import keras

class DataGenerator(keras.utils.Sequence):
    """Batch provider for Keras, implemented as a ``keras.utils.Sequence``.

    Each batch is assembled from ``batch_size`` sample IDs: the features of a
    sample are read from ``'data/<ID>.npy'`` and its label is looked up in
    ``labels``. The labels of a batch are returned one-hot encoded with
    ``n_classes`` classes.

    NOTE(review): ``file_IDs``, ``n_channels`` and ``max_index`` are stored but
    do not influence the generated batches in the code below -- this class
    looks unfinished; confirm the intended per-file loading logic.
    """
    def __init__(self, 
                 list_IDs, 
                 file_IDs, # np.array
                 labels,
                 batch_size = 32,
                 dim = (32,32,32), 
                 n_channels = 1,
                 n_classes = 10, 
                 shuffle = True):
        """Store the configuration and build the initial sample order."""
        # Sample bookkeeping
        self.list_IDs = list_IDs
        self.file_IDs = file_IDs
        self.labels = labels

        # Batch layout
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes

        self.shuffle = shuffle
        # `labels` must expose .shape (e.g. a NumPy array)
        self.max_index = labels.shape[0]

        # Creates self.indexes, which __getitem__ slices
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        n_batches = np.floor(len(self.list_IDs) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Build and return the batch number ``index`` as ``(X, y)``."""
        # Positions (into list_IDs) that make up this batch
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        batch_positions = self.indexes[start:stop]

        # Translate positions into sample IDs
        batch_ids = []
        for position in batch_positions:
            batch_ids.append(self.list_IDs[position])

        # Reshuffle the file list on the first batch, or when the batch index
        # equals the number of labels.
        # NOTE(review): file_IDs is not read anywhere else in this class.
        if index == 0 or index == self.max_index:
            np.random.shuffle(self.file_IDs)

        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is enabled."""
        order = np.arange(len(self.list_IDs))
        # Explicit comparison kept on purpose: only values equal to True shuffle
        if self.shuffle == True:
            np.random.shuffle(order)
        self.indexes = order

    def __data_generation(self, list_IDs_temp):
        """Load the samples named in ``list_IDs_temp`` and return ``(X, y)``.

        X has shape ``(batch_size, *dim)``; y is the one-hot encoding of the
        looked-up labels.
        """
        # Buffers are allocated uninitialised; rows beyond len(list_IDs_temp)
        # are left as allocated.
        features = np.empty((self.batch_size, *self.dim))
        targets = np.empty((self.batch_size, 1))

        for row, sample_id in enumerate(list_IDs_temp):
            # One .npy file per sample
            features[row] = np.load('data/' + sample_id + '.npy')
            targets[row] = self.labels[sample_id]

        one_hot = keras.utils.to_categorical(targets, num_classes = self.n_classes)
        return features, one_hot

In [None]:
# SAVING --------------------------------------------------------------------
# print('Saving model and relevant data...')
# # Log of training output
# pd.DataFrame(history.history).to_csv(model_path + "training_history.csv")

# # Save Model
# model.save(model_path + "model_final.h5")

# # Extract model architecture as numpy arrays and save in model path
# __, ___, ____, = ktnp.extract_architecture(model, save = True, save_path = model_path)

# # Update model paths in model_path.yaml
# model_paths = yaml.load(open("model_paths.yaml"))
# model_paths[method] = model_path
# yaml.dump(model_paths, open("model_paths.yaml", "w"))
# ----------------------------------------------------------------------------

In [None]:
%%timeit -n 1 -r 1
t = pickle.load(open('/media/data_cifs/afengler/data/kde/' + \
                     'weibull_cdf/train_test_data_ndt_20000/data_77ff8ed6fa2411e9aea9073b18a43faf.pickle', 'rb'))

In [None]:
# Convert the object currently bound to `t` into a NumPy array.
t2 = np.array(t)

In [None]:
# Write the array to a scratch .npy file.
np.save('/media/data_cifs/afengler/data/tmp/npsavetest.npy', t2)

In [None]:
%%timeit -n 1 -r 1
# Time one load (1 loop, 1 repeat) of the .npy file written by the np.save cell.
test = np.load('/media/data_cifs/afengler/data/tmp/npsavetest.npy')

In [None]:
# Rebinds `t` to a 100000 x 10 frame of uniform random numbers.
t = pd.DataFrame(np.random.uniform(size = (100000, 10)))

In [None]:
# Create the batch generator over the training folder; being a generator,
# it reads nothing until next() is called on it.
my_gen = generate_samples(path = '/media/data_cifs/afengler/data/kde/' + \
                          'weibull_cdf/train_test_data_ndt_20000/')

In [None]:
t2 = next(my_gen)

In [None]:
t2

In [None]:
t2[0].to_numpy().shape

In [None]:
t2[1].values.shape

In [None]:
np.expand_dims(t2[1].to_numpy(), axis = 1).shape

In [None]:
# NOTE(review): this cell held only a dangling keyword-argument list, which is
# a SyntaxError when executed. It is kept as a comment for reference; presumably
# these are the keyword arguments of kde_make_train_test_split -- TODO confirm.
# (path = '',
#  p_train = 0.8,
#  n_files_out = 10,
#  file_in_list = 'all')

In [None]:
#glob.glob('*')

In [None]:
# List the data_* files currently present in the training folder.
glob.glob('/media/data_cifs/afengler/data/kde/weibull_cdf/train_test_data_ndt_20000/data_*')

In [2]:
# Run the project's train/test split helper on a single input file, asking for
# 10 output files. The lines below the call are the helper's captured progress
# messages and return value.
# NOTE(review): this is the only cell besides the imports with an execution
# count, i.e. the cells in between were not run in this session.
kde_make_train_test_split(path = '/media/data_cifs/afengler/data/kde/weibull_cdf/train_test_data_ndt_20000/',
                         n_files_out = 10,
                         file_in_list = ['data_e6f83854fa2111e999dcf5d97ddd6f15.pickle'])

check if we have a train and test sets already
read, concatenate and shuffle data
get training and test indices
writing to file...


'success'

In [None]:
# Display the contents of the pickle file that was passed to the split above.
pd.read_pickle('/media/data_cifs/afengler/data/kde/weibull_cdf/train_test_data_ndt_20000/data_e6f83854fa2111e999dcf5d97ddd6f15.pickle')