# Projeto de Classificação para Marinha do Brasil

## Autor: Vinícius dos Santos Mello (viniciusdsmello@poli.ufrj.br)

Laboratório de Processamento de Sinais - UFRJ

Laboratório de Tecnologia Sonar

Instituto de Pesquisas da Marinha - IPqM

In [1]:
import os
import pickle
import numpy as np
import time

from keras.utils import np_utils
from keras.models import load_model

from sklearn import preprocessing
from sklearn import metrics
from sklearn.externals import joblib

import matplotlib.pyplot as plt

from Functions import TrainParameters as trnparams
from Functions import TrainFunctions

import multiprocessing 

init_time = time.time()

m_time = time.time()
print 'Time to import all libraries: '+str(m_time-init_time)+' seconds'

analysis_name = 'StackedAutoEncoder'

# Enviroment variables
data_path = os.getenv('OUTPUTDATAPATH')
results_path = os.getenv('PACKAGE_NAME')

# paths to export results
base_results_path = '%s/%s'%(results_path,analysis_name)
pict_results_path = '%s/pictures_files'%(base_results_path)
files_results_path = '%s/output_files'%(base_results_path)

# For multiprocessing purpose
num_processes = multiprocessing.cpu_count()

# Read data
m_time = time.time()

# Database caracteristics
database = '4classes'
n_pts_fft = 1024
decimation_rate = 3
spectrum_bins_left = 400
development_flag = False
development_events = 400

# Check if LofarData has created...
if not os.path.exists('%s/%s/lofar_data_file_fft_%i_decimation_%i_spectrum_left_%i.jbl'%
                      (data_path,database,n_pts_fft,decimation_rate,spectrum_bins_left)):
    print 'No Files in %s/%s\n'%(data_path,database)
else:
    #Read lofar data
    [data,trgt,class_labels] = joblib.load('%s/%s/lofar_data_file_fft_%i_decimation_%i_spectrum_left_%i.jbl'%
                                           (data_path,database,n_pts_fft,decimation_rate,spectrum_bins_left))


    m_time = time.time()-m_time
    print 'Time to read data file: '+str(m_time)+' seconds'

    # correct format
    all_data = data
    all_trgt = trgt

    # turn targets in sparse mode
    from keras.utils import np_utils
    trgt_sparse = np_utils.to_categorical(all_trgt.astype(int))
    
    # Process data
    # unbalanced data to balanced data with random data creation of small classes

    # Same number of events in each class
    qtd_events_biggest_class = 0
    biggest_class_label = ''

    for iclass, class_label in enumerate(class_labels):
        if sum(all_trgt==iclass) > qtd_events_biggest_class:
            qtd_events_biggest_class = sum(all_trgt==iclass)
            biggest_class_label = class_label
        print "Qtd event of %s is %i"%(class_label,sum(all_trgt==iclass))
    print "\nBiggest class is %s with %i events"%(biggest_class_label,qtd_events_biggest_class)

    balanced_data = {}
    balanced_trgt = {}

    from Functions import DataHandler as dh
    m_datahandler = dh.DataHandlerFunctions()

    for iclass, class_label in enumerate(class_labels):
        if development_flag:
            class_events = all_data[all_trgt==iclass,:]
            if len(balanced_data) == 0:
                balanced_data = class_events[0:development_events,:]
                balanced_trgt = (iclass)*np.ones(development_events)
            else:
                balanced_data = np.append(balanced_data,
                                          class_events[0:development_events,:], 
                                          axis=0)
                balanced_trgt = np.append(balanced_trgt,(iclass)*np.ones(development_events))
        else:
            if len(balanced_data) == 0:
                class_events = all_data[all_trgt==iclass,:]
                balanced_data = m_datahandler.CreateEventsForClass(
                    class_events,qtd_events_biggest_class-(len(class_events)))
                balanced_trgt = (iclass)*np.ones(qtd_events_biggest_class)
            else:
                class_events = all_data[all_trgt==iclass,:]
                created_events = (m_datahandler.CreateEventsForClass(all_data[all_trgt==iclass,:],
                                                                     qtd_events_biggest_class-
                                                                     (len(class_events))))
                balanced_data = np.append(balanced_data,created_events,axis=0)
                balanced_trgt = np.append(balanced_trgt,
                                          (iclass)*np.ones(created_events.shape[0]),axis=0)
        
    all_data = balanced_data
    all_trgt = balanced_trgt

    # turn targets in sparse mode
    from keras.utils import np_utils
    trgt_sparse = np_utils.to_categorical(all_trgt.astype(int))

Using Theano backend.


Time to import all libraries: 2.38418579102e-05 seconds
Time to read data file: 1.07007789612 seconds
Qtd event of 0 is 12939
Qtd event of 1 is 29352
Qtd event of 2 is 11510
Qtd event of 3 is 23760

Biggest class is 1 with 29352 events
DataHandler Class: CreateEventsForClass
Original Size: (12939, 400)
DataHandler Class: CreateEventsForClass
Original Size: (29352, 400)
DataHandler Class: CreateEventsForClass
Original Size: (11510, 400)
DataHandler Class: CreateEventsForClass
Original Size: (23760, 400)


In [81]:
%time 
# Load train parameters

analysis_str = 'StackedAutoEncoder'
model_prefix_str = 'RawData'

trn_params_folder='%s/%s/%s_trnparams.jbl'%(results_path,analysis_str,analysis_name)
os.remove(trn_params_folder)
if not os.path.exists(trn_params_folder):
    trn_params = trnparams.SAENoveltyDetectionTrnParams(n_inits=1,
                                                       hidden_activation='tanh', # others tanh, relu, sigmoid, linear 
                                                       output_activation='linear',
                                                       n_epochs=500,
                                                       patience=30,
                                                       batch_size=128,
                                                       verbose=False)
    trn_params.save(trn_params_folder)
else:
    trn_params = trnparams.SAENoveltyDetectionTrnParams()
    trn_params.load(trn_params_folder)
    
# Choose how many fold to be used in Cross Validation
n_folds = 2
CVO = trnparams.NoveltyDetectionFolds(folder=results_path,n_folds=n_folds,trgt=all_trgt,dev=development_flag)
print trn_params.get_params_str()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs
1_inits_mapstd_norm_500_epochs_128_batch_size_tanh_hidden_activation_linear_output_activation


# Treinamento da 1 ª Camada - Variação de Neurônios

In [3]:
# Train example

# Choose neurons topology
max_n_neurons = 475
min_n_neurons = 0
neurons_step = 75

verbose = False

# Create neurons vector to be used in multiprocessing.Pool()
neurons_mat = [1,100,200,250,300,350,400,450]  #range(min_n_neurons, max_n_neurons, neurons_step)
print trn_params.get_params_str()
start_time = time.time()
for inovelty, novelty_class in enumerate(np.unique(trgt)):
    trn_data = all_data[all_trgt!=novelty_class]
    trn_trgt = all_trgt[all_trgt!=novelty_class]

    trn_trgt[trn_trgt>novelty_class] = trn_trgt[trn_trgt>novelty_class]-1
    
    if inovelty != 0:
        print ''
    print 'Novelty class: %i'%inovelty
    
    def trainNeuron(ineuron):
        n_folds = len(CVO[inovelty])
        for ifold in range(n_folds):
            #print 'Neuron value: %i - fold %i'%(ineuron, ifold)
            TrainFunctions.SAENoveltyTrainFunction(data=trn_data,
                                                       trgt=trn_data,
                                                       inovelty=inovelty,
                                                       ifold=ifold,
                                                       n_folds=n_folds,
                                                       n_neurons=ineuron,
                                                       trn_params=trn_params,
                                                       save_path=results_path,
                                                       layer = 1,
                                                       verbose=verbose,
                                                       dev=development_flag)
    
    
    # Start Parallel processing
    p = multiprocessing.Pool(processes=num_processes)
    
    # To train on multiple cores sweeping the number of neurons
    results = p.map(trainNeuron, neurons_mat)
            
    p.close()
    p.join()        

end_time = time.time() - start_time
print "It took %.3f seconds to perform the training"%(end_time)

2_inits_mapstd_norm_500_epochs_256_batch_size_tanh_hidden_activation_linear_output_activation
Novelty class: 0

Novelty class: 1

Novelty class: 2

Novelty class: 3
It took 5.950 seconds to perform the training


# Treinamento da 1ª Camada - Número de Neurônios definido


In [None]:
# Choose neurons topology
ineuron = 400

verbose = False

print trn_params.get_params_str()

start_time = time.time()
for inovelty, novelty_class in enumerate(np.unique(trgt)):
    trn_data = all_data[all_trgt!=novelty_class]
    trn_trgt = all_trgt[all_trgt!=novelty_class]

    trn_trgt[trn_trgt>novelty_class] = trn_trgt[trn_trgt>novelty_class]-1
    
    if inovelty != 0:
        print ''
    print 'Novelty class: %i'%inovelty
    
    # Array with folds to be trained in parallel
    folds = range(len(CVO[inovelty]))
    
    def trainFold(ifold):
        n_folds = len(CVO[inovelty])
        #print 'Neuron value: %i - fold %i'%(ineuron, ifold)
        TrainFunctions.SAENoveltyTrainFunction(data=trn_data,
                                                   trgt=trn_data,
                                                   inovelty=inovelty,
                                                   ifold=ifold,
                                                   n_folds=n_folds,
                                                   n_neurons=ineuron,
                                                   trn_params=trn_params,
                                                   save_path=results_path,
                                                   layer = 1, # Choose the layer to be trained
                                                   verbose=verbose,
                                                   dev=development_flag)
    
    
    # Start Parallel processing
    p = multiprocessing.Pool(processes=num_processes)
    
    # To train on multiple cores sweeping the number of neurons
    results = p.map(trainFold, folds)
            
    p.close()
    p.join()        

end_time = time.time() - start_time
print "It took %.3f seconds to perform the training"%(end_time)

1_inits_mapstd_norm_500_epochs_128_batch_size_tanh_hidden_activation_linear_output_activation
Novelty class: 0
Neuron: 400 - Fold 1 of 2 Folds -  Init 1 of 1 Inits
Neuron: 400 - Fold 2 of 2 Folds -  Init 1 of 1 Inits


In [None]:
# Reconstruction of Known Classes vs Reconstruction of Novelty - novelty detection for neural network
%matplotlib inline 

# generate analysis data
save_path=results_path

analysis_str = 'StackedAutoEncoder'
model_prefix_str = 'RawData'
verbose = False

#os.remove(analysis_file_name)

ineuron = 400

# Choose class to plot
iclass = 0
# Choose model
inovelty = 0

# Plot parameters
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['xtick.labelsize'] = 18
plt.rcParams['ytick.labelsize'] = 18
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['legend.handlelength'] = 3
plt.rcParams['legend.borderpad'] = 0.3
m_colors = ['b', 'r', 'g', 'y']
# Check if the analysis has already been performed
trn_params_folder='%s/%s/%s_trnparams.jbl'%(results_path,analysis_str,analysis_name)

if not os.path.exists(trn_params_folder):
    trn_params = trnparams.NNNoveltyDetectionTrnParams(n_inits=2,
                                                       hidden_activation='tanh',
                                                       output_activation='linear',
                                                       n_epochs=500,
                                                       patience=30,
                                                       batch_size=256,
                                                       verbose=False)
    trn_params.save(trn_params_folder)
else:
    trn_params = trnparams.NNNoveltyDetectionTrnParams()
    trn_params.load(trn_params_folder)

params_str = trn_params.get_params_str()
n_folds = 2
CVO = trnparams.NoveltyDetectionFolds(folder=results_path,n_folds=n_folds,trgt=all_trgt,dev=development_flag)

novelty_class = float(inovelty)

models = {}
outputs = {}
mean = {}
indexes = {}

n_folds = len(CVO[inovelty])

for ifold in range(n_folds):
    train_id, test_id = CVO[inovelty][ifold]
    
    # normalize data based in train set
    if trn_params.params['norm'] == 'mapstd':
        scaler = preprocessing.StandardScaler().fit(all_data[train_id,:])
    elif trn_params.params['norm'] == 'mapstd_rob':
        scaler = preprocessing.RobustScaler().fit(all_data[train_id,:])
    elif trn_params.params['norm'] == 'mapminmax':
        scaler = preprocessing.MinMaxScaler().fit(all_data[train_id,:])

    norm_data = scaler.transform(all_data)
   
    trgt_data = all_trgt
    trn_data = norm_data[trgt_data==iclass]
     
    if ifold == 0:
        diffSquared = np.zeros([len(CVO),trn_data.shape[0],trn_data.shape[1]])
    
    print 'Novelty class: %01.0f - neuron: %i - fold %i'%(novelty_class, ineuron, ifold)
    
    neurons_str = str(data.shape[1]) + 'x' + str(ineuron)
    model_str = '%s/%s/%s_%i_novelty_%i_folds_%s_%s_neurons'%(results_path,analysis_str,
                                                               model_prefix_str,inovelty,
                                                               n_folds,params_str,
                                                               neurons_str)
    
    file_name = '%s_fold_%i_model.h5'%(model_str,ifold)
    
    if verbose: 
        print file_name
    
    if not os.path.exists(file_name):
        print 'File %s does not exist'%file_name
        break
    models[ifold] = load_model(file_name)
    
    outputs[ifold] = models[ifold].predict(trn_data)
    diffSquared[ifold] = np.power((trn_data - outputs[ifold]), 2) 

mean = np.mean(np.mean(diffSquared, axis=0), axis=0)
indexes = np.argsort(mean)[::-1]

for ifold in range(len(CVO[inovelty])):
    train_id, test_id = CVO[inovelty][ifold]
    
    # normalize data based in train set
    if trn_params.params['norm'] == 'mapstd':
        scaler = preprocessing.StandardScaler().fit(all_data[train_id,:])
    elif trn_params.params['norm'] == 'mapstd_rob':
        scaler = preprocessing.RobustScaler().fit(all_data[train_id,:])
    elif trn_params.params['norm'] == 'mapminmax':
        scaler = preprocessing.MinMaxScaler().fit(all_data[train_id,:])

    norm_data = scaler.transform(all_data[test_id,:])
   
    trgt_data = all_trgt
    trn_data = norm_data[trgt_data[test_id]==iclass]
    points = trn_data.shape[0]
    # Number of dimensions to analyse (even number is better!)
    num_dim = 4
    fig, m_ax = plt.subplots(figsize=(20,20),nrows=2, ncols=2)
    for choose_index in range(num_dim):  
        ax = plt.subplot(2,2,choose_index+1)
        ax.plot(trn_data[:,indexes[choose_index]][:points], outputs[ifold][:,indexes[choose_index]][:points],
                m_colors[iclass]+".")
        plt.tight_layout()
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        ax.set_title('Input x Output - Dim %i'%(indexes[choose_index]),fontsize=18, fontweight='bold')
        plt.legend()
        plt.grid()    
#         #Save the figure
#         file_name = pict_results_path+'/'+current_analysis+'_first_layer_%i_neurons_%i_fold_'%(ineuron,ifold)+trn_params.get_params_str()+'.pdf'
#         plt.savefig(file_name)
print "Topology (%s)"%trn_params.get_params_str()

In [63]:
trn_data = all_data[all_trgt!=novelty_class]

# Treinamento da 2º Camada - Número de Neurônios definido

In [None]:
# Choose neurons topology

# hidden_neurons = [first_layer_dim, second_layer_dim]
hidden_neurons = [400, 250]

verbose = False

print trn_params.get_params_str()

start_time = time.time()
for inovelty, novelty_class in enumerate(np.unique(trgt)):
    trn_data = all_data[all_trgt!=novelty_class]
    trn_trgt = all_trgt[all_trgt!=novelty_class]

    trn_trgt[trn_trgt>novelty_class] = trn_trgt[trn_trgt>novelty_class]-1
    
    if inovelty != 0:
        print ''
    print 'Novelty class: %i'%inovelty
    
    # Array with folds to be trained in parallel
    folds = range(len(CVO[inovelty]))
    
    def trainFold(ifold):
        n_folds = len(CVO[inovelty])
        #print 'Neuron value: %i - fold %i'%(ineuron, ifold)
        TrainFunctions.SAENoveltyTrainFunction(data=trn_data,
                                                   trgt=trn_data,
                                                   inovelty=inovelty,
                                                   ifold=ifold,
                                                   n_folds=n_folds,
                                                   n_neurons=ineuron,
                                                   trn_params=trn_params,
                                                   save_path=results_path,
                                                   layer = 2, # Choose the layer to be trained
                                                   hidden_neurons = hidden_neurons,
                                                   verbose=verbose,
                                                   dev=development_flag)
    
    
    # Start Parallel processing
    p = multiprocessing.Pool(processes=num_processes)
    
    # To train on multiple cores sweeping the number of neurons
    results = p.map(trainFold, folds)
            
    p.close()
    p.join()        

end_time = time.time() - start_time
print "It took %.3f seconds to perform the training"%(end_time)

# Treinamento da 3ª Camada - Número de Neurônios definido

In [None]:
# Choose neurons topology

# hidden_neurons = [first_layer_dim, second_layer_dim]
hidden_neurons = [400, 250, 125]

verbose = False

print trn_params.get_params_str()

start_time = time.time()
for inovelty, novelty_class in enumerate(np.unique(trgt)):
    trn_data = all_data[all_trgt!=novelty_class]
    trn_trgt = all_trgt[all_trgt!=novelty_class]

    trn_trgt[trn_trgt>novelty_class] = trn_trgt[trn_trgt>novelty_class]-1
    
    if inovelty != 0:
        print ''
    print 'Novelty class: %i'%inovelty
    
    # Array with folds to be trained in parallel
    folds = range(len(CVO[inovelty]))
    
    def trainFold(ifold):
        n_folds = len(CVO[inovelty])
        #print 'Neuron value: %i - fold %i'%(ineuron, ifold)
        TrainFunctions.SAENoveltyTrainFunction(data=trn_data,
                                                   trgt=trn_data,
                                                   inovelty=inovelty,
                                                   ifold=ifold,
                                                   n_folds=n_folds,
                                                   n_neurons=ineuron,
                                                   trn_params=trn_params,
                                                   save_path=results_path,
                                                   layer = 3, # Choose the layer to be trained
                                                   hidden_neurons = hidden_neurons,
                                                   verbose=verbose,
                                                   dev=development_flag)
    
    
    # Start Parallel processing
    p = multiprocessing.Pool(processes=num_processes)
    
    # To train on multiple cores sweeping the number of neurons
    results = p.map(trainFold, folds)
            
    p.close()
    p.join()        

end_time = time.time() - start_time
print "It took %.3f seconds to perform the training"%(end_time)