# Projeto de Classificação para a Marinha do Brasil

## Autor: Natanael Junior (natmourajr@gmail.com)

Laboratório de Processamento de Sinais - UFRJ

Laboratório de Tecnologia Sonar

Instituto de Pesquisas da Marinha - IPqM

# Bibliotecas e leitura dos dados
As bibliotecas necessárias são importadas e os dados de LOFAR são lidos na célula a seguir.

In [32]:
import os
import pickle
import numpy as np
import time

# Stopwatch started before the scikit-learn imports so the first message
# reports how long those imports took.
init_time = time.time()

from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.externals import joblib

m_time = time.time()
print('Time to import all libraries: %1.5f seconds'%(m_time-init_time))

# Workspace locations come from environment variables.
outputpath = os.environ['OUTPUTDATAPATH']
main_analysis_path = os.environ['SONAR_WORKSPACE']
log_analysis_path = os.environ['PACKAGE_OUTPUT']
result_analysis_path = os.environ['PACKAGE_OUTPUT']+'/PCDSingleClassSVM'

# Read data: restart the stopwatch to time the file load.
m_time = time.time()

subfolder = '4classes'
n_pts_fft = 1024
decimation_rate = 3

# Full path of the LOFAR data file, built once and reused below.
lofar_data_file = outputpath+'/'+'LofarData_%s_%i_fft_pts_%i_decimation_rate.jbl'%(
    subfolder,n_pts_fft,decimation_rate)

# Stop here when the LOFAR file has not been created yet.
if not os.path.exists(lofar_data_file):
    print(lofar_data_file+' doesnt exist...please create it')
    exit()

# Read lofar data
[data,class_labels] = joblib.load(lofar_data_file)
m_time = time.time()-m_time
print('Time to read data file: %1.5f seconds'%m_time)

Time to import all libraries: 0.00012 seconds
Time to read data file: 1.58412 seconds


# Processamento dos dados
Os dados encontram-se no formato do MATLAB; por isso, precisam ser convertidos para o formato do Python.

In [33]:
# Process data...
# Build one data matrix and one target vector covering every run of every
# class.
#
# The per-run pieces are collected in lists and joined with a single
# concatenate call: np.append inside the loop would copy the whole
# accumulated array again on every iteration.
signal_chunks = []
target_chunks = []

for iclass, class_label in enumerate(class_labels):
    for irun in range(len(data[iclass])):
        run_signal = data[iclass][irun]['Signal']
        signal_chunks.append(run_signal)
        # one target (the class index) per column of the run signal
        target_chunks.append((iclass)*np.ones(run_signal.shape[1]))

all_data = np.concatenate(signal_chunks,axis=1)
all_trgt = np.concatenate(target_chunks,axis=0)

# transpose so that the first axis has one entry per target value
all_data = all_data.transpose()

# Balanceamento de Classes
Os dados encontram-se desbalanceados. Com isso, os classificadores podem se especializar em uma classe (gerando mais SVs para ela) e não se especializar nas outras.

Acessados em 21/12/2016

https://svds.com/learning-imbalanced-classes/

http://www.cs.utah.edu/~piyush/teaching/ImbalancedLearning.pdf

http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html

Para solucionar isso, a primeira solução é "criar" dados das classes com menos eventos de maneira aleatória. Outras soluções podem ser propostas posteriormente.

In [34]:
# Process data
# unbalanced data to balanced data with random data creation of small classes

# Same number of events in each class
qtd_events_biggest_class = 0
biggest_class_label = ''

# Count the events of each class once, with a vectorised np.sum; the builtin
# sum() would walk the boolean mask element by element in Python.
events_per_class = []
for iclass, class_label in enumerate(class_labels):
    n_class_events = np.sum(all_trgt==iclass)
    events_per_class.append(n_class_events)
    if n_class_events > qtd_events_biggest_class:
        qtd_events_biggest_class = n_class_events
        biggest_class_label = class_label
    print("Qtd event of %s is %i"%(class_label,n_class_events))
print("\nBiggest class is %s with %i events"%(biggest_class_label,qtd_events_biggest_class))


from Functions import DataHandler as dh
m_datahandler = dh.DataHandlerFunctions()

# Ask the data handler for the events missing in each class, then join the
# per-class pieces in one go instead of growing the arrays inside the loop.
balanced_data_chunks = []
balanced_trgt_chunks = []

for iclass, n_class_events in enumerate(events_per_class):
    class_events = all_data[all_trgt==iclass,:]
    balanced_data_chunks.append(m_datahandler.CreateEventsForClass(
        class_events,qtd_events_biggest_class-n_class_events))
    # NOTE(review): assumes CreateEventsForClass returns the original events
    # plus the created ones (qtd_events_biggest_class rows) - confirm against
    # Functions.DataHandler.
    balanced_trgt_chunks.append((iclass)*np.ones(qtd_events_biggest_class))

balanced_data = np.concatenate(balanced_data_chunks,axis=0)
balanced_trgt = np.concatenate(balanced_trgt_chunks,axis=0)

all_data = balanced_data
all_trgt = balanced_trgt

Qtd event of ClassA is 4312
Qtd event of ClassB is 9781
Qtd event of ClassC is 3833
Qtd event of ClassD is 7918

Biggest class is ClassB with 9781 events
DataHandler Class: CreateEventsForClass
Original Size: (4312, 400)
DataHandler Class: CreateEventsForClass
Original Size: (9781, 400)
DataHandler Class: CreateEventsForClass
Original Size: (3833, 400)
DataHandler Class: CreateEventsForClass
Original Size: (7918, 400)


# Definições do treinamento
Nessa célula temos os parâmetros do treinamento a ser realizado. No log, deve ficar armazenada a data do treinamento para a reconstrução dos resultados

In [35]:
%time

from Functions import LogFunctions as log

# Register this training in the log file; the value returned is used below
# to name the train information file.
m_log = log.LogInformation()
date = m_log.CreateLogEntry("NoveltyDetection",'PCDSingleClassSVM')

# Training parameters
n_folds = 2
n_pcds = 2
norm = 'mapstd'
nu_values = np.array([0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
gamma_value = 0.1

# Train information file content: the parameters plus the progress flags,
# all of which start as False.
train_info = {
    'n_folds': n_folds,
    'n_pcds': n_pcds,
    'norm': norm,
    'nu_values': nu_values,
    'gamma_value': gamma_value,
    'preprocessing_done': False,
    'train_done': False,
    'results_done': False,
}

# One stratified cross-validation split per novelty class, computed on the
# targets of the remaining (known) classes only.
for novelty_class, novelty_label in enumerate(class_labels):
    print('Dividing data in trn and tst for novelty class: %s'%(novelty_label))
    CVO = list(cross_validation.StratifiedKFold(all_trgt[all_trgt!=novelty_class], n_folds))
    train_info['CVO_novelty_%s'%(novelty_label)] = CVO

train_info_name = result_analysis_path+'/train_info_files'+'/'+date+'_train_info.jbl'
joblib.dump([train_info],train_info_name,compress=9)


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.06 µs
Dividing data in trn and tst for novelty class: ClassA
Dividing data in trn and tst for novelty class: ClassB
Dividing data in trn and tst for novelty class: ClassC
Dividing data in trn and tst for novelty class: ClassD


['/home/natmourajr/Workspace/Doutorado/SonarAnalysis/Results/NoveltyDetection/PCDSingleClassSVM/train_info_files/2016_12_25_20_58_23_train_info.jbl']

In [36]:
# Recover every entry recorded in the log for the NoveltyDetection package
# and show them.
from Functions import LogFunctions as log

mlog = log.LogInformation()
log_entries = mlog.RecoverLogEntries(package_name="NoveltyDetection")
print(log_entries)

{0: {'date': '2016_12_01_17_27_33', 'package': 'PCASingleClassSVM'}, 1: {'date': '2016_12_01_17_51_48', 'package': 'PCASingleClassSVM'}, 2: {'date': '2016_12_01_18_09_34', 'package': 'PCASingleClassSVM'}, 3: {'date': '2016_12_01_18_25_15', 'package': 'PCASingleClassSVM'}, 4: {'date': '2016_12_01_19_09_08', 'package': 'PCASingleClassSVM'}, 5: {'date': '2016_12_21_15_11_02', 'package': 'PCASingleClassSVM'}, 6: {'date': '2016_12_21_15_14_37', 'package': 'PCASingleClassSVM'}, 7: {'date': '2016_12_22_14_02_05', 'package': 'PCASingleClassSVM'}, 8: {'date': '2016_12_22_14_03_23', 'package': 'PCASingleClassSVM'}, 9: {'date': '2016_12_22_14_13_41', 'package': 'PCASingleClassSVM'}, 10: {'date': '2016_12_22_14_14_45', 'package': 'PCASingleClassSVM'}, 11: {'date': '2016_12_22_15_27_30', 'package': 'PCASingleClassSVM'}, 12: {'date': '2016_12_22_16_33_54', 'package': 'PCASingleClassSVM'}, 13: {'date': '2016_12_22_17_02_55', 'package': 'PCASingleClassSVM'}, 14: {'date': '2016_12_22_17_13_21', 'packag

In [37]:
# Read Information of Train Info File
choose_date = '2016_12_25_20_58_23'

# Walk the entries themselves: enumerate() over a dict only works while its
# keys happen to be exactly 0..N-1.
if isinstance(log_entries, dict):
    candidate_entries = log_entries.values()
else:
    candidate_entries = log_entries

for log_entry in candidate_entries:
    # only the PCDSingleClassSVM training performed at the chosen date
    if log_entry['package'] != 'PCDSingleClassSVM' or log_entry['date'] != choose_date:
        continue
    print('Analysing train performed in %s and for %s analysis'%(
        log_entry['date'],log_entry['package']))

    # Read train info file
    train_info_name = '%s/train_info_files/%s_train_info.jbl'%(
        result_analysis_path,log_entry['date'])

    [train_info] = joblib.load(train_info_name)
    # The header names the package of the entry being shown (it used to be
    # hard-coded to 'PCASingleClassSVM').
    print('%s Train Info File'%(log_entry['package']))
    print('Date: %s'%(choose_date))
    print('Number of Folds: %i'%(train_info['n_folds']))
    print('Number of Used PCDs: %i'%(train_info['n_pcds']))
    print('Normalization Method: %s'%(train_info['norm']))
    print('Gamma Value: %1.3f'%(train_info['gamma_value']))
    print('Nu Value(s): ')
    print(train_info['nu_values'])
    # bool() renders as 'True' / 'False'
    print('Preprocessing Done: %s'%(bool(train_info['preprocessing_done'])))
    print('Train Done: %s'%(bool(train_info['train_done'])))
    print('Extract Results: %s'%(bool(train_info['results_done'])))

Analysing train performed in 2016_12_25_20_58_23 and for PCDSingleClassSVM analysis
PCASingleClassSVM Train Info File
Date: 2016_12_25_20_58_23
Number of Folds: 2
Number of Used PCDs: 2
Normalization Method: mapstd
Gamma Value: 0.100
Nu Value(s): 
[ 0.001  0.1    0.2    0.3    0.4    0.5    0.6    0.7    0.8    0.9  ]
Preprocessing Done: False
Train Done: False
Extract Results: False


# Preprocessamento - PCD
Como a dimensionalidade dos dados é alta (400 dimensões), um pré-processamento se faz necessário para reduzir as dimensões das entradas e tornar o modelo menos complexo. Aqui, o pré-processamento utilizado é a PCD (análise de componentes principais de discriminação). Existem alguns tipos de extração de PCDs. Cada extração visa uma característica diferente (uma visa a extração de componentes necessariamente ortogonais entre si, por exemplo). 

## PCD por Deflação
definir

## PCD por Cooperativa
definir

In [38]:
from sklearn import cross_validation
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
import keras.callbacks as callbacks
from keras.utils import np_utils
from keras.layers import Merge


# Values used for every training parameter the caller does not supply.
_PCDC_DEFAULT_PARAMS = {
    'n_folds': 2,
    'n_inits': 2,
    'n_pcds': 2,
    'norm': 'none',
    'learning_rate': 0.01,
    'learning_decay': 1e-6,
    'momentum': 0.3,
    'nesterov': True,
    'train_verbose': False,
    'n_epochs': 300,
    'batch_size': 8,
    'CVO': None,
}

# Normalization names accepted in trn_params['norm'].
_PCDC_NORMS = ('mapstd', 'mapstd_rob', 'mapminmax', 'none')


def _pcdc_fit_scaler(norm, train_data):
    ''' Return the scikit-learn scaler selected by norm, fitted on train_data. '''
    if norm == 'mapstd':
        return preprocessing.StandardScaler().fit(train_data)
    if norm == 'mapstd_rob':
        return preprocessing.RobustScaler().fit(train_data)
    return preprocessing.MinMaxScaler().fit(train_data)


def _pcdc_first_model(n_inputs, n_outputs):
    ''' Build the network used for the first PCD: one trainable neuron. '''
    model = Sequential()

    # add a linear layer to isolate the input of NN model
    model.add(Dense(n_inputs, input_dim=n_inputs,
                    init='identity', trainable=False))
    model.add(Activation('linear'))

    # add a non-linear single neuron layer to compress all information
    model.add(Dense(1, input_dim=n_inputs, init='uniform'))
    model.add(Activation('tanh'))

    # add a non-linear output layer with max sparse target shape
    model.add(Dense(n_outputs, init='uniform'))
    model.add(Activation('tanh'))
    return model


def _pcdc_next_model(n_inputs, n_outputs, previous_pcds):
    ''' Build the network for a further PCD: frozen old neurons plus a new one. '''
    model = Sequential()

    # branch holding the previously extracted PCDs, not trainable
    freeze_layer = Sequential()
    freeze_layer.add(Dense(len(previous_pcds), input_dim=n_inputs, trainable=False))

    # Only the weight matrix is overwritten with the old PCDs (one column
    # each); the bias array of the layer keeps its initial value.
    weights = freeze_layer.layers[0].get_weights()
    for i_old_pcd, old_pcd in enumerate(previous_pcds):
        weights[0][:, i_old_pcd] = np.ravel(old_pcd)
    freeze_layer.layers[0].set_weights(weights)

    # branch holding the new, trainable neuron
    non_freeze_layer = Sequential()
    non_freeze_layer.add(Dense(1, input_dim=n_inputs))

    # merge everything
    model.add(Merge([freeze_layer, non_freeze_layer], mode='concat'))

    # add a non-linear output layer with max sparse target shape
    model.add(Dense(n_outputs, init='uniform'))
    model.add(Activation('tanh'))
    return model


def _pcdc_train(model, trn_input, trn_target, val_input, val_target, trn_params):
    ''' Compile model with SGD and fit it with early stopping; return fit()'s result. '''
    sgd = SGD(lr=trn_params['learning_rate'],
              decay=trn_params['learning_decay'],
              momentum=trn_params['momentum'],
              nesterov=trn_params['nesterov'])

    model.compile(loss='mean_squared_error',
                  optimizer=sgd,
                  metrics=['accuracy','mean_squared_error'])

    # early stopping to avoid overtraining
    earlyStopping = callbacks.EarlyStopping(
        monitor='val_loss', patience=25,
        verbose=0, mode='auto')

    return model.fit(trn_input, trn_target,
                     nb_epoch=trn_params['n_epochs'],
                     batch_size=trn_params['batch_size'],
                     callbacks=[earlyStopping],
                     verbose=trn_params['train_verbose'],
                     validation_data=(val_input, val_target),
                     shuffle=True)


def pcdc_extractor(inputdata, targetdata, trn_params=None):
    '''
        Extract the Cooperative Principal Components of Discrimination (PCD)
        of a dataset.

        For every cross-validation fold the PCDs are extracted one at a time.
        The first one comes from a network with a single trainable neuron;
        each following one comes from a network where the previously
        extracted PCDs are loaded into a non-trainable layer and one new
        neuron is trained next to them. Every network is trained
        trn_params['n_inits'] times and the training with the lowest
        validation loss is kept.

        Written against the Keras and scikit-learn APIs imported by this
        file (Merge layer, init= and nb_epoch= arguments,
        cross_validation.StratifiedKFold).

        Parameters:
            inputdata: dataset with inputs, indexed as inputdata[event_ids,:]

            targetdata: each class -> an integer

            trn_params: dict of train parameters (optional). Any missing key
            takes the value in _PCDC_DEFAULT_PARAMS; the dict passed in is
            not modified.

            trn_params['n_folds'] = number of cross validation folds
            trn_params['n_inits'] = number of initializations
            trn_params['n_pcds'] = number of PCDs to be extracted
            trn_params['norm'] = normalization: 'mapstd', 'mapstd_rob',
                                 'mapminmax' or 'none'
            trn_params['learning_rate'] = learning rate
            trn_params['learning_decay'] = learning rate decay
            trn_params['momentum'] = momentum
            trn_params['nesterov'] = nesterov momentum
            trn_params['train_verbose'] = train verbose
            trn_params['n_epochs'] = number of epochs
            trn_params['batch_size'] = batch size
            trn_params['CVO'] = list of (train_id, test_id) pairs, one per
                                fold; when None a StratifiedKFold split of
                                targetdata is created

        Returns:
            [pcds, classifiers, trn_desc], all indexed as [ifold][ipcd]:
            the extracted weights of the new neuron, the kept model and the
            object returned by its fit() call. trn_desc['CVO'] additionally
            holds the cross-validation split that was used.

        Raises:
            ValueError: if trn_params['norm'] is not a known normalization.
    '''

    # complete the caller's parameters with the defaults, on a private copy
    params = dict(_PCDC_DEFAULT_PARAMS)
    if trn_params is not None:
        params.update(trn_params)
    trn_params = params

    # Refuse an unknown normalization up front: otherwise no scaler would be
    # created for it (or the one of a previous fold would be reused).
    if trn_params['norm'] not in _PCDC_NORMS:
        raise ValueError('Unknown normalization %r, expected one of %s'%(
            trn_params['norm'], ', '.join(_PCDC_NORMS)))

    print('PCD Cooperative Extractor')
    print('trn_params:  %s'%(trn_params,))

    # trained classifiers
    classifiers = {}
    trn_desc = {}
    pcds = {}

    if trn_params['CVO'] is None:
        CVO = cross_validation.StratifiedKFold(targetdata, trn_params['n_folds'])
        CVO = list(CVO)
    else:
        CVO = trn_params['CVO']

    # from each class an integer -> target max sparse
    targetdata_sparse = np_utils.to_categorical(targetdata)
    n_outputs = targetdata_sparse.shape[1]

    for ifold in range(trn_params['n_folds']):
        train_id, test_id = CVO[ifold]

        # normalize data based in train set
        if trn_params['norm'] == 'none':
            norm_inputdata = inputdata
        else:
            scaler = _pcdc_fit_scaler(trn_params['norm'], inputdata[train_id,:])
            norm_inputdata = scaler.transform(inputdata)

        # the same slices serve every PCD and initialization of this fold
        n_inputs = norm_inputdata.shape[1]
        trn_input = norm_inputdata[train_id]
        val_input = norm_inputdata[test_id]
        trn_target = targetdata_sparse[train_id]
        val_target = targetdata_sparse[test_id]

        classifiers[ifold] = {}
        trn_desc[ifold] = {}
        pcds[ifold] = {}

        for ipcd in range(trn_params['n_pcds']):
            best_loss = np.inf
            for i_init in range(trn_params['n_inits']):
                if ipcd == 0:
                    # first pcd - random init
                    model = _pcdc_first_model(n_inputs, n_outputs)
                    fit_trn_input = trn_input
                    fit_val_input = val_input
                else:
                    # from second pcd to the end - freeze previous neurons
                    # and create a new neuron; the merged model takes the
                    # same data once per branch
                    previous_pcds = [pcds[ifold][i_old_pcd]
                                     for i_old_pcd in range(ipcd)]
                    model = _pcdc_next_model(n_inputs, n_outputs, previous_pcds)
                    fit_trn_input = [trn_input, trn_input]
                    fit_val_input = [val_input, val_input]

                init_trn_desc = _pcdc_train(model, fit_trn_input, trn_target,
                                            fit_val_input, val_target, trn_params)

                init_loss = np.min(init_trn_desc.history['val_loss'])
                if init_loss < best_loss:
                    best_loss = init_loss
                    classifiers[ifold][ipcd] = model
                    trn_desc[ifold][ipcd] = init_trn_desc
                    # NOTE(review): the weights are read after fit() has
                    # returned, so they are those of the last trained epoch,
                    # not necessarily of the epoch with the lowest val_loss.
                    if ipcd == 0:
                        pcds[ifold][ipcd] = model.layers[2].get_weights()[0]
                    else:
                        pcds[ifold][ipcd] = model.layers[0].layers[1].get_weights()[0]

                # the cost shown is the best one found so far for this PCD
                print('Fold: %i of %i - PCD: %i of %i - Init: %i of %i - finished with val cost: %1.3f'%
                      (ifold+1,trn_params['n_folds'],
                       ipcd+1,trn_params['n_pcds'],
                       i_init+1,trn_params['n_inits'],
                       best_loss
                      ))

    # add cross-validation information in train desc.
    trn_desc['CVO'] = CVO

    return [pcds,classifiers,trn_desc]

In [None]:
# PCD extraction
%time

pcds = {}

# Walk the entries themselves: enumerate() over a dict only works while its
# keys happen to be exactly 0..N-1.
if isinstance(log_entries, dict):
    candidate_entries = log_entries.values()
else:
    candidate_entries = log_entries

for log_entry in candidate_entries:
    # only the PCDSingleClassSVM training performed at the chosen date
    if log_entry['package'] != 'PCDSingleClassSVM' or log_entry['date'] != choose_date:
        continue
    print('PCD extraction performed in %s and for %s analysis'%(
        log_entry['date'],log_entry['package']))

    # Read train info file
    train_info_name = '%s/train_info_files/%s_train_info.jbl'%(
        result_analysis_path,log_entry['date'])

    [train_info] = joblib.load(train_info_name)

    # saving time
    #if train_info['preprocessing_done']:
    #    print 'Preprocessing done, just analyse it'
    #    continue

    for novelty_class, novelty_label in enumerate(class_labels):
        print('Extracting PCD for novelty %s'%(novelty_label))

        # Keep only the known classes. Boolean-mask indexing returns new
        # arrays, so the relabelling below does not touch all_trgt.
        known_mask = all_trgt!=novelty_class
        known_data = all_data[known_mask,:]
        known_trgt = all_trgt[known_mask]
        # shift the labels above the novelty class down by one so that the
        # known classes are numbered without a gap
        known_trgt[known_trgt>novelty_class] = known_trgt[known_trgt>novelty_class]-1

        # Extract PCD Cooperative
        trn_params = {
            'n_folds': train_info['n_folds'],
            'n_inits': 5,
            'n_pcds': train_info['n_pcds'],
            'norm': train_info['norm'],
            'learning_rate': 0.01,
            'learning_decay': 1e-4,
            'momentum': 0.9,
            'nesterov': True,
            'train_verbose': False,
            'n_epochs': 1000,
            'batch_size': 9,
            'CVO': train_info['CVO_novelty_%s'%(novelty_label)],
        }

        [pcd,pcd_classifiers,pcd_trn_desc] = pcdc_extractor(known_data,known_trgt,trn_params)
        pcds[novelty_class] = pcd

    if pcds:
        # saving file - named after the training that was processed, i.e.
        # the same date used for the train info file, not after the most
        # recently created log entry
        pcd_file_path = result_analysis_path+'/result_files'+'/'+log_entry['date']+'_pcd_file.jbl'
        joblib.dump([pcds],pcd_file_path,compress=9)

        # flag the preprocessing as done only once the PCDs were really
        # extracted and stored for this training
        train_info['preprocessing_done'] = True
        joblib.dump([train_info],train_info_name,compress=9)

if not pcds:
    print('No PCD extracted: no PCDSingleClassSVM log entry with date %s'%(choose_date))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.1 µs
PCD extraction performed in 2016_12_25_20_58_23 and for PCDSingleClassSVM analysis
Extracting PCD for novelty ClassA
PCD Cooperative Extractor
trn_params:  {'nesterov': True, 'learning_rate': 0.01, 'n_inits': 5, 'batch_size': 9, 'n_epochs': 1000, 'train_verbose': False, 'CVO': [(array([ 4891,  4892,  4893, ..., 29340, 29341, 29342]), array([    0,     1,     2, ..., 24450, 24451, 24452])), (array([    0,     1,     2, ..., 24450, 24451, 24452]), array([ 4891,  4892,  4893, ..., 29340, 29341, 29342]))], 'learning_decay': 0.0001, 'momentum': 0.9, 'n_folds': 2, 'norm': 'mapstd', 'n_pcds': 2}
Fold: 1 of 2 - PCD: 1 of 2 - Init: 1 of 5 - finished with val cost: 0.132
Fold: 1 of 2 - PCD: 1 of 2 - Init: 2 of 5 - finished with val cost: 0.131
