# Projeto de Classificação para a Marinha do Brasil

## Autor: Natanael Junior (natmourajr@gmail.com)

Laboratório de Processamento de Sinais - UFRJ

Laboratório de Tecnologia Sonar

Instituto de Pesquisas da Marinha - IPqM

# Bibliotecas e leitura dos dados
Importação das bibliotecas necessárias e leitura dos dados utilizados na análise.

In [1]:
import os
import pickle
import time
from multiprocessing import Pool

import numpy as np

from sklearn.decomposition import PCA
from sklearn.externals import joblib

# NOTE: init_time is sampled after the import statements of this cell, so the
# elapsed time printed below does not include the imports themselves.
init_time = time.time()

m_time = time.time()
print('Time to import all libraries: '+str(m_time-init_time)+' seconds')

analysis_name = 'SingleClassSVM'

# Both locations come from the environment. os.getenv returns None for an
# unset variable, which used to surface later as an opaque TypeError when the
# result paths were concatenated; fall back to the current directory and say so.
data_path = os.getenv('OUTPUTDATAPATH')
if data_path is None:
    print('Warning: OUTPUTDATAPATH is not set, using the current directory')
    data_path = '.'

results_path = os.getenv('PACKAGE_NAME')
if results_path is None:
    print('Warning: PACKAGE_NAME is not set, using the current directory')
    results_path = '.'

pict_results_path = results_path+'/'+analysis_name+'/picts'
files_results_path = results_path+'/'+analysis_name+'/output_files'

# Read data
# Start of the timer that is reported once the LOFAR data file has been read
m_time = time.time()

# Parameters that select which LOFAR data file is loaded
database = '4classes'
n_pts_fft = 1024
decimation_rate = 3
spectrum_bins_left = 400

# Development mode: keep only the first `development_events` events of each class
development_flag = True
development_events = 101

# Build the LOFAR file path once so the existence check and the load cannot drift apart
lofar_file = '%s/%s/lofar_data_file_fft_%i_decimation_%i_spectrum_left_%i.jbl'%(
    data_path,database,n_pts_fft,decimation_rate,spectrum_bins_left)

if not os.path.exists(lofar_file):
    print('No Files in %s/%s\n'%(data_path,database))
else:
    # Read lofar data
    # NOTE: joblib.load unpickles arbitrary objects - only load files you trust.
    [data,trgt,class_labels] = joblib.load(lofar_file)

    m_time = time.time()-m_time
    print('Time to read data file: '+str(m_time)+' seconds')

    # correct format
    all_data = data
    all_trgt = trgt

    # Process data
    # Count the events of each class (once per class) and remember the biggest one;
    # the strict '>' keeps the first class in case of a tie.
    qtd_events_biggest_class = 0
    biggest_class_label = ''

    for iclass, class_label in enumerate(class_labels):
        qtd_events_class = int(np.sum(all_trgt==iclass))
        if qtd_events_class > qtd_events_biggest_class:
            qtd_events_biggest_class = qtd_events_class
            biggest_class_label = class_label
        print("Qtd event of %s is %i"%(class_label,qtd_events_class))
    print("\nBiggest class is %s with %i events"%(biggest_class_label,qtd_events_biggest_class))

    from Functions import DataHandler as dh
    m_datahandler = dh.DataHandlerFunctions()

    # Same number of events in each class:
    #  - development mode keeps at most `development_events` events per class;
    #  - otherwise CreateEventsForClass is asked for the events missing to reach
    #    the biggest class (presumably it returns the original events plus the
    #    created ones - TODO confirm against DataHandler).
    data_per_class = []
    trgt_per_class = []

    for iclass, class_label in enumerate(class_labels):
        class_events = all_data[all_trgt==iclass,:]
        if development_flag:
            class_balanced = class_events[0:development_events,:]
        else:
            class_balanced = m_datahandler.CreateEventsForClass(
                class_events,qtd_events_biggest_class-(len(class_events)))
        data_per_class.append(class_balanced)
        # Size the targets from the rows actually kept: a class with fewer than
        # `development_events` events used to get more targets than data rows.
        trgt_per_class.append((iclass)*np.ones(class_balanced.shape[0]))

    balanced_data = np.concatenate(data_per_class,axis=0)
    balanced_trgt = np.concatenate(trgt_per_class,axis=0)

    all_data = balanced_data
    all_trgt = balanced_trgt

    # turn targets in sparse mode (one column per class)
    from keras.utils import np_utils
    trgt_sparse = np_utils.to_categorical(all_trgt.astype(int))

Time to import all libraries: 7.9870223999e-05 seconds
Time to read data file: 1.74252605438 seconds


Using TensorFlow backend.


Qtd event of 0 is 12939
Qtd event of 1 is 29352
Qtd event of 2 is 11510
Qtd event of 3 is 23760

Biggest class is 1 with 29352 events


In [4]:
from Functions import LogFunctions as log

# List the entries already registered in the log file for this package.
m_log = log.LogInformation()
# To register a new run instead of only listing the existing ones, call
# m_log.CreateLogEntry("NoveltyDetection",analysis_name)
log_entries = m_log.RecoverLogEntries(package_name="NoveltyDetection")
print(log_entries)

{0: {'date': '2017_07_07_19_58_38', 'package': 'PCASingleClassSVM'}, 1: {'date': '2017_07_11_19_20_57', 'package': 'SingleClassSVM'}, 2: {'date': '2017_07_18_15_51_46', 'package': 'SingleClassSVM'}}


In [2]:
from Functions import NoveltyDetectionAnalysis as novelty_detection
# Novelty-detection analysis object; the preprocessing, training and analysis
# paths all point at files_results_path.
obj = novelty_detection.SVMNoveltyDetection(
    name=analysis_name,
    preproc_path=files_results_path,
    train_path=files_results_path,
    anal_path=files_results_path,
)

# Training information tied to an existing log entry date, split in 4 folds
date = '2017_07_18_15_51_46'
obj.trn_info = novelty_detection.TrnInformation(date=date, n_folds=4)
obj.trn_info.SplitTrainSet(all_trgt)

# Other entry points of the analysis object, kept for reference:
#obj.trn_info.Print()
#[data_proc, trgt_proc] = obj.preprocess(all_data,trgt_sparse,novelty_class=0,trn_info=obj.trn_info,fold=0)
#svm_obj = obj.train(all_data,trgt_sparse, novelty_class=3, nu_value=0.1, trn_info=obj.trn_info, fold=0)
#obj.analysis_output_hist(all_data, trgt_sparse, trn_info=obj.trn_info, nu_value=0.01, fold=1)

# Sweep nu from 0.01 to 0.1 in steps of 0.01 using 4 cores.
# NOTE(review): the saved output of this cell ends with
# "ValueError: need more than 5 values to unpack" raised during this call -
# confirm what analysis_nu_sweep unpacks inside the library.
obj.analysis_nu_sweep(
    all_data,
    trgt_sparse,
    obj.trn_info,
    min_nu=0.01,
    max_nu=0.1,
    nu_step=0.01,
    num_cores=4,
)

SVMNoveltyDetection analysis nu sweep function


ValueError: need more than 5 values to unpack

In [45]:
# develop area

from Functions import NoveltyDetectionAnalysis as novelty_detection


def _fraction_with_output(output, event_mask, expected_output):
    """Return the fraction of the events selected by ``event_mask`` whose
    entry in ``output`` equals ``expected_output``.

    Returns ``np.nan`` when the mask selects no event, because the ratio is
    undefined in that case (a plain division would raise ZeroDivisionError).
    """
    qtd_events = np.count_nonzero(event_mask)
    if qtd_events == 0:
        return np.nan
    qtd_hits = np.count_nonzero(output[event_mask] == expected_output)
    return float(qtd_hits)/float(qtd_events)


def train_function_for_more_cores(fold):
    """Train and evaluate the classifiers of one fold for every novelty class
    and every nu value.

    The inputs are not passed as arguments: they are loaded from
    './buffer.jbl', which must hold ``[data, trgt, nu_values, trn_info,
    n_folds]``, and the module-level ``obj`` is used to train. ``trgt`` is
    indexed as one column per class.

    Parameters
    ----------
    fold : int
        Zero-based fold index forwarded to ``obj.train``.

    Returns
    -------
    list
        ``[eff_known_class, tri_known_class, eff_novelty]`` where

        * ``eff_known_class[novelty, known, nu]`` is the fraction of events of
          each non-novelty class for which that class' classifier outputs 1
          (the novelty class is skipped on the second axis);
        * ``tri_known_class[novelty, nu]`` is the fraction of events outside
          the novelty class for which the novelty classifier outputs 1;
        * ``eff_novelty[novelty, nu]`` is the fraction of events of the
          novelty class for which the novelty classifier outputs -1.

        An entry is ``np.nan`` when the corresponding class has no events.
    """
    # n_folds is stored in the buffer but is not needed by a single-fold worker
    [data, trgt, nu_values, trn_info, _unused_n_folds] = joblib.load('./buffer.jbl')
    qtd_classes = trgt.shape[1]
    qtd_nu = nu_values.shape[0]

    # to be easy to compare: class index of each event
    trgt_num = trgt.argmax(axis=1)

    # fig of merit
    eff_known_class = np.zeros([qtd_classes,qtd_classes-1,qtd_nu])
    tri_known_class = np.zeros([qtd_classes,qtd_nu])
    eff_novelty = np.zeros([qtd_classes,qtd_nu])

    for i_novelty_class in range(qtd_classes):
        for i_nu, nu_value in enumerate(nu_values):
            print('Fold: %i - Novelty: %i/%i - Nu: %i/%i'%(fold+1,
                                                           i_novelty_class+1,
                                                           qtd_classes,
                                                           i_nu+1,
                                                           qtd_nu))
            classifiers = obj.train(data, trgt,
                                    novelty_class=i_novelty_class,
                                    trn_info=trn_info,
                                    nu_value=nu_value,
                                    fold=fold)
            # NOTE(review): every classifier is evaluated on the whole of
            # `data`, not only on the events held out for this fold - confirm
            # that this is intended.
            for iclass in range(len(classifiers)):
                output = classifiers[iclass].predict(data)
                class_mask = trgt_num==iclass
                if iclass != i_novelty_class:
                    # known classes are packed on the second axis, skipping
                    # the slot of the novelty class
                    i_known = iclass-(iclass>i_novelty_class)
                    eff_known_class[i_novelty_class,i_known,i_nu] = _fraction_with_output(
                        output,class_mask,1)
                else:
                    # novelty detection
                    eff_novelty[i_novelty_class,i_nu] = _fraction_with_output(
                        output,class_mask,-1)

                    # trigger
                    tri_known_class[i_novelty_class,i_nu] = _fraction_with_output(
                        output,~class_mask,1)

    return [eff_known_class,tri_known_class,eff_novelty]

# Sweep configuration
n_folds = 4
min_nu=0.1
max_nu=0.9
nu_step=0.1
num_cores=4  # worker processes of the pool below (one per fold)

date = '2017_07_11_19_20_57'

obj = novelty_detection.SVMNoveltyDetection(name=analysis_name,
                                            preproc_path=files_results_path,
                                            train_path=files_results_path,
                                            anal_path=files_results_path)

obj.trn_info = novelty_detection.TrnInformation(date=date, n_folds=n_folds)
obj.trn_info.SplitTrainSet(all_trgt)

data = all_data
trgt = trgt_sparse

# Check whether the analysis was already done: the result file name encodes
# the date and the nu range, with '.' replaced by '_'
min_nu_str = ("%.5f"%(min_nu)).replace('.','_')
max_nu_str = ("%.5f"%(max_nu)).replace('.','_')
step_nu_str = ("%.5f"%(nu_step)).replace('.','_')
file_name = '%s/%s_%s_analysis_nu_sweep_min_%s_max_%s_step_%s.jbl'%(files_results_path,
                                                                    date,
                                                                    'develop',
                                                                    min_nu_str,
                                                                    max_nu_str,
                                                                    step_nu_str)

init_time = time.time()
if not os.path.exists(file_name):
    # half a step is added to the upper limit so that max_nu itself is included
    nu_values = np.arange(min_nu,max_nu+nu_step/2.0,nu_step)
    qtd_folds = n_folds
    qtd_classes = trgt.shape[1]
    qtd_nu = nu_values.shape[0]

    # to be easy to compare
    trgt_num = trgt.argmax(axis=1)

    # Figures of Merit
    # qtd_classes -1 = all known classes
    # qtd_classes = possible novelty classes
    eff_known_class = np.zeros([qtd_folds,qtd_classes,qtd_classes-1,qtd_nu])
    tri_known_class = np.zeros([qtd_folds,qtd_classes,qtd_nu])
    eff_novelty = np.zeros([qtd_folds,qtd_classes,qtd_nu])

    # Temporary file holding the parameters read by every worker;
    # train_function_for_more_cores loads this exact path.
    buffer_file = './buffer.jbl'
    joblib.dump([data,trgt,nu_values,obj.trn_info,n_folds],buffer_file,compress=9)
    try:
        # One task per fold. The workers also use the module-level `obj`.
        p = Pool(num_cores)
        try:
            out =  p.map(train_function_for_more_cores,range(n_folds))
        finally:
            # map() has already returned (or raised), so the workers can be
            # stopped instead of being leaked in the kernel
            p.terminate()
            p.join()
    finally:
        # remove the temporary file even when the training fails
        os.remove(buffer_file)

    for i in range(n_folds):
        [eff_known_class[i], tri_known_class[i], eff_novelty[i]] = out[i]

    joblib.dump([nu_values,eff_known_class,eff_novelty,tri_known_class],file_name,compress=9)
else:
    [nu_values,eff_known_class,eff_novelty,tri_known_class] = joblib.load(file_name)
m_time = time.time()
print('Time to run all train: '+str(m_time-init_time)+' seconds')

Fold: 1 - Novelty: 1/4 - Nu: 1/9
Fold: 2 - Novelty: 1/4 - Nu: 1/9
Fold: 3 - Novelty: 1/4 - Nu: 1/9
SVMNoveltyDetection train function
SVMNoveltyDetection train function
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 4 - Novelty: 1/4 - Nu: 1/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
SVMNoveltyDetection preprocess function
SVMNoveltyDetection preprocess function
Fold: 2 - Novelty: 1/4 - Nu: 2/9
SVMNoveltyDetection train function
Fold: 1 - Novelty: 1/4 - Nu: 2/9
SVMNoveltyDetection preprocess function
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 3 - Novelty: 1/4 - Nu: 2/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 4 - Novelty: 1/4 - Nu: 2/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 2 - Novelty: 1/4 - Nu: 3/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 1 - Novelty: 1/4 - Nu:

Fold: 1 - Novelty: 3/4 - Nu: 2/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 3 - Novelty: 3/4 - Nu: 1/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 4 - Novelty: 3/4 - Nu: 2/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 2 - Novelty: 3/4 - Nu: 3/9
SVMNoveltyDetection train function
Fold: 1 - Novelty: 3/4 - Nu: 3/9
SVMNoveltyDetection preprocess function
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 3 - Novelty: 3/4 - Nu: 2/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 4 - Novelty: 3/4 - Nu: 3/9
SVMNoveltyDetection train function
SVMNoveltyDetection preprocess function
Fold: 3 - Novelty: 3/4 - Nu: 3/9
Fold: 2 - Novelty: 3/4 - Nu: 4/9
SVMNoveltyDetection train function
SVMNoveltyDetection train function
Fold: 1 - Novelty: 3/4 - Nu: 4/9
SVMNoveltyDetection preprocess function
SVMNoveltyDetection train function


In [3]:
# Fold indices available in the current training information
list(range(obj.trn_info.n_folds))

[0, 1, 2, 3]

In [6]:
from multiprocessing import Pool

class Person(object):
    """Toy object used by this cell's ``Pool.map`` experiment."""

    def __init__(self):
        self.name = 'Weizhong Tu'

    def calc(self, x):
        """Print ``x`` and return ``x`` raised to the fifth power."""
        # Parenthesised form prints the same on Python 2 and parses on Python 3
        print(x)
        return x ** 5


def func(x, p=None):
    """Return ``p.calc(x)``.

    When ``p`` is omitted a new ``Person`` is created for the call, instead of
    sharing one instance built when the function was defined.
    """
    if p is None:
        p = Person()
    return p.calc(x)


# Map func over 0..9 with 4 worker processes and print the collected results
pool = Pool(4)
try:
    print(pool.map(func, range(10)))
finally:
    # release the worker processes instead of leaving them alive in the kernel
    pool.close()
    pool.join()

2
1
0
3
5
4
6
7
8
9
[0, 1, 32, 243, 1024, 3125, 7776, 16807, 32768, 59049]


In [50]:
# Copy the figures of merit returned for each fold into the per-fold arrays
for ifold in range(n_folds):
    fold_known, fold_trigger, fold_novelty = out[ifold]
    eff_known_class[ifold] = fold_known
    tri_known_class[ifold] = fold_trigger
    eff_novelty[ifold] = fold_novelty