# Projeto de Classificação para Marinha do Brasil

## Autor: Natanael Junior (natmourajr@gmail.com)

Laboratório de Processamento de Sinais - UFRJ

Laboratório de Tecnologia Sonar

Instituto de Pesquisas da Marinha - IPqM

# Bibliotecas e leitura dos dados
Importação das bibliotecas necessárias e leitura dos dados de LOFAR.

In [None]:
import os
import pickle
import numpy as np
import time

# Start of the timed window; only the joblib import below falls inside it.
init_time = time.time()

from sklearn.externals import joblib

m_time = time.time()
print('Time to import all libraries: %1.5f seconds' % (m_time - init_time))

# Workspace locations come from environment variables; a missing variable
# raises KeyError right here.
outputpath = os.environ['OUTPUTDATAPATH']
main_analysis_path = os.environ['SONAR_WORKSPACE']
log_analysis_path = os.environ['PACKAGE_OUTPUT']
# Results of this analysis are kept in a sub-folder of the package output.
result_analysis_path = log_analysis_path + '/NeuralNetwork'
# Read data
# Check that the LOFAR data file has already been created...
m_time = time.time()


subfolder = '4classes_old'
n_pts_fft = 1024
decimation_rate = 3

# Build the file name once so the existence check, the error message and the
# load call always refer to the same path.
lofar_data_path = outputpath+'/'+'LofarData_%s_%i_fft_pts_%i_decimation_rate.jbl'%(
    subfolder,n_pts_fft,decimation_rate)

if not os.path.exists(lofar_data_path):
    # Raise instead of calling exit(): in the notebook exit() did not stop the
    # cell (the recorded output shows this message followed by an IOError from
    # joblib.load), and in a plain script it would end with a success status.
    raise IOError(lofar_data_path+' doesnt exist...please create it')

# Read lofar data: the file holds a two-element list [data, class_labels]
[data,class_labels] = joblib.load(lofar_data_path)

# m_time now holds the elapsed time instead of the start timestamp
m_time = time.time()-m_time
print('Time to read data file: %1.5f seconds'%m_time)

Time to import all libraries: 0.10603 seconds
/home/vinicius.mello/Workspace/SonarAnalysis/Results/LofarData_4classes_old_1024_fft_pts_3_decimation_rate.jbl doesnt exist...please create it


IOError: [Errno 2] No such file or directory: '/home/vinicius.mello/Workspace/SonarAnalysis/Results/LofarData_4classes_old_1024_fft_pts_3_decimation_rate.jbl'

# Processamento dos dados
Os dados encontram-se no formato do Matlab; por isso, precisam ser convertidos para o formato do Python.

In [None]:
# Process data...
# Create a full data matrix and the matching target vector.
#
# Each run contributes its 'Signal' array; runs are joined along axis 1 and
# every column gets the index of its class as target. The pieces are gathered
# in lists and joined once at the end, so the accumulated array is not
# re-copied for every run (which is what repeated np.append does).
signal_chunks = []
trgt_chunks = []

for iclass, class_label in enumerate(class_labels):
    # Index-based access is kept on purpose: nothing here guarantees that
    # data[iclass] is a list rather than a mapping keyed by run number.
    for irun in range(len(data[iclass])):
        run_signal = data[iclass][irun]['Signal']
        signal_chunks.append(run_signal)
        trgt_chunks.append(iclass*np.ones(run_signal.shape[1]))

all_trgt = np.concatenate(trgt_chunks,axis=0)

# After the transpose each row is one event (rows are selected by target
# value in the balancing step).
all_data = np.concatenate(signal_chunks,axis=1).transpose()

# Balanceamento de Classes
Os dados encontram-se desbalanceados. Com isso, os classificadores podem se especializar em uma classe (gerando mais vetores de suporte, SVs, para ela) e não se especializar nas outras.

Acessados em 21/12/2016

https://svds.com/learning-imbalanced-classes/

http://www.cs.utah.edu/~piyush/teaching/ImbalancedLearning.pdf

http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html

Para solucionar isso, a primeira solução é "criar" dados das classes com menos eventos de maneira aleatória. Outras soluções podem ser propostas posteriormente.

In [None]:
# Process data
# unbalanced data to balanced data with random data creation of small classes
#
# Every class ends up with as many targets as the biggest class has events.
# The events of each class are counted once with np.sum (the builtin sum walks
# the boolean mask element by element in Python) and the per-class pieces are
# joined with a single concatenate.

# Same number of events in each class
qtd_events_biggest_class = 0
biggest_class_label = ''
events_per_class = []

for iclass, class_label in enumerate(class_labels):
    qtd_events = np.sum(all_trgt==iclass)
    events_per_class.append(qtd_events)
    # Strict comparison: on a tie the first class found stays the biggest
    if qtd_events > qtd_events_biggest_class:
        qtd_events_biggest_class = qtd_events
        biggest_class_label = class_label
    print("Qtd event of %s is %i"%(class_label,qtd_events))
print("\nBiggest class is %s with %i events"%(biggest_class_label,qtd_events_biggest_class))


from Functions import DataHandler as dh
m_datahandler = dh.DataHandlerFunctions()

balanced_data_chunks = []
balanced_trgt_chunks = []

for iclass, class_label in enumerate(class_labels):
    class_events = all_data[all_trgt==iclass,:]
    # Second argument: how many events this class lacks compared with the
    # biggest one.
    # NOTE(review): presumably the result holds the original events plus the
    # created ones (qtd_events_biggest_class rows, matching the target vector
    # below) - confirm in DataHandler.CreateEventsForClass.
    balanced_data_chunks.append(m_datahandler.CreateEventsForClass(
        class_events,qtd_events_biggest_class-events_per_class[iclass]))
    balanced_trgt_chunks.append(iclass*np.ones(qtd_events_biggest_class))

balanced_data = np.concatenate(balanced_data_chunks,axis=0)
balanced_trgt = np.concatenate(balanced_trgt_chunks,axis=0)

# From here on the balanced set replaces the original one
all_data = balanced_data
all_trgt = balanced_trgt

In [None]:
%time

from sklearn import cross_validation
from Functions import LogFunctions as log

# Create a entry in log file
m_log = log.LogInformation()
date = m_log.CreateLogEntry("Classification",'EspClassPCDNeuralNetwork')

# Create a train information file
n_folds = 2
n_pcds = 2
norm = 'mapstd'

train_info = {}
train_info['n_folds'] = n_folds
train_info['n_pcds'] = n_pcds
train_info['norm'] = norm

# divide data in train and test for novelty detection
print 'Dividing data in trn and tst'
CVO = cross_validation.StratifiedKFold(all_trgt, n_folds)
CVO = list(CVO)
train_info['CVO'] = CVO

train_info['preprocessing_extraction_done'] = False
train_info['preprocessing_analysis_done'] = False
train_info['train_done'] = False
train_info['results_done'] = False

train_info_name = result_analysis_path+'/train_info_files'+'/'+date+'_train_info.jbl'
joblib.dump([train_info],train_info_name,compress=9)

In [None]:
# Recover the entries already written to the log for the "Classification"
# package; the following cells pick one of them by date.
from Functions import LogFunctions as log

mlog = log.LogInformation()
log_entries = mlog.RecoverLogEntries(package_name="Classification")
print(log_entries)

In [None]:
# Read Information of Train Info File
# Date of the log entry whose train information should be shown
choose_date = '2017_05_09_17_27_30'

# (train_info key, line printed when the flag is set, line printed otherwise)
stage_reports = (
    ('preprocessing_extraction_done',
     'Preprocessing Extraction Done: True',
     'Preprocessing Extraction Done: False'),
    ('preprocessing_analysis_done',
     'Preprocessing Analysis Done: True',
     'Preprocessing Analysis Done: False'),
    ('train_done',
     'Train Done: True',
     'Train Done: False'),
    ('results_done',
     'Extract Results: True',
     'Extract Results: False'),
)

for log_id, log_entry in enumerate(log_entries):
    entry = log_entries[log_id]
    # Only the entry of this analysis with the chosen date is reported
    if entry['package'] != 'EspClassPCDNeuralNetwork' or entry['date'] != choose_date:
        continue
    print('Analysing train performed in %s and for %s analysis'%(
        entry['date'],entry['package']))

    # Read train info file (stored as a one-element list)
    train_info_name = '%s/train_info_files/%s_train_info.jbl'%(
        result_analysis_path,entry['date'])

    [train_info] = joblib.load(train_info_name)
    print('NeuralNetwork Train Info File')
    print('Date: %s'%(choose_date))
    print('Number of Folds: %i'%(train_info['n_folds']))
    print('Number of Used PCDs: %i'%(train_info['n_pcds']))
    print('Normalization Method: %s'%(train_info['norm']))
    for flag_key, done_line, pending_line in stage_reports:
        print(done_line if train_info[flag_key] else pending_line)

In [None]:
# PCD extraction
%time

from Functions import PreProcessing as preproc
from sklearn import preprocessing
from keras.utils import np_utils




for log_id, log_entry in enumerate(log_entries):
    if log_entries[log_id]['package'] != 'EspClassPCDNeuralNetwork':
        continue
    if log_entries[log_id]['date'] != choose_date:
        continue
    print 'PCD extraction performed in %s and for %s analysis'%(
        log_entries[log_id]['date'],log_entries[log_id]['package'])
    
    # Read train info file
    train_info_name = '%s/train_info_files/%s_train_info.jbl'%(
        result_analysis_path,log_entries[log_id]['date'])
    
    [train_info] = joblib.load(train_info_name)
    
    # saving time
    if train_info['preprocessing_extraction_done']:
        print 'Preprocessing Extraction done, just analyse it'
        continue
    
    trn_params = preproc.TrnParams(learning_rate=0.005, 
                                   verbose=True,
                                   train_verbose=False)
    
    trgt_sparse = np_utils.to_categorical(all_trgt)
    
    pcds = {}
    pcd_objs = {}
    
    for ifold in range(len(train_info['CVO'])):
        print 'PCD extraction process: fold %i of %i'%(ifold+1,len(train_info['CVO']))
        # split data in trn set, tst set
        train_id, test_id = train_info['CVO'][ifold]

        # normalize data based in train set
        if train_info['norm'] == 'mapstd':
            scaler = preprocessing.StandardScaler().fit(all_data[train_id,:])
        elif train_info['norm'] == 'mapstd_rob':
            scaler = preprocessing.RobustScaler().fit(all_data[train_id,:])
        elif train_info['norm'] == 'mapminmax':
            scaler = preprocessing.MinMaxScaler().fit(all_data[train_id,:])
                
        norm_data = scaler.transform(all_data)
            
        pcd = preproc.PCDIndependent(n_components=train_info['n_pcds'])
            
        pcd.fit(norm_data, trgt_sparse, 
                train_id, test_id, trn_params=trn_params)

        pcd_objs[ifold] = pcd
        pcds[ifold] = pcd.pcds

    print 'Extraction done'
    # saving file
    pcd_file_path = result_analysis_path+'/result_files'+'/'+choose_date+'_pcd_file.jbl'

    if pcds != {}:
        joblib.dump([pcds],pcd_file_path,compress=9)

    train_info_name = result_analysis_path+'/train_info_files'+'/'+choose_date+'_train_info.jbl'

    train_info['preprocessing_extraction_done'] = True
    joblib.dump([train_info],train_info_name,compress=9)

In [None]:
# Display the PCDs gathered by the extraction cell (dictionary keyed by fold index)
pcds