# ShipsEar Read Data    
Este notebook faz a carga dos dados do ShipsEar.  
Aqui, a ideia é gerar um CSV com as informações dos arquivos de áudio, para que eles possam ser lidos sequencialmente.


## Carga de bibliotecas

In [1]:
import math
import os
import re
import warnings

import numpy as np
import pandas as pd
import soundfile as sf
from librosa import stft, fft_frequencies, frames_to_time, feature
from scipy import signal
from scipy.signal import decimate, convolve, spectrogram, lfilter, cheby2, butter, cheb2ord, hilbert

## Ler arquivo de configuração de treinamento

In [2]:
# Row of the configuration table that the rest of this notebook reads from.
train_id = 0

# Load the configuration table; the cells below take their input and
# output paths from its columns.
config_file_path = '../data/config.csv'
df_config = pd.read_csv(config_file_path)

In [3]:
# Show the loaded configuration table (row `train_id` is the one used below).
df_config

Unnamed: 0,hash_id,label,wav_files_path,processed_file_folder,processed_file_path,wav_files_info,cv_alg,cv_folds,cv_path,preproc_alg,...,pipeline_path,scaler_alg,train_data_path,train_trgt_path,target_label_file,target_id_file,model_path,model_inits,model_neurons,model_status
0,3359727688101666398,ShipsEar NN Classification,../data/shipsEar_AUDIOS,../data,../data/3359727688101666398_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,Lofar,...,../data/pipelines,StandardScaler,../data/3359727688101666398_train_data.csv,../data/3359727688101666398_trgt_data.csv,../data/models/train_label_file.csv,../data/models/train_id_file.csv,../data/models,1,../data/models/3359727688101666398_hidden_neur...,../data/models/3359727688101666398_model_statu...


## Definição de Funções auxiliares

In [4]:
def extract_slicing_windows(array, n_samples_per_window, perc_overlap, window):
    """Cut a 1-D signal into fixed-length frames and apply a window to each.

    Parameters
    ----------
    array : np.ndarray
        Signal to slice along its first axis.
    n_samples_per_window : int
        Number of samples in every frame.
    perc_overlap : float
        Despite its name, this is the hop between consecutive frames expressed
        as a fraction of the frame length:
        ``hop = floor(perc_overlap * n_samples_per_window)``.
        1.0 yields back-to-back frames, 0.5 yields frames overlapping by half.
    window : str, float or tuple
        Window specification forwarded to ``scipy.signal.get_window``.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_frames, n_samples_per_window)`` holding the
        windowed frames. ``n_frames`` is 0 when the signal is shorter than
        one frame.

    Raises
    ------
    ValueError
        If the resulting hop is smaller than one sample.
    """
    hop = int(np.floor(perc_overlap * n_samples_per_window))
    if hop < 1:
        raise ValueError(
            'perc_overlap * n_samples_per_window must be at least 1 sample, got hop=%d' % hop
        )

    window_samples = signal.get_window(window, n_samples_per_window)

    # "+ 1" so the last frame that still fits entirely in the signal is kept
    # (a signal of exactly n_samples_per_window samples yields one frame).
    last_start = array.shape[0] - n_samples_per_window + 1
    examples = [
        array[begin:begin + n_samples_per_window] * window_samples
        for begin in range(0, last_start, hop)
    ]

    if not examples:
        # Signal shorter than one frame: return an empty, correctly shaped result.
        return np.empty((0, n_samples_per_window))

    return np.vstack(examples)

## Criar um arquivo sequencial de leitura

In [5]:
# Build the sequential file index: one row per .wav file in the dataset folder.
folder = df_config['wav_files_path'][train_id]
# os.listdir() returns entries in arbitrary order; sort them so the printed log
# and the pre-sort row positions are reproducible between runs.
files_path = [os.path.join(folder, file_name) for file_name in sorted(os.listdir(folder))]
all_files = [arq for arq in files_path if os.path.isfile(arq)]
wav_files = [arq for arq in all_files if arq.lower().endswith(".wav")]

if not wav_files:
    raise FileNotFoundError('No .wav files found in ' + str(folder))

# ShipsEar file names come in two layouts (matched after stripping the extension):
#   dated:   ID__dd_mm_yy_name   e.g. 10__10_07_13_marDeOnza_Sale
#   undated: ID__X__name         e.g. 93__A__Draga_1   (X is one capital letter)
# where ID follows the 'id' field of the online database, dd/mm/yy is the
# recording date when available, and name is an internal identifier.
_DATED_NAME = re.compile(r'^(\d+)__(\d+)_(\d+)_(\d+)_(.*)$')
_UNDATED_NAME = re.compile(r'^(\d+)__([A-Z])__(.*)$')


def _parse_shipsear_file_name(file_name):
    """Split a ShipsEar file name into its metadata fields.

    Parameters
    ----------
    file_name : str
        Base name of the recording, extension included.

    Returns
    -------
    dict
        Keys ``ID``, ``day``, ``month``, ``year``, ``ship`` and ``obs``.
        The date fields are NaN for undated names and ``obs`` is NaN for
        dated names. ``ship`` is lower-cased.

    Raises
    ------
    ValueError
        If the name follows neither of the two known layouts.
    """
    # splitext removes the extension regardless of its letter case.
    stem = os.path.splitext(file_name)[0]

    dated = _DATED_NAME.match(stem)
    if dated is not None:
        rec_id, day, month, year, ship = dated.groups()
        return {'ID': int(rec_id), 'day': int(day), 'month': int(month),
                'year': int(year), 'ship': ship.lower(), 'obs': np.nan}

    undated = _UNDATED_NAME.match(stem)
    if undated is not None:
        rec_id, obs, ship = undated.groups()
        return {'ID': int(rec_id), 'day': np.nan, 'month': np.nan,
                'year': np.nan, 'ship': ship.lower(), 'obs': obs}

    raise ValueError('Unrecognised ShipsEar file name: ' + file_name)


# Collect one record per file and build the table once at the end, instead of
# re-copying a growing DataFrame on every iteration.
records = []
for wav_file in wav_files:
    # basename works for any folder depth and path separator.
    file_name = os.path.basename(wav_file)
    info = _parse_shipsear_file_name(file_name)

    print('Processing - Filename:',file_name,'ID:',info['ID'], 
          'day:', info['day'], 'month:', info['month'], 
          'year:',info['year'], 'ship_name:', info['ship'], 
          'obs:', info['obs'])

    info['file_name'] = file_name
    info['file_path'] = wav_file
    records.append(info)

df_file_info = pd.DataFrame(
    records,
    columns=['ID', 'day', 'month', 'year', 'ship', 'obs', 'file_name', 'file_path'],
)
# Order by recording ID; the index labels keep the pre-sort row positions.
df_file_info = df_file_info.sort_values(by='ID')
df_file_info.to_csv(df_config['wav_files_info'][train_id],index=False)

Processing - Filename: 10__10_07_13_marDeOnza_Sale.wav ID: 10 day: 10 month: 7 year: 13 ship_name: mardeonza_sale obs: nan
Processing - Filename: 11__10_07_13_minhoUno_Entra.wav ID: 11 day: 10 month: 7 year: 13 ship_name: minhouno_entra obs: nan
Processing - Filename: 12__10_07_13_minhoUno_Sale.wav ID: 12 day: 10 month: 7 year: 13 ship_name: minhouno_sale obs: nan
Processing - Filename: 13__10_07_13_piraCies_Entra.wav ID: 13 day: 10 month: 7 year: 13 ship_name: piracies_entra obs: nan
Processing - Filename: 14__10_07_13_piraCies_Espera.wav ID: 14 day: 10 month: 7 year: 13 ship_name: piracies_espera obs: nan
Processing - Filename: 15__10_07_13_radaUno_Pasa.wav ID: 15 day: 10 month: 7 year: 13 ship_name: radauno_pasa obs: nan
Processing - Filename: 16__10_07_13_mscOpera_InicioSalida.wav ID: 16 day: 10 month: 7 year: 13 ship_name: mscopera_iniciosalida obs: nan
Processing - Filename: 17__10_07_13_visionSub_Entra.wav ID: 17 day: 10 month: 7 year: 13 ship_name: visionsub_entra obs: nan
Proc

In [6]:
# Preview the file index (sorted by ID; index labels are the pre-sort row positions).
df_file_info.head()

Unnamed: 0,ID,day,month,year,ship,obs,file_name,file_path
59,6,10.0,7.0,13.0,mardecangas_entra,,6__10_07_13_marDeCangas_Entra.wav,../data/shipsEar_AUDIOS/6__10_07_13_marDeCanga...
70,7,10.0,7.0,13.0,mardecangas_espera,,7__10_07_13_marDeCangas_Espera.wav,../data/shipsEar_AUDIOS/7__10_07_13_marDeCanga...
81,8,10.0,7.0,13.0,mardeonza_entra,,8__10_07_13_marDeOnza_Entra.wav,../data/shipsEar_AUDIOS/8__10_07_13_marDeOnza_...
89,9,10.0,7.0,13.0,mardeonza_espera,,9__10_07_13_marDeOnza_Espera.wav,../data/shipsEar_AUDIOS/9__10_07_13_marDeOnza_...
0,10,10.0,7.0,13.0,mardeonza_sale,,10__10_07_13_marDeOnza_Sale.wav,../data/shipsEar_AUDIOS/10__10_07_13_marDeOnza...


In [7]:
# Reload the file index from the CSV written above, ordered by recording ID.
df_file_info = pd.read_csv(df_config['wav_files_info'][train_id]).sort_values(by='ID')

In [8]:
# Preview the file index after the CSV round trip.
df_file_info.head()

Unnamed: 0,ID,day,month,year,ship,obs,file_name,file_path
0,6,10.0,7.0,13.0,mardecangas_entra,,6__10_07_13_marDeCangas_Entra.wav,../data/shipsEar_AUDIOS/6__10_07_13_marDeCanga...
1,7,10.0,7.0,13.0,mardecangas_espera,,7__10_07_13_marDeCangas_Espera.wav,../data/shipsEar_AUDIOS/7__10_07_13_marDeCanga...
2,8,10.0,7.0,13.0,mardeonza_entra,,8__10_07_13_marDeOnza_Entra.wav,../data/shipsEar_AUDIOS/8__10_07_13_marDeOnza_...
3,9,10.0,7.0,13.0,mardeonza_espera,,9__10_07_13_marDeOnza_Espera.wav,../data/shipsEar_AUDIOS/9__10_07_13_marDeOnza_...
4,10,10.0,7.0,13.0,mardeonza_sale,,10__10_07_13_marDeOnza_Sale.wav,../data/shipsEar_AUDIOS/10__10_07_13_marDeOnza...


## Pré-Processamento dos Dados   
Aqui, eu vou fazer a extração dos MFCC com base em um processo de janelamento.   
Esse processo deve ser feito para evitar a perda de estacionariedade dos sinais de sonar passivo.  
Estou usando como base uma janela boxcar (retangular) de 1024 pontos, sem sobreposição entre janelas (passo de 1024 amostras), e extraindo 20 coeficientes por janela.

In [9]:
# MFCC framing parameters.
N_MFCC = 20          # coefficients kept per frame
FRAME_LENGTH = 1024  # samples per frame; also used as hop, so frames do not overlap

# Gather one MFCC table per recording and concatenate once at the end, instead
# of re-copying a growing DataFrame on every iteration.
mfcc_frames = []
for row in df_file_info.itertuples(index=False):
    print('Processing: ',row.file_name)
    # NOTE(review): sf.read returns (frames, channels) for multichannel files,
    # which is not the layout librosa expects - this presumably relies on the
    # recordings being mono; confirm.
    m_signal, fs = sf.read(row.file_path)
    mfcc_coeff = feature.mfcc(y=m_signal, sr=fs, n_mfcc=N_MFCC, n_fft=FRAME_LENGTH,
                              window='boxcar', win_length=FRAME_LENGTH, 
                              hop_length=FRAME_LENGTH, dct_type=2, norm='ortho')
    # Transpose so that each row is one frame and each column one coefficient.
    df_buffer = pd.DataFrame(mfcc_coeff.T).add_prefix('feature_')
    # Tag every frame with its recording ID; it is the merge key for the targets.
    df_buffer['ID'] = row.ID
    mfcc_frames.append(df_buffer)

df_mfcc_data = pd.concat(mfcc_frames, axis=0, ignore_index=True)
df_mfcc_data.to_csv(df_config['processed_file_path'][train_id],index=False)

Processing:  6__10_07_13_marDeCangas_Entra.wav
Processing:  7__10_07_13_marDeCangas_Espera.wav
Processing:  8__10_07_13_marDeOnza_Entra.wav
Processing:  9__10_07_13_marDeOnza_Espera.wav
Processing:  10__10_07_13_marDeOnza_Sale.wav
Processing:  11__10_07_13_minhoUno_Entra.wav
Processing:  12__10_07_13_minhoUno_Sale.wav
Processing:  13__10_07_13_piraCies_Entra.wav
Processing:  14__10_07_13_piraCies_Espera.wav
Processing:  15__10_07_13_radaUno_Pasa.wav
Processing:  16__10_07_13_mscOpera_InicioSalida.wav
Processing:  17__10_07_13_visionSub_Entra.wav
Processing:  18__18_07_13_AutoPrideEntra.wav
Processing:  19__18_07_13_AutoprideMarchaAtras.wav
Processing:  20__18_07_13_AutopridePrepManiobra.wav
Processing:  21__18_07_13_lanchaMotora.wav
Processing:  22__19_07_13_adventure_maniobra.wav
Processing:  23__19_07_13_adventure_parado.wav
Processing:  24__19_07_13_adventureFrenando_duda.wav
Processing:  25__19_07_13_adventureOfTheSea_llegando.wav
Processing:  26__19_07_13_Lancha.wav
Processing:  2

In [10]:
# (rows, columns) of the MFCC table just built: one row per frame.
df_mfcc_data.shape

(579671, 21)

In [11]:
# Reload the MFCC table from the CSV written by the extraction cell.
df_mfcc_data = pd.read_csv(df_config['processed_file_path'][train_id])

In [12]:
# Shape after the CSV round trip, to compare with the shape before saving.
df_mfcc_data.shape

(579671, 21)

In [13]:
# `fs` is left over from the MFCC loop: it is the sample rate returned by
# sf.read for the last file processed there, not a value checked for every file.
print(fs)

52734


## Criar um dicionário de Classes para Classificação

In [14]:
# Show the file index to look up which recording IDs need a class below.
df_file_info

Unnamed: 0,ID,day,month,year,ship,obs,file_name,file_path
0,6,10.0,7.0,13.0,mardecangas_entra,,6__10_07_13_marDeCangas_Entra.wav,../data/shipsEar_AUDIOS/6__10_07_13_marDeCanga...
1,7,10.0,7.0,13.0,mardecangas_espera,,7__10_07_13_marDeCangas_Espera.wav,../data/shipsEar_AUDIOS/7__10_07_13_marDeCanga...
2,8,10.0,7.0,13.0,mardeonza_entra,,8__10_07_13_marDeOnza_Entra.wav,../data/shipsEar_AUDIOS/8__10_07_13_marDeOnza_...
3,9,10.0,7.0,13.0,mardeonza_espera,,9__10_07_13_marDeOnza_Espera.wav,../data/shipsEar_AUDIOS/9__10_07_13_marDeOnza_...
4,10,10.0,7.0,13.0,mardeonza_sale,,10__10_07_13_marDeOnza_Sale.wav,../data/shipsEar_AUDIOS/10__10_07_13_marDeOnza...
...,...,...,...,...,...,...,...,...
85,92,,,,8h_n,E,92__E__8H_N.wav,../data/shipsEar_AUDIOS/92__E__8H_N.wav
86,93,,,,draga_1,A,93__A__Draga_1.wav,../data/shipsEar_AUDIOS/93__A__Draga_1.wav
87,94,,,,draga_2,A,94__A__Draga_2.wav,../data/shipsEar_AUDIOS/94__A__Draga_2.wav
88,95,,,,draga_3,A,95__A__Draga_3.wav,../data/shipsEar_AUDIOS/95__A__Draga_3.wav


In [15]:
# Class numbers used as classification targets:
#    0 = Fishboat
#    1 = Passengers (spelled 'Passagers' in the label table)
#    2 = Tugboat
#    3 = Ocean liner
#    4 = RORO
#    5 = Motorboat
#    6 = Trawler
#    7 = Pilot ship
#    8 = Sailboat
#    9 = Mussel boat
#   10 = Dredger
#   11 = Natural ambient noise

# Recording ID (as a string) -> [class number]. ID 44 has no entry.
dict_buffer = { '6':[1],  '7':[1],  '8':[1],  '9':[1], '10':[1], '11':[1], '12':[1],
               '13':[1], '14':[1], '15':[2], '16':[3], '17':[1], '18':[4], '19':[4],
               '20':[4], '21':[5], '22':[3], '23':[3], '24':[3], '25':[3], '26':[5],
               '27':[5], '28':[6], '29':[7], '30':[7], '31':[2], '32':[1], '33':[5],
               '34':[1], '35':[1], '36':[1], '37':[8], '38':[1], '39':[5], '40':[1],
               '41':[1], '42':[1], '43':[1], '45':[5], '46':[9], '47':[9], '48':[9],
               '49':[9], '50':[5], '51':[5], '52':[5], '53':[1], '54':[1], '55':[1],
               '56':[8], '57':[8], '58':[4], '59':[1], '60':[1], '61':[1], '62':[1],
               '63':[1], '64':[1], '65':[1], '66':[9], '67':[1], '68':[8], '69':[3],
               '70':[5], '71':[3], '72':[5], '73':[0], '74':[0], '75':[0], '76':[0],
               '77':[5], '78':[4], '79':[5], '80':[10], '81':[11], '82':[11], '83':[11],
               '84':[11], '85':[11], '86':[11], '87':[11], '88':[11], '89':[11], '90':[11],
               '91':[11], '92':[11], '93':[10], '94':[10], '95':[10], '96':[10]
              }

# One row per recording ID (kept as the index) with a single 'target' column.
df_buffer = pd.DataFrame.from_dict(dict_buffer, orient='index', columns=['target'])

# The index is written too, so the IDs end up in the first, unnamed CSV column.
df_buffer.to_csv(df_config['target_id_file'][train_id])

In [16]:
# Recording IDs used as the index of the target table (the string keys of dict_buffer).
df_buffer.index

Index(['6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41',
       '42', '43', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54',
       '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
       '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78',
       '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90',
       '91', '92', '93', '94', '95', '96'],
      dtype='object')

In [17]:
# Class names listed in class-number order: the position of a name in this
# list is its class number.
class_names = ['Fishboat', 'Passagers', 'Tugboat', 'Ocean liner', 'RORO',
               'Motorboat', 'Trawler', 'Pilot ship', 'Sailboat', 'Mussel boat',
               'Dredger', 'Natural Noise']

dict_buffer = {'classes': list(range(len(class_names))), 'names': class_names}
df_buffer = pd.DataFrame(data=dict_buffer)
# Saved without the index: the file holds only the 'classes' and 'names' columns.
df_buffer.to_csv(df_config['target_label_file'][train_id],index=False)

In [18]:
# Read the class-number -> class-name table back to check what was written.
# The file was saved with index=False, so it has no "Unnamed: 0" column and
# needs no renaming.
df_id2classes = pd.read_csv(df_config['target_label_file'][train_id])
df_id2classes

Unnamed: 0,classes,names
0,0,Fishboat
1,1,Passagers
2,2,Tugboat
3,3,Ocean liner
4,4,RORO
5,5,Motorboat
6,6,Trawler
7,7,Pilot ship
8,8,Sailboat
9,9,Mussel boat


## Juntar as informações de classes ao banco de dados de MFCC

In [19]:
# Reload the MFCC frames and the recording-ID -> target table from disk.
df_mfcc_data = pd.read_csv(df_config['processed_file_path'][train_id])
# The target table was saved with the recording IDs in its unnamed index, which
# read_csv exposes as "Unnamed: 0"; rename it to the merge key.
df_id2classes = (
    pd.read_csv(df_config['target_id_file'][train_id])
    .rename(columns={"Unnamed: 0": "ID"})
)

# Attach the target to every frame through its recording ID.
# validate='many_to_one' makes pandas raise if an ID is listed more than once
# in the target table, which would otherwise silently duplicate frames.
# The inner join drops frames whose ID has no target.
df_train = pd.merge(df_mfcc_data, df_id2classes, how='inner', on='ID',
                    validate='many_to_one')
# The ID was only needed as the join key.
df_train = df_train.drop(columns=['ID'])

In [20]:
# Show the training table: the feature_* columns plus 'target'.
df_train

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,-593.984651,75.465077,33.994589,9.742745,6.304904,4.062274,-4.479960,-12.025640,-13.893243,-10.707881,...,2.772886,4.079980,1.589406,1.082082,3.737450,5.702034,4.425509,0.643910,-3.701610,1
1,-564.030604,99.217798,30.917974,5.678145,13.161791,10.325374,-5.327357,-13.766686,-10.004396,-1.441678,...,10.295145,11.006458,7.362272,3.057871,2.907891,4.579520,3.148390,-0.558448,-4.547724,1
2,-565.570117,97.768627,28.350012,-2.708773,-1.175778,-3.744399,-13.164265,-16.510932,-13.678804,-9.134033,...,-4.559905,-1.560540,3.407577,2.858186,-2.977739,-5.271391,-3.584360,-3.995161,-6.621560,1
3,-574.731085,89.236715,27.436792,1.816695,10.781130,17.176567,6.668042,-10.990568,-21.065106,-14.921798,...,12.830178,16.924017,15.915613,10.503721,4.520189,3.524796,4.321097,0.906166,-3.552916,1
4,-574.807084,96.936057,42.420398,7.884993,4.932927,9.485642,2.500431,-10.264076,-14.679812,-7.856754,...,9.196721,12.108410,12.054569,9.160051,5.082737,2.006297,-0.259070,-1.882971,-1.695872,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579666,-129.211622,103.735884,-4.451267,13.362107,0.403481,16.740188,2.191302,-10.607187,-5.056585,9.556094,...,-4.792779,-10.805321,6.906666,-2.582687,-8.632170,-2.974505,-10.848904,-9.750429,-1.552197,10
579667,-126.285006,99.152346,-9.378488,4.103901,-10.917993,9.898384,3.995455,-4.722410,-9.989188,5.963095,...,-4.501935,-14.456355,-2.633846,-12.984082,-11.841886,-5.052002,-13.695508,-5.571090,5.285179,10
579668,-135.749433,98.374239,2.772493,20.607540,-1.142374,5.856288,-2.922496,-4.174614,-5.148741,9.535130,...,-6.730832,-15.778987,-1.742991,-9.774796,-2.168072,8.606211,-9.956886,-11.800741,1.299773,10
579669,-147.588836,95.654410,-0.590526,29.047297,-1.389854,7.434825,-1.275909,-5.044168,-4.136681,-3.874674,...,-6.146726,-20.977481,-0.186768,-1.316611,-1.148685,1.789382,-13.106041,-20.388461,-12.065845,10


In [21]:
# Save the training table (features and target, without the index) to the configured path.
df_train.to_csv(df_config['train_data_path'][train_id],index=False)

In [22]:
# Preview the first rows of the saved training table.
df_train.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,-593.984651,75.465077,33.994589,9.742745,6.304904,4.062274,-4.47996,-12.02564,-13.893243,-10.707881,...,2.772886,4.07998,1.589406,1.082082,3.73745,5.702034,4.425509,0.64391,-3.70161,1
1,-564.030604,99.217798,30.917974,5.678145,13.161791,10.325374,-5.327357,-13.766686,-10.004396,-1.441678,...,10.295145,11.006458,7.362272,3.057871,2.907891,4.57952,3.14839,-0.558448,-4.547724,1
2,-565.570117,97.768627,28.350012,-2.708773,-1.175778,-3.744399,-13.164265,-16.510932,-13.678804,-9.134033,...,-4.559905,-1.56054,3.407577,2.858186,-2.977739,-5.271391,-3.58436,-3.995161,-6.62156,1
3,-574.731085,89.236715,27.436792,1.816695,10.78113,17.176567,6.668042,-10.990568,-21.065106,-14.921798,...,12.830178,16.924017,15.915613,10.503721,4.520189,3.524796,4.321097,0.906166,-3.552916,1
4,-574.807084,96.936057,42.420398,7.884993,4.932927,9.485642,2.500431,-10.264076,-14.679812,-7.856754,...,9.196721,12.10841,12.054569,9.160051,5.082737,2.006297,-0.25907,-1.882971,-1.695872,1


In [23]:
# (rows, columns) of the training table, to compare with the MFCC table shape.
df_train.shape

(579671, 21)