# Train Configuration Files

Cada treinamento vai ter um identificador hash que vai permear todo o processo.

In [1]:
!pip install ..

Processing /tf
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: src
  Building wheel for src (setup.py) ... [?25ldone
[?25h  Created wheel for src: filename=src-0.0.1-py3-none-any.whl size=14284 sha256=ee70aed44ed8eca0dc565a59bf2adb55aebea9edad0cdfee66e1c9760f2a4c99
  Stored in directory: /tmp/pip-ephem-wheel-cache-94caehfl/wheels/b8/4c/5d/041c4fc7e6c2e6d5dbdf9e9296283834aead4dadd6ecfdb44e
Successfully built src
Installing collected packages: src
  Attempting uninstall: src
    Found existing installation: src 0.0.1
    Uninstalling src-0.0.1:
      Successfully uninstalled src-0.0.1
Successfully installed src-0.0.1
[0m

In [2]:
import os
import pandas as pd
import pickle
import numpy as np
import src
from src.functions.AuxiliarFunctions import *

In [3]:
# Configurações gerais
hash_id = hash('treinamento_dev_1')
label = 'Toy Data Classification' # etiqueta do treinamento
wav_files_path = '../data/shipsEar_AUDIOS' # caminho para os arquivos wav
processed_file_folder =  '../data' # caminho para o arquivo único com todos os dados crus - processed_data.csv
processed_file_path =  os.path.join('../data','%s_processed_data.csv'%(hash_id)) # caminho para o arquivo único com todos os dados crus - processed_data.csv
wav_files_info = os.path.join(processed_file_folder,'wav_file_informations.csv')

# Configurações da Validação Cruzada
cv_alg = 'StratifiedKFolds'
cv_folds = [5]
cv_path = os.path.join(processed_file_folder,'indexes')

# Configurações de Pré-Processamento
preproc_alg = 'MFCC'
preproc_n_fft_points = 1024
preproc_overlap = 0
preproc_spectrum_bins_left = 400

# Configurações de Pipeline
pipeline_path = os.path.join(processed_file_folder,'pipelines')
scaler_alg = 'StandardScaler'

# Arquivos de Treinamento
train_data_path = os.path.join(processed_file_folder,'%s_train_data.csv'%(hash_id)) # caminho para o arquivo único com todos os dados para treinamento - train_data_file.csv
train_trgt_path = os.path.join(processed_file_folder,'%s_trgt_data.csv'%(hash_id)) # caminho para o arquivo único com todos os alvos para treinamento - train_trgt_file.csv
target_label_file = os.path.join(processed_file_folder,'models/train_label_file.csv')
target_id_file = os.path.join(processed_file_folder,'models/train_id_file.csv')
model_path = os.path.join(processed_file_folder,'models') # caminho para onde os modelos serão enviados
model_inits = 2 # numero de inicializações que serão feitas durante o treinamento
model_neurons = '../data/models/%s_hidden_neurons.pkl'%(hash_id) # arquivo com todas as possibilidade de quantidade de neuronios na camada escondida
model_status = '../data/models/%s_model_status.pkl'%(hash_id) # arquivo com todas as possibilidade de quantidade de neuronios na camada escondida
model_epochs = 100
model_patience = 5
model_optimizer = 'adam'
model_learning_rate = 0.001
model_batch_size = 100

In [4]:
dict_buffer = { 
    'hash_id':[hash_id],
    'label':[label],
    'wav_files_path':[wav_files_path],
    'processed_file_folder':[processed_file_folder],
    'processed_file_path':[processed_file_path],
    'wav_files_info':[wav_files_info],
    'cv_alg':[cv_alg],
    'cv_folds':cv_folds,
    'cv_path':cv_path,
    'preproc_alg':[preproc_alg],
    'preproc_n_fft_points':[preproc_n_fft_points],
    'preproc_overlap':[preproc_overlap],
    'preproc_spectrum_bins_left':[preproc_spectrum_bins_left],
    'pipeline_path':[pipeline_path],
    'scaler_alg':[scaler_alg],
    'train_data_path':[train_data_path],
    'train_trgt_path':[train_trgt_path],
    'target_label_file':[target_label_file],
    'target_id_file':[target_id_file],
    'model_path':[model_path],
    'model_inits':model_inits,
    'model_neurons':[model_neurons],
    'model_status':[model_status],
    'model_epochs':[model_epochs],
    'model_patience':[model_patience],
    'model_learning_rate':[model_learning_rate],
    'model_optimizer':[model_optimizer],
    'model_learning_rate':[model_learning_rate],
    'model_batch_size':[model_batch_size],
}
df_buffer = pd.DataFrame(data=dict_buffer)

In [5]:
df_buffer.head()

Unnamed: 0,hash_id,label,wav_files_path,processed_file_folder,processed_file_path,wav_files_info,cv_alg,cv_folds,cv_path,preproc_alg,...,target_id_file,model_path,model_inits,model_neurons,model_status,model_epochs,model_patience,model_learning_rate,model_optimizer,model_batch_size
0,-8564343657574404315,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/-8564343657574404315_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,2,../data/models/-8564343657574404315_hidden_neu...,../data/models/-8564343657574404315_model_stat...,100,5,0.001,adam,100


In [6]:
config_file_path = '../data/config.csv'
if os.path.exists(config_file_path) == False:
    # arquivo não existe
    df_buffer.to_csv(config_file_path, index=False)
else:
    # arquivo existe
    df_old = pd.read_csv(config_file_path)
    if (hash_id in list(df_old['hash_id'])):
        # o hash_id já está salvo
        print('o hash_id já está salvo')
    else:
        df_old = pd.concat([df_old,df_buffer],axis=0, ignore_index=True)
        df_old.to_csv(config_file_path, index=False)

In [7]:
train_id = 5
df_config = pd.read_csv(config_file_path)

In [8]:
df_config.head()

Unnamed: 0,hash_id,label,wav_files_path,processed_file_folder,processed_file_path,wav_files_info,cv_alg,cv_folds,cv_path,preproc_alg,...,target_id_file,model_path,model_inits,model_neurons,model_status,model_epochs,model_patience,model_learning_rate,model_optimizer,model_batch_size
0,3786470895109500580,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/3786470895109500580_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,5,../data/models/3786470895109500580_hidden_neur...,../data/models/3786470895109500580_model_statu...,1000,100,0.001,adam,
1,-5662987709573759986,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/-5662987709573759986_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,5,../data/models/-5662987709573759986_hidden_neu...,../data/models/-5662987709573759986_model_stat...,1000,100,0.001,adam,
2,3837049506038808913,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/3837049506038808913_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,1,../data/models/3837049506038808913_hidden_neur...,../data/models/3837049506038808913_model_statu...,100,10,0.001,adam,
3,4644399470053765538,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/4644399470053765538_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,5,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,1,../data/models/4644399470053765538_hidden_neur...,../data/models/4644399470053765538_model_statu...,20,10,0.001,adam,
4,5754269668442876343,Toy Data Classification,../data/shipsEar_AUDIOS,../data,../data/5754269668442876343_processed_data.csv,../data/wav_file_informations.csv,StratifiedKFolds,2,../data/indexes,MFCC,...,../data/models/train_id_file.csv,../data/models,2,../data/models/5754269668442876343_hidden_neur...,../data/models/5754269668442876343_model_statu...,10,5,0.001,adam,100.0


In [9]:
#hidden_neurons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
hidden_neurons = [1, 10, 100]
if os.path.exists(df_config['model_neurons'][train_id]) == False:
    write_list_of_hidden_neurons(df_config['model_neurons'][train_id],hidden_neurons)
else:
    print('O arquivo %s já existe'%(df_config['model_neurons'][train_id]))

In [10]:
if os.path.exists(df_config['model_status'][train_id]) == False:
    hidden_neurons = get_list_of_hidden_neurons(df_config['model_neurons'][train_id])
    model_status = np.zeros([df_config['cv_folds'][train_id],len(hidden_neurons),df_config['model_inits'][train_id]],dtype=bool)
    with open(df_config['model_status'][train_id],'wb') as file_handler:
        pickle.dump([model_status],file_handler)
else:
    print('O arquivo %s já existe'%(df_config['model_status'][train_id]))