# Load Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import joblib

import Models
from AuxiliarFunctions import *

2023-03-27 16:13:25.049562: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Load Data

In [2]:
config_file_path = 'data/config.csv'
df_config = pd.read_csv(config_file_path)
train_id = 0

df_config

Unnamed: 0,hash_id,label,raw_files_path,processed_file_folder,processed_file_path,cv_alg,cv_folds,cv_path,preproc_alg,pipeline_path,scaler_alg,train_data_path,train_trgt_path,model_path
0,6111500007297378247,Toy Data Classification with 10 StratifiedKFolds,data/raw,data,data/6111500007297378247_processed_data.csv,StratifiedKFolds,10,data/indexes,Não implementado para a aplicação!!!,data/pipelines,StandardScaler,data/6111500007297378247_train_data.csv,data/6111500007297378247_trgt_data.csv,data/models


In [3]:
df_data = pd.read_csv(df_config['train_data_path'][train_id])
df_trgt = pd.read_csv(df_config['train_trgt_path'][train_id])

In [6]:
df_trgt

Unnamed: 0,target
0,1
1,1
2,1
3,0
4,1
...,...
9995,0
9996,0
9997,1
9998,1


# Processo de Treinamento de um modelo simples

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time

import copy

model_type = 'SVM'
n_folds =  df_config['cv_folds'][train_id]

cv_path = df_config['cv_path'][train_id]
pipe_path = df_config['pipeline_path'][train_id]
model_path = df_config['model_path'][train_id]

print('Processing SVM Training')

kernels = ['linear', 'rbf']
regularizations = [0.5, 0.2, 0.9]


if os.path.exists(os.path.join(model_path, 'train_record.csv')):
    train_record = pd.read_csv(os.path.join(model_path, 'train_record.csv'))
else:
    train_record = None


for kernel in kernels:
    print('Processing Training for %s kernel'%(kernel))
    for regularization in regularizations:
        print('Processing Training for %s regularization'%(regularization))
        for ifold in range(n_folds):
            #if ifold >= 2: # para desenvolvimento
            #    break
            print('Training %i fold of %i folds\n\n'%(ifold+1, n_folds))

            print('Reading Cross-validation indexes')

            cv_name = '%s_%s_CV_fold_%i_of_%i_cv_indexes.pkl'%(df_config['hash_id'][train_id],
                                                               df_config['cv_alg'][train_id],
                                                               ifold, n_folds)
            print('Done')
            print('Reading Pipeline Object')
            with open(os.path.join(cv_path,cv_name),'rb') as file_handler:
                [trn_idx,val_idx] = pickle.load(file_handler)

            pipe_name ='%s_%s_CV_fold_%i_of_%i_pipe.pkl'%(df_config['hash_id'][train_id],
                                                               df_config['cv_alg'][train_id],
                                                               ifold, n_folds)
            with open(os.path.join(pipe_path,pipe_name),'rb') as file_handler:
                pipe = joblib.load(file_handler)

            print('Done')

            trn_data = pipe.transform(df_data)
            trn_trgt = df_trgt.values # tf.keras.utils.to_categorical(dev_target, num_classes=len(np.unique(dev_target)))

            print('Training for model')
            model_name = '%s_%s_%i_fold_model_%s_kernel_%s_regularization.pkl'%(df_config['hash_id'][train_id],
                                                                                model_type, ifold, kernel, 
                                                                                str(regularization).replace('.','-'))

            if os.path.exists(os.path.join(model_path, model_name)):
                print('Model is in %s'%(os.path.join(model_path, model_name)))
                
            else:
                print('No Model \n\n')

                model = Models.SVMClassificationModel(kernel=kernel, 
                                                      regularization=regularization, 
                                                      verbose=False)
                start_time = time.time()
                model.fit(trn_data, trn_trgt, trn_id=trn_idx, val_id=val_idx, random_state=0,)
                end_time = time.time() # in seconds

                predictions = model.predict(trn_data)
                df_predict = pd.DataFrame(data=np.concatenate((trn_trgt, 
                                                               predictions[:,np.newaxis]),
                                                              axis=1), 
                                          columns=['target', 'model_output'])
                prediction_name = copy.copy(model_name)
                prediction_name = prediction_name.replace('.pkl','_prediction_file.csv')
                df_predict.to_csv(os.path.join(model_path, prediction_name),index=False)
               
                model.save(os.path.join(model_path, model_name))
                
                acc = Models.acc_score(df_predict.loc[val_idx,'target'],
                                       df_predict.loc[val_idx,'model_output'])
                sens = Models.sensitivity_score(df_predict.loc[val_idx,'target'],
                                                df_predict.loc[val_idx,'model_output'])
                spec = Models.specificity_score(df_predict.loc[val_idx,'target'],
                                                df_predict.loc[val_idx,'model_output'])
                sp = Models.sp_index(df_predict.loc[val_idx,'target'],
                                     df_predict.loc[val_idx,'model_output'])
                auc = Models.auc_score(df_predict.loc[val_idx,'target'],
                                       df_predict.loc[val_idx,'model_output'])
                
                dict_train_record = {
                    'hash_id':[df_config['hash_id'][train_id]],'fold':[ifold],
                    'prediction_file':[prediction_name], 'kernel':[kernel],
                    'regularization':[regularization], 'Acc':[acc],
                    'Sens':[sens],'Spec':[spec],'SP':[sp], 'AUC':[auc],
                    'Time':[end_time-start_time]
                }
                if train_record is None:
                    train_record = pd.DataFrame(data=dict_train_record)
                else:
                    train_record = pd.concat([train_record,pd.DataFrame(data=dict_train_record)],axis=0, ignore_index=True)
train_record.to_csv(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type)),index=False)

Processing SVM Training
Processing Training for linear kernel
Processing Training for 0.5 regularization
Training 1 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_0_fold_model_linear_kernel_0-5_regularization.pkl
Training 2 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_1_fold_model_linear_kernel_0-5_regularization.pkl
Training 3 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_2_fold_model_linear_kernel_0-5_regularization.pkl
Training 4 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_3_fold_model_linear_kernel_0-5_regularization.pkl
Training 5 fold of 10 folds


Reading C

Done
Training for model
Model is in data/models/6111500007297378247_SVM_3_fold_model_rbf_kernel_0-2_regularization.pkl
Training 5 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_4_fold_model_rbf_kernel_0-2_regularization.pkl
Training 6 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_5_fold_model_rbf_kernel_0-2_regularization.pkl
Training 7 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_6_fold_model_rbf_kernel_0-2_regularization.pkl
Training 8 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/6111500007297378247_SVM_7_fold_model_rbf_kernel_0-2_regularization.pkl
Training 9 fold of 10 folds


Reading

In [5]:
train_record = pd.read_csv(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type)))
train_record

Unnamed: 0,hash_id,fold,prediction_file,kernel,regularization,Acc,Sens,Spec,SP,AUC,Time
0,6111500007297378247,0,6111500007297378247_SVM_0_fold_model_linear_ke...,linear,0.5,0.746,0.760479,0.731463,0.556367,0.745971,5.091639
1,6111500007297378247,1,6111500007297378247_SVM_1_fold_model_linear_ke...,linear,0.5,0.734,0.774451,0.693387,0.537815,0.733919,5.131973
2,6111500007297378247,2,6111500007297378247_SVM_2_fold_model_linear_ke...,linear,0.5,0.744,0.788423,0.699399,0.552412,0.743911,4.228797
3,6111500007297378247,3,6111500007297378247_SVM_3_fold_model_linear_ke...,linear,0.5,0.735,0.75,0.72,0.540112,0.735,9.441349
4,6111500007297378247,4,6111500007297378247_SVM_4_fold_model_linear_ke...,linear,0.5,0.746,0.782,0.71,0.555868,0.746,5.279656
5,6111500007297378247,5,6111500007297378247_SVM_5_fold_model_linear_ke...,linear,0.5,0.753,0.802,0.704,0.565807,0.753,3.552583
6,6111500007297378247,6,6111500007297378247_SVM_6_fold_model_linear_ke...,linear,0.5,0.738,0.784,0.692,0.543585,0.738,4.492858
7,6111500007297378247,7,6111500007297378247_SVM_7_fold_model_linear_ke...,linear,0.5,0.752,0.792,0.712,0.564703,0.752,4.612529
8,6111500007297378247,8,6111500007297378247_SVM_8_fold_model_linear_ke...,linear,0.5,0.727,0.764,0.69,0.527844,0.727,3.930783
9,6111500007297378247,9,6111500007297378247_SVM_9_fold_model_linear_ke...,linear,0.5,0.77,0.802,0.738,0.592388,0.77,5.481372


In [6]:
mean_grouped = train_record[['kernel','regularization',
                        'Acc','Sens','Spec','SP', 'AUC', 'Time']].groupby(['kernel', 'regularization']).mean()
std_grouped = train_record[['kernel','regularization',
                        'Acc','Sens','Spec','SP', 'AUC', 'Time']].groupby(['kernel', 'regularization']).std()
grouped = pd.concat([mean_grouped,std_grouped],axis=0)

In [7]:
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Acc,Sens,Spec,SP,AUC,Time
kernel,regularization,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
linear,0.2,0.7443,0.779735,0.708824,0.553401,0.74428,3.15973
linear,0.5,0.7445,0.779935,0.709025,0.55369,0.74448,5.124354
linear,0.9,0.7443,0.779735,0.708825,0.553387,0.74428,5.480592
rbf,0.2,0.9483,0.955226,0.941365,0.89927,0.948295,1.213752
rbf,0.5,0.9557,0.956825,0.954572,0.91339,0.955698,1.160121
rbf,0.9,0.9592,0.959225,0.959176,0.920095,0.9592,1.051146
linear,0.2,0.012588,0.017607,0.017262,0.018848,0.012596,0.529228
linear,0.5,0.012168,0.017546,0.016625,0.018214,0.012176,1.637987
linear,0.9,0.011945,0.017683,0.016245,0.017853,0.011952,0.356026
rbf,0.2,0.006634,0.007257,0.008956,0.012575,0.006636,0.174232


In [8]:
choose_hyperparameters = {'criteria':['SP'],'kernel':['rbf'],'regularization':['0.9']}
df_choose_hyperparameters = pd.DataFrame(data=choose_hyperparameters)
df_choose_hyperparameters

Unnamed: 0,criteria,kernel,regularization
0,SP,rbf,0.9


In [9]:
df_choose_hyperparameters.to_csv(os.path.join(model_path,'%s_%s_choose_hyperparameters.csv'%(df_config['hash_id'][train_id], model_type)),index=False)