# Load Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import joblib

import Models
from AuxiliarFunctions import *

# Load Data

In [2]:
config_file_path = 'data/config.csv'
df_config = pd.read_csv(config_file_path)
train_id = 0

df_config

Unnamed: 0,hash_id,label,raw_files_path,processed_file_folder,processed_file_path,cv_alg,cv_folds,cv_path,preproc_alg,pipeline_path,scaler_alg,train_data_path,train_trgt_path,model_path
0,-8662869763806803064,Toy Data Classification with 10 StratifiedKFolds,data/raw,data,data/-8662869763806803064_processed_data.csv,StratifiedKFolds,10,data/indexes,Não implementado para a aplicação!!!,data/pipelines,StandardScaler,data/-8662869763806803064_train_data.csv,data/-8662869763806803064_trgt_data.csv,data/models


In [3]:
df_data = pd.read_csv(df_config['train_data_path'][train_id])
df_trgt = pd.read_csv(df_config['train_trgt_path'][train_id])

In [4]:
df_trgt

Unnamed: 0,target
0,1
1,1
2,0
3,0
4,1
...,...
9995,1
9996,0
9997,0
9998,0


# Processo de Treinamento de um modelo simples

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time
import copy
import importlib
importlib.reload(Models)

model_type = 'Kernel'

n_folds =  df_config['cv_folds'][train_id]
cv_path = df_config['cv_path'][train_id]
pipe_path = df_config['pipeline_path'][train_id]
model_path = df_config['model_path'][train_id]

print('Processing Kernel Training')

kernels = ['polynomial']
losses = ['logistic', 'soft_margin','quadratic_soft_margin','squared_loss','e-insensitive','huber']

if os.path.exists(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type))):
	train_record = pd.read_csv(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type)))
else:
	train_record = None

for kernel in kernels:
	print('Processing Training for %s kernel'%(kernel))
	for loss in losses:
		print('Processing Training for %s loss'%(loss))
		for ifold in range(n_folds):

			print('Training %i fold of %i folds\n\n'%(ifold+1, n_folds))

			print('Reading Cross-validation indexes')

			cv_name = '%s_%s_CV_fold_%i_of_%i_cv_indexes.pkl'%(df_config['hash_id'][train_id],
															df_config['cv_alg'][train_id],
															ifold, n_folds)
			print('Done')
			print('Reading Pipeline Object')
			with open(os.path.join(cv_path,cv_name),'rb') as file_handler:
				[trn_idx,val_idx] = pickle.load(file_handler)

			pipe_name ='%s_%s_CV_fold_%i_of_%i_pipe.pkl'%(df_config['hash_id'][train_id],
															df_config['cv_alg'][train_id],
															ifold, n_folds)
			with open(os.path.join(pipe_path,pipe_name),'rb') as file_handler:
				pipe = joblib.load(file_handler)

			print('Done')

			trn_data = pipe.transform(df_data)
			trn_trgt = df_trgt.values # tf.keras.utils.to_categorical(dev_target, num_classes=len(np.unique(dev_target)))

			print('Training for model')
			model_name = '%s_%s_%i_fold_model_%s_kernel_%s_loss.pkl'%(df_config['hash_id'][train_id],
																				model_type, ifold, kernel, 
																				loss)

			if os.path.exists(os.path.join(model_path, model_name)):
				print('Model is in %s'%(os.path.join(model_path, model_name)))
				
			else:
				print('No Model \n\n')

				model = Models.KernelClassifier(kernel=kernel,
                                                      loss=loss,
                                                      verbose=False)
				start_time = time.time()
				model.fit(trn_data, trn_trgt, trn_id=trn_idx, val_id=val_idx, random_state=0,)
				end_time = time.time() # in seconds

				predictions = model.predict(trn_data)
				df_predict = pd.DataFrame(data=np.concatenate((trn_trgt, 
															predictions),
															axis=1), 
										columns=['target', 'model_output'])
				prediction_name = copy.copy(model_name)
				prediction_name = prediction_name.replace('.pkl','_prediction_file.csv')
				df_predict.to_csv(os.path.join(model_path, prediction_name),index=False)
			
				model.save(os.path.join(model_path, model_name))
				
				acc = Models.acc_score(df_predict.loc[val_idx,'target'],
									df_predict.loc[val_idx,'model_output'])
				sens = Models.sensitivity_score(df_predict.loc[val_idx,'target'],
												df_predict.loc[val_idx,'model_output'])
				spec = Models.specificity_score(df_predict.loc[val_idx,'target'],
												df_predict.loc[val_idx,'model_output'])
				sp = Models.sp_index(df_predict.loc[val_idx,'target'],
									df_predict.loc[val_idx,'model_output'])
				auc = Models.auc_score(df_predict.loc[val_idx,'target'],
									df_predict.loc[val_idx,'model_output'])
				
				dict_train_record = {
					'hash_id':[df_config['hash_id'][train_id]],'fold':[ifold],
					'prediction_file':[prediction_name], 'kernel':[kernel],
					'loss':[loss], 'Acc':[acc],
					'Sens':[sens],'Spec':[spec],'SP':[sp], 'AUC':[auc],
					'Time':[end_time-start_time]
				}
				if train_record is None:
					train_record = pd.DataFrame(data=dict_train_record)
				else:
					train_record = pd.concat([train_record,pd.DataFrame(data=dict_train_record)],axis=0, ignore_index=True)
train_record.to_csv(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type)),index=False)


Processing Kernel Training
Processing Training for polynomial kernel
Processing Training for logistic loss
Training 1 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/-8662869763806803064_Kernel_0_fold_model_polynomial_kernel_logistic_loss.pkl
Training 2 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/-8662869763806803064_Kernel_1_fold_model_polynomial_kernel_logistic_loss.pkl
Training 3 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/-8662869763806803064_Kernel_2_fold_model_polynomial_kernel_logistic_loss.pkl
Training 4 fold of 10 folds


Reading Cross-validation indexes
Done
Reading Pipeline Object
Done
Training for model
Model is in data/models/-8662869763806803064_Kernel_3_fold_model_polynomial_kernel_logistic_loss.pkl
Training 5 fold of 10 fol

In [6]:
train_record = pd.read_csv(os.path.join(model_path,'%s_%s_train_record.csv'%(df_config['hash_id'][train_id], model_type)))
train_record

Unnamed: 0,hash_id,fold,prediction_file,kernel,loss,Acc,Sens,Spec,SP,AUC,Time
0,-8662869763806803064,0,-8662869763806803064_Kernel_0_fold_model_polyn...,polynomial,logistic,0.845,0.792415,0.897796,0.712814,0.845105,2.070694
1,-8662869763806803064,1,-8662869763806803064_Kernel_1_fold_model_polyn...,polynomial,logistic,0.842,0.794,0.89,0.707811,0.842,2.155312
2,-8662869763806803064,2,-8662869763806803064_Kernel_2_fold_model_polyn...,polynomial,logistic,0.845,0.818,0.872,0.71366,0.845,2.155366
3,-8662869763806803064,3,-8662869763806803064_Kernel_3_fold_model_polyn...,polynomial,logistic,0.857,0.81,0.904,0.733344,0.857,2.106785
4,-8662869763806803064,4,-8662869763806803064_Kernel_4_fold_model_polyn...,polynomial,logistic,0.841,0.816,0.866,0.706968,0.841,2.117187
5,-8662869763806803064,5,-8662869763806803064_Kernel_5_fold_model_polyn...,polynomial,logistic,0.832,0.792,0.872,0.691424,0.832,2.006963
6,-8662869763806803064,6,-8662869763806803064_Kernel_6_fold_model_polyn...,polynomial,logistic,0.822,0.768,0.876,0.674224,0.822,2.315056
7,-8662869763806803064,7,-8662869763806803064_Kernel_7_fold_model_polyn...,polynomial,logistic,0.826,0.776,0.876,0.681025,0.826,2.139889
8,-8662869763806803064,8,-8662869763806803064_Kernel_8_fold_model_polyn...,polynomial,logistic,0.848,0.816,0.88,0.718592,0.848,2.125958
9,-8662869763806803064,9,-8662869763806803064_Kernel_9_fold_model_polyn...,polynomial,logistic,0.862,0.814,0.91,0.741891,0.862,2.132304


In [7]:
mean_grouped = train_record[['kernel','loss',
                        'Acc','Sens','Spec','SP', 'AUC', 'Time']].groupby(['kernel', 'loss']).mean()
std_grouped = train_record[['kernel','loss',
                        'Acc','Sens','Spec','SP', 'AUC', 'Time']].groupby(['kernel', 'loss']).std()
grouped = pd.concat([mean_grouped,std_grouped],axis=0)

In [8]:
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Acc,Sens,Spec,SP,AUC,Time
kernel,loss,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
polynomial,e-insensitive,0.8391,0.80584,0.872377,0.70358,0.839109,1.355994
polynomial,huber,0.833,0.770649,0.895382,0.692042,0.833015,0.630653
polynomial,logistic,0.842,0.799642,0.88438,0.708175,0.842011,2.132551
polynomial,quadratic_soft_margin,0.8375,0.782046,0.892982,0.699977,0.837514,0.674169
polynomial,soft_margin,0.8493,0.810041,0.888581,0.72061,0.849311,0.601162
polynomial,squared_loss,0.8287,0.76145,0.895982,0.684518,0.828716,0.194703
polynomial,e-insensitive,0.008962,0.015222,0.010714,0.015145,0.008964,0.051746
polynomial,huber,0.012211,0.017513,0.015902,0.020628,0.012208,0.03241
polynomial,logistic,0.012632,0.017914,0.015118,0.02135,0.012634,0.078133
polynomial,quadratic_soft_margin,0.012756,0.019718,0.014002,0.021582,0.012757,0.049932


In [9]:
choose_hyperparameters = {'criteria':['Spec'],'kernel':['polynomial'],'loss':['soft_margin']}
df_choose_hyperparameters = pd.DataFrame(data=choose_hyperparameters)
df_choose_hyperparameters

Unnamed: 0,criteria,kernel,loss
0,Spec,polynomial,soft_margin


In [10]:
df_choose_hyperparameters.to_csv(os.path.join(model_path,'%s_%s_choose_hyperparameters.csv'%(df_config['hash_id'][train_id], model_type)),index=False)