El objetivo de este código es implementar el algoritmo GBM, separando los datos por sujeto.
Los datos se leen de una carpeta llamada *datos_baseline*, ubicada en el mismo directorio que este notebook.

In [1]:
# Mount Google Drive so the files stored under "My Drive" are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Move into the environment folder kept in Drive (presumably a virtual environment, given bin/activate below).
%cd "/content/drive/My Drive/repo_tesis/entorno_tesis_Molina"
# NOTE(review): `!` runs the command in a separate shell process, so this activation most likely
# does not carry over to the notebook kernel or to later cells — confirm whether it is still needed.
!source bin/activate

Mounted at /content/drive
/content/drive/My Drive/repo_tesis/entorno_tesis_Molina


In [None]:
# instalar comet ml en caso que sea necesario
!pip install comet_ml

In [24]:
# Importo las librerías
from lightgbm import LGBMClassifier
import numpy as np
from joblib import load
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import time
from comet_ml import Experiment
import joblib
import matplotlib.pyplot as plt

In [20]:
# me situo en el directorio
%cd "/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline"

# Levanto los datos
features = load("FEATURES_W200_I50.joblib")
label = load("label_W200_I50.joblib")

features_array = np.array(features)
label_array = np.array(label)

/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline


In [12]:
# me situo en el directorio
%cd "/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline"

# Levanto particiones
folds_ind_lista = []
sujetos_array_lista = []

for i in range(0,10):
  folds_ind_lista.append(joblib.load('folds_W200_I50_sujetoTest_' + str(i+1) + '.pkl'))
  sujetos_array_lista.append(joblib.load('arraySujetos_folds_W200_I50_sujetoTest_' + str(i+1) + '.pkl'))

# Aclaracion: sujetos_array va de 0 a 9. Por lo que el sujeto 0 en realidad es el 1 y asi sucesivamente

/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline


In [28]:
# Leave-one-subject-out baseline: for every stored partition, train a LightGBM
# classifier on the training folds, evaluate it on the held-out fold and log the
# run (metrics, confusion matrix, partition and pickled model) to Comet.
for j, (folds_ind, sujetos_array) in enumerate(zip(folds_ind_lista, sujetos_array_lista)):

  # Partition convention: the first fold (subject) is the test one; see the first
  # element of sujetos_array, remembering that it is 0-based (subject 0 is subject 1).
  folds_ind_test = folds_ind[0]

  # Every remaining fold is training data: join them into one flat index array.
  # (With more than one test fold, the same concatenation would be needed for test.)
  folds_ind_train_array = np.concatenate(folds_ind[1:]).ravel()

  # Column 1 of label_array is used as the class label.
  X_train = features_array[folds_ind_train_array, :]
  y_train = label_array[folds_ind_train_array, 1]
  X_test = features_array[folds_ind_test, :]
  y_test = label_array[folds_ind_test, 1]

  # Time only the fit; perf_counter is the monotonic clock meant for intervals.
  start_time = time.perf_counter()
  clf = LGBMClassifier()
  clf.fit(X_train, y_train, eval_metric='logloss')
  training_time = time.perf_counter() - start_time

  nombre = 'baseline_gbm_sep_sub' + str(j+1) + '_test'

  joblib.dump(clf, nombre + '.pkl')

  y_pred = clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')

  # Fix the label order explicitly so the matrix rows/columns and the tick labels
  # always agree, even if a class is missing from y_test or from y_pred.
  clases = np.unique(np.concatenate([y_test, y_pred]))
  conf_matrix = confusion_matrix(y_test, y_pred, labels=clases)

  # The matrix has to be drawn before saving; calling savefig on its own only
  # writes an empty canvas. One file per run, and the figure is closed so that
  # figures do not pile up in memory across iterations.
  fig, ax = plt.subplots(figsize=(8, 6))
  sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d",
              xticklabels=clases, yticklabels=clases, ax=ax)
  ax.set_xlabel('Predicted labels')
  ax.set_ylabel('True labels')
  ax.set_title('Confusion Matrix - ' + nombre)
  fig.savefig(nombre + '_confusion_matrix.png')
  plt.close(fig)

  # The Comet API key is no longer hard-coded: Comet reads it from the
  # COMET_API_KEY environment variable or its config file. The key that used to
  # be embedded in this notebook should be considered leaked and be revoked.
  exp = Experiment(project_name='tesis-experimentos', # Project where the experiments are registered
                   auto_param_logging=False)
  try:
    exp.set_name(nombre) # Name of this experiment
    exp.add_tags(['baseline', 'gbm', 'sep_sub', 'choose_test']) # Tags

    exp.log_metric("accuracy", accuracy)
    exp.log_metric("precision", precision)
    exp.log_metric("recall", recall)
    exp.log_metric("training_time", training_time)
    exp.log_confusion_matrix(y_test, y_pred)
    exp.log_parameter("partition_array", sujetos_array)   # Stores the partition in the experiment
    exp.log_text("El primer sujeto de partition_array es el de test, el resto son todos de train. Se usó la particion que se guarda en datos_baseline")   # Experiment comment
    exp.log_model(name=nombre, file_or_folder=nombre + '.pkl')
  finally:
    # Always close the experiment, even if one of the logging calls fails.
    exp.end()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.198505 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50353
[LightGBM] [Info] Number of data points in the train set: 58968, number of used features: 240
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.609812
[LightGBM] [Info] Start training from score -2.618380
[LightGBM] [Info] Start training from score -2.596300
[LightGBM] [Info] Start training from score -2.581398
[LightGBM] [Info] Start training from score -2.554641
[LightGBM] [Info] Start training from score -2.554205
[LightGBM] [Info] Start training from score -2.556607
[LightGBM] [Info] Start training from score -2.560989
[LightGBM] [Info] Start training from score -2.606589
[LightGBM] [Info] Start training from score -2.293100
[LightGBM] [Info] Start training from score -2.560550
[LightGBM] [Info] Start training from score -2.747006


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/ee4ea14b62c84993937bb05a7e7912da

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub1_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/ee4ea14b62c84993937bb05a7e7912da
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.27869737315581145
[1;38;5;39mCOMET INFO:[0m     precision     : 0.3012994747136013
[1;38;5;39mCOMET INFO:[0m     recall        : 0.278697373155

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50354
[LightGBM] [Info] Number of data points in the train set: 57963, number of used features: 240
[LightGBM] [Info] Start training from score -2.565105
[LightGBM] [Info] Start training from score -2.604218
[LightGBM] [Info] Start training from score -2.616658
[LightGBM] [Info] Start training from score -2.598171
[LightGBM] [Info] Start training from score -2.582528
[LightGBM] [Info] Start training from score -2.554617
[LightGBM] [Info] Start training from score -2.554839
[LightGBM] [Info] Start training from score -2.557062
[LightGBM] [Info] Start training from score -2.561075
[LightGBM] [Info] Start training from score -2.607489
[LightGBM] [Info] Start training from score -2.296788
[LightGBM] [Info] Start training from score -2.560182
[LightGBM] [Info] Start training from score -2.743933


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/a722d8da50324bed98479da616d15614

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub2_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/a722d8da50324bed98479da616d15614
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.3806186195337498
[1;38;5;39mCOMET INFO:[0m     precision     : 0.40507196324268663
[1;38;5;39mCOMET INFO:[0m     recall        : 0.380618619533

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.186324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50355
[LightGBM] [Info] Number of data points in the train set: 58973, number of used features: 240
[LightGBM] [Info] Start training from score -2.565034
[LightGBM] [Info] Start training from score -2.609436
[LightGBM] [Info] Start training from score -2.618465
[LightGBM] [Info] Start training from score -2.596612
[LightGBM] [Info] Start training from score -2.580810
[LightGBM] [Info] Start training from score -2.554726
[LightGBM] [Info] Start training from score -2.553853
[LightGBM] [Info] Start training from score -2.556910
[LightGBM] [Info] Start training from score -2.561074
[LightGBM] [Info] Start training from score -2.606444
[LightGBM] [Info] Start training from score -2.293016
[LightGBM] [Info] Start training from score -2.560854
[LightGBM] [Info] Start training from score -2.747356


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/5fefd37a01be4abe92516aeefbd061f2

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub3_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/5fefd37a01be4abe92516aeefbd061f2
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.3686295696020169
[1;38;5;39mCOMET INFO:[0m     precision     : 0.42056257932227964
[1;38;5;39mCOMET INFO:[0m     recall        : 0.368629569602

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50355
[LightGBM] [Info] Number of data points in the train set: 57576, number of used features: 240
[LightGBM] [Info] Start training from score -2.565158
[LightGBM] [Info] Start training from score -2.598452
[LightGBM] [Info] Start training from score -2.607128
[LightGBM] [Info] Start training from score -2.585923
[LightGBM] [Info] Start training from score -2.574916
[LightGBM] [Info] Start training from score -2.548362
[LightGBM] [Info] Start training from score -2.545701
[LightGBM] [Info] Start training from score -2.550363
[LightGBM] [Info] Start training from score -2.553929
[LightGBM] [Info] Start training from score -2.595189
[LightGBM] [Info] Start training from score -2.364672
[LightGBM] [Info] Start training from score -2.554153
[LightGBM] [Info] Start training from score -2.738845


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/03fb8fee5f8a40d8905ede3130ea3f1b

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub4_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/03fb8fee5f8a40d8905ede3130ea3f1b
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.3623021582733813
[1;38;5;39mCOMET INFO:[0m     precision     : 0.4094796285828598
[1;38;5;39mCOMET INFO:[0m     recall        : 0.3623021582733

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.184903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50353
[LightGBM] [Info] Number of data points in the train set: 58032, number of used features: 240
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.605641
[LightGBM] [Info] Start training from score -2.615017
[LightGBM] [Info] Start training from score -2.597739
[LightGBM] [Info] Start training from score -2.582349
[LightGBM] [Info] Start training from score -2.556251
[LightGBM] [Info] Start training from score -2.554919
[LightGBM] [Info] Start training from score -2.556251
[LightGBM] [Info] Start training from score -2.559587
[LightGBM] [Info] Start training from score -2.602612
[LightGBM] [Info] Start training from score -2.297978
[LightGBM] [Info] Start training from score -2.561595
[LightGBM] [Info] Start training from score -2.747809


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/afb886c252c7422a9276da6b02b3bf5e

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub5_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/afb886c252c7422a9276da6b02b3bf5e
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.36556821681552204
[1;38;5;39mCOMET INFO:[0m     precision     : 0.3820216012793683
[1;38;5;39mCOMET INFO:[0m     recall        : 0.365568216815

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.171181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50352
[LightGBM] [Info] Number of data points in the train set: 57651, number of used features: 240
[LightGBM] [Info] Start training from score -2.565105
[LightGBM] [Info] Start training from score -2.599987
[LightGBM] [Info] Start training from score -2.608430
[LightGBM] [Info] Start training from score -2.593934
[LightGBM] [Info] Start training from score -2.577359
[LightGBM] [Info] Start training from score -2.550775
[LightGBM] [Info] Start training from score -2.549442
[LightGBM] [Info] Start training from score -2.553000
[LightGBM] [Info] Start training from score -2.557467
[LightGBM] [Info] Start training from score -2.603028
[LightGBM] [Info] Start training from score -2.337911
[LightGBM] [Info] Start training from score -2.552555
[LightGBM] [Info] Start training from score -2.740953


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/e49c55cb8905471ba20471b46eed555b

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub6_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/e49c55cb8905471ba20471b46eed555b
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.23054545454545455
[1;38;5;39mCOMET INFO:[0m     precision     : 0.24365059194765656
[1;38;5;39mCOMET INFO:[0m     recall        : 0.23054545454

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50357
[LightGBM] [Info] Number of data points in the train set: 57734, number of used features: 240
[LightGBM] [Info] Start training from score -2.565192
[LightGBM] [Info] Start training from score -2.601193
[LightGBM] [Info] Start training from score -2.608927
[LightGBM] [Info] Start training from score -2.588894
[LightGBM] [Info] Start training from score -2.577657
[LightGBM] [Info] Start training from score -2.549549
[LightGBM] [Info] Start training from score -2.551325
[LightGBM] [Info] Start training from score -2.552436
[LightGBM] [Info] Start training from score -2.556893
[LightGBM] [Info] Start training from score -2.602828
[LightGBM] [Info] Start training from score -2.336120
[LightGBM] [Info] Start training from score -2.557340
[LightGBM] [Info] Start training from score -2.742123


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/a1492ab588bf4a58add62cbd04f7e3ca

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub7_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/a1492ab588bf4a58add62cbd04f7e3ca
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.2956419316843345
[1;38;5;39mCOMET INFO:[0m     precision     : 0.2967914975448593
[1;38;5;39mCOMET INFO:[0m     recall        : 0.2956419316843

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50348
[LightGBM] [Info] Number of data points in the train set: 57989, number of used features: 240
[LightGBM] [Info] Start training from score -2.564880
[LightGBM] [Info] Start training from score -2.605600
[LightGBM] [Info] Start training from score -2.614747
[LightGBM] [Info] Start training from score -2.595841
[LightGBM] [Info] Start training from score -2.581152
[LightGBM] [Info] Start training from score -2.555732
[LightGBM] [Info] Start training from score -2.555732
[LightGBM] [Info] Start training from score -2.558178
[LightGBM] [Info] Start training from score -2.562641
[LightGBM] [Info] Start training from score -2.604899
[LightGBM] [Info] Start training from score -2.296208
[LightGBM] [Info] Start training from score -2.561747
[LightGBM] [Info] Start training from score -2.745455


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/ceb5b49794c641bd89762e3e6178c3eb

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub8_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/ceb5b49794c641bd89762e3e6178c3eb
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.32491968793024323
[1;38;5;39mCOMET INFO:[0m     precision     : 0.3317811935824186
[1;38;5;39mCOMET INFO:[0m     recall        : 0.324919687930

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50337
[LightGBM] [Info] Number of data points in the train set: 58003, number of used features: 240
[LightGBM] [Info] Start training from score -2.565122
[LightGBM] [Info] Start training from score -2.605608
[LightGBM] [Info] Start training from score -2.616639
[LightGBM] [Info] Start training from score -2.597934
[LightGBM] [Info] Start training from score -2.582989
[LightGBM] [Info] Start training from score -2.554419
[LightGBM] [Info] Start training from score -2.554419
[LightGBM] [Info] Start training from score -2.557307
[LightGBM] [Info] Start training from score -2.560202
[LightGBM] [Info] Start training from score -2.607945
[LightGBM] [Info] Start training from score -2.295593
[LightGBM] [Info] Start training from score -2.558197
[LightGBM] [Info] Start training from score -2.747309


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/77dceb1d0ec5465cb0b90d53591d2e07

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub9_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/77dceb1d0ec5465cb0b90d53591d2e07
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.33389544688026984
[1;38;5;39mCOMET INFO:[0m     precision     : 0.38829491572171315
[1;38;5;39mCOMET INFO:[0m     recall        : 0.33389544688

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50343
[LightGBM] [Info] Number of data points in the train set: 57845, number of used features: 240
[LightGBM] [Info] Start training from score -2.565088
[LightGBM] [Info] Start training from score -2.602880
[LightGBM] [Info] Start training from score -2.613204
[LightGBM] [Info] Start training from score -2.595670
[LightGBM] [Info] Start training from score -2.577527
[LightGBM] [Info] Start training from score -2.551248
[LightGBM] [Info] Start training from score -2.551913
[LightGBM] [Info] Start training from score -2.554134
[LightGBM] [Info] Start training from score -2.554357
[LightGBM] [Info] Start training from score -2.599617
[LightGBM] [Info] Start training from score -2.323990
[LightGBM] [Info] Start training from score -2.559037
[LightGBM] [Info] Start training from score -2.745927


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/2a3d14e612414994a3234a2b769cadad

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub10_test
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/2a3d14e612414994a3234a2b769cadad
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.23304894476874718
[1;38;5;39mCOMET INFO:[0m     precision     : 0.23344892805828524
[1;38;5;39mCOMET INFO:[0m     recall        : 0.2330489447

<Figure size 640x480 with 0 Axes>

In [None]:
# Train a single LightGBM classifier with default hyper-parameters on the
# X_train / y_train currently held in the kernel, and time the fit.
start_time = time.time()

clf = LGBMClassifier()
clf.fit(X_train, y_train, eval_metric='logloss')

end_time = time.time()
training_time = end_time - start_time

# best_iteration_ is only meaningful when early stopping is used. Without it no
# best iteration is recorded (presumably reported as None or 0 depending on the
# LightGBM version — verify). Report that honestly instead of printing a
# made-up LightGBM warning about split gains.
best_iteration = clf.best_iteration_
if not best_iteration:
    print("No best iteration recorded: the model was trained without early stopping.")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.216863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50308
[LightGBM] [Info] Number of data points in the train set: 37655, number of used features: 240
[LightGBM] [Info] Start training from score -2.565481
[LightGBM] [Info] Start training from score -2.592793
[LightGBM] [Info] Start training from score -2.601708
[LightGBM] [Info] Start training from score -2.591020
[LightGBM] [Info] Start training from score -2.573805
[LightGBM] [Info] Start training from score -2.543614
[LightGBM] [Info] Start training from score -2.547339
[LightGBM] [Info] Start training from score -2.549036
[LightGBM] [Info] Start training from score -2.549036
[LightGBM] [Info] Start training from score -2.601708
[LightGBM] [Info] Start training from score -2.376846
[LightGBM] [Info] Start training from score -2.547339
[LightGBM] [Info] Start training from score -2.740986
Accurac

Voy a guardar el clasificador, y evaluar distintas métricas: accuracy, precision y recall. Voy a hacer una matriz de confusión.
Por otra parte, voy a guardar la partición de los datos para hacer reproducible el experimento.

In [None]:
# Guardar el modelo entrenado en un archivo
joblib.dump(bst, 'baseline_gbm_sep_sub_r1.pkl')

# Predecir en el conjunto de test
y_pred = bst.predict(X_test)

# Calcular métricas de desempeño
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Mostrar las métricas
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Visualizar la matriz de confusión
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

# Guardar la imagen de la matriz de confusión
plt.savefig("confusion_matrix.png")

Voy a guardar las métricas calculadas en un experimento en Comet

In [None]:
# Conectar con Comet
API_KEY = 'ehXeElNypcj7Knar5zTmyjwSO' # Se puede encontrar en Settings(Arriba a la derecha en Comet)

# Crear un experimento con mi API KEY
exp = Experiment(api_key=API_KEY,
                 project_name='tesis-experimentos', # Nombre del proyecto donde se registran los experimentos
                 auto_param_logging=False)
exp.set_name('baseline_gbm_sep_sub_r1') # Nombre de este experimento
exp.add_tags(['baseline', 'gbm', 'sep_sub']) # Tags

exp.log_metric("accuracy", accuracy)
exp.log_metric("precision", precision)
exp.log_metric("recall", recall)
exp.log_metric("training_time", training_time)
exp.log_confusion_matrix(y_test, y_pred)
exp.log_parameter("partition_array", sorteo)   # Guarda el arreglo en el experimento
exp.log_text("Primeros dos sujetos --> test, tercero y cuarto --> validación, resto --> train. \n Corresponde a la primera ronda que entreno con estos parámetros. ")   # Comentario del experimento

In [None]:
# Subir el modelo
exp.log_model(name="baseline_gbm_sep_sub_r1", file_or_folder="baseline_gbm_sep_sub_r1.pkl")
exp.end()

Junto todo en un bloque de código

In [None]:
%cd "/content/drive/My Drive/repo_tesis/archivos_generados_codigos"

for k in range(1,2):
  label = np.array(label)
  cantSujetos = np.max(label[:, 2])
  sujeto_test = k
  sorteo = np.random.permutation(cantSujetos) + 1
  sorteo_sin_test = np.delete(sorteo, np.where(sorteo == sujeto_test))
  indices_test = list(np.where(label[:, 2]==sujeto_test)[0])
  indices_val = list(np.where(label[:, 2]==sorteo_sin_test[0])[0])
  indices_val.extend(list(np.where(label[:, 2]==sorteo_sin_test[1])[0]))

  indices_train = []
  for j in sorteo_sin_test[2:]:
      indices_train.extend(np.where(label[:, 2]==j)[0])
  features = np.array(features)

  X_train = features[indices_train, :]
  y_train = label[indices_train, 1]
  X_val = features[indices_val, :]
  y_val = label[indices_val, 1]
  X_test =  features[indices_test, :]
  y_test = label[indices_test, 1]

  start_time = time.time()

  # implemento gbm
  clf = LGBMClassifier()
  clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='logloss')

  end_time = time.time()
  training_time = end_time - start_time
  # Obtener las iteraciones del mejor modelo
  bst = clf.best_iteration_

  nombre = 'baseline_gbm_sep_sub' + str(k) + '_testing'

  joblib.dump(clf, nombre + '.pkl')

  y_pred = clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')
  conf_matrix = confusion_matrix(y_test, y_pred)
  plt.savefig("confusion_matrix.png")

  API_KEY = 'ehXeElNypcj7Knar5zTmyjwSO'

  exp = Experiment(api_key=API_KEY,
                  project_name='tesis-experimentos', # Nombre del proyecto donde se registran los experimentos
                  auto_param_logging=False)
  exp.set_name(nombre) # Nombre de este experimento
  exp.add_tags(['baseline', 'gbm', 'sep_sub', 'choose_test']) # Tags

  exp.log_metric("accuracy", accuracy)
  exp.log_metric("precision", precision)
  exp.log_metric("recall", recall)
  exp.log_metric("training_time", training_time)
  exp.log_confusion_matrix(y_test, y_pred)
  exp.log_parameter("partition_array", sorteo)   # Guarda el arreglo en el experimento
  exp.log_text("Se fija el sujeto de test y se sortea el resto. Los primeros dos sujetos del sorteo son los de validación.")   # Comentario del experimento
  exp.log_model(name=nombre, file_or_folder=nombre + '.pkl')
  exp.end()

/content/drive/My Drive/repo_tesis/archivos_generados_codigos
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.541994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50345
[LightGBM] [Info] Number of data points in the train set: 45226, number of used features: 240
[LightGBM] [Info] Start training from score -2.565215
[LightGBM] [Info] Start training from score -2.600029
[LightGBM] [Info] Start training from score -2.605103
[LightGBM] [Info] Start training from score -2.578238
[LightGBM] [Info] Start training from score -2.572718
[LightGBM] [Info] Start training from score -2.544725
[LightGBM] [Info] Start training from score -2.544161
[LightGBM] [Info] Start training from score -2.547263
[LightGBM] [Info] Start training from score -2.552927
[LightGBM] [Info] Start training from score -2.595573
[LightGBM] [Info] Start training from score -2.382079
[LightGBM] [Info] Start training from score -2.553779

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/bd0cddec9e424fa4bd23561cf03d600f

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/bd0cddec9e424fa4bd23561cf03d600f
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.25368837711406983
[1;38;5;39mCOMET INFO:[0m     precision     : 0.26482242684700047
[1;38;5;39mCOMET INFO:[0m     recall        : 0.25368837711406983
[1;38;5;39mCOMET INFO:[0m     training_time : 113.34423160552979
[1;38;5;

<Figure size 640x480 with 0 Axes>