El objetivo de este código es implementar el algoritmo GBM (LightGBM), separando los datos por sujeto.
Busca los datos en una carpeta llamada *datos_baseline*, ubicada en el mismo directorio.

In [None]:
# Mount Google Drive so the thesis repository is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Move into the project's environment folder.
%cd "/content/drive/My Drive/repo_tesis/entorno_tesis_Molina"
# NOTE(review): a `!` command appears to run in its own subshell, so this
# activation probably does not persist for later cells or the kernel — confirm.
!source bin/activate

Mounted at /content/drive
/content/drive/My Drive/repo_tesis/entorno_tesis_Molina


In [None]:
# Install comet_ml if it is not already available in the runtime
!pip install comet_ml

In [None]:
# Importo las librerías
from lightgbm import LGBMClassifier
import numpy as np
from joblib import load
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import time
from comet_ml import Experiment
import joblib
import matplotlib.pyplot as plt

In [None]:
# me situo en el directorio
%cd "/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline"

# Levanto los datos
features = load("FEATURES_W200_I50_5050.joblib")
label = load("label_W200_I50_5050.joblib")

features_array = np.array(features)
label_array = np.array(label)

/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline


In [None]:
# me situo en el directorio
%cd "/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline"

# Levanto particiones
folds_ind_lista = []
sujetos_array_lista = []

for i in range(0,10):
  folds_ind_lista.append(joblib.load('folds_W200_I50_sujetoTest_' + str(i+1) + '.pkl'))
  sujetos_array_lista.append(joblib.load('arraySujetos_folds_W200_I50_sujetoTest_' + str(i+1) + '.pkl'))

# Aclaracion: sujetos_array va de 0 a 9. Por lo que el sujeto 0 en realidad es el 1 y asi sucesivamente

/content/drive/My Drive/repo_tesis/src/baseline/datos_baseline


In [None]:
# Helper that turns the multi-class labels into a binary problem:
# class 0 stays 0 and every other class becomes 1.
def go_to_binary_problem(label):
  """Return a copy of ``label`` whose class column is collapsed to 0/1.

  Every row whose value in column 1 is non-zero is relabelled as class 1;
  rows that are already 0 are left as they are. The input array itself is
  not modified. The fraction of rows relabelled as class 1 is printed.

  Args:
    label: 2-D NumPy array; column 1 holds the class of each row.

  Returns:
    A new array with the same shape as ``label``.
  """
  label_copy = label.copy()

  ind_gestos = np.flatnonzero(label_copy[:, 1] != 0)
  label_copy[ind_gestos, 1] = 1

  # Guard the ratio: an empty label array used to raise ZeroDivisionError here.
  total = len(label_copy)
  proporcion = len(ind_gestos) / total if total else 0.0
  print('Proporcion clase 1: ' + str(proporcion))
  return label_copy

# Replace the labels with their binary (0 vs. non-zero) version
label_array = go_to_binary_problem(label_array)

Proporcion clase 1: 0.5


In [None]:
# Train and evaluate one LightGBM classifier per stored partition. In every
# partition the first fold (subject) is the test set and the rest are training.

# NOTE(review): this Comet API key is committed in plain text; consider moving
# it to configuration outside the notebook and rotating it.
API_KEY = 'ehXeElNypcj7Knar5zTmyjwSO'

for j in range(0,10):

  # Partition used in this iteration
  folds_ind = folds_ind_lista[j]
  sujetos_array = sujetos_array_lista[j]

  # By the chosen convention the first fold (subject) is the test one; see the
  # first element of sujetos_array (subjects there are numbered from 0).
  folds_ind_test = folds_ind[0]

  # Merge the remaining folds (subjects) into one flat array of training indices
  folds_ind_train_array = np.concatenate(folds_ind[1:]).ravel()

  # With more than one test fold the same merge would be needed for test;
  # there is only one, so it is used as is.

  # Build X_train, y_train, X_test, y_test (column 1 of the labels is the class)
  X_train = features_array[folds_ind_train_array, :]
  y_train = label_array[folds_ind_train_array, 1]
  X_test = features_array[folds_ind_test, :]
  y_test = label_array[folds_ind_test, 1]

  # Time model creation plus training
  start_time = time.time()
  clf = LGBMClassifier()
  clf.fit(X_train, y_train, eval_metric='logloss')
  training_time = time.time() - start_time

  # NOTE(review): no early stopping is configured, so this value is presumably
  # not informative; the name is kept because other cells may read `bst`.
  bst = clf.best_iteration_

  nombre = 'baseline_gbm_sep_sub' + str(j+1) + '_test_50_50'

  # Persist the trained classifier so it can be uploaded below
  joblib.dump(clf, nombre + '.pkl')

  y_pred = clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')

  # Fix the class order explicitly so the tick labels match the matrix rows/columns
  clases = np.unique(np.concatenate([y_test, y_pred]))
  conf_matrix = confusion_matrix(y_test, y_pred, labels=clases)

  # Draw the confusion matrix before saving it: the previous code called
  # savefig on an empty figure and overwrote the same file on every fold.
  fig, ax = plt.subplots(figsize=(8, 6))
  sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d",
              xticklabels=clases, yticklabels=clases, ax=ax)
  ax.set_xlabel('Predicted labels')
  ax.set_ylabel('True labels')
  ax.set_title('Confusion Matrix')
  fig.savefig(nombre + '_confusion_matrix.png')
  plt.close(fig)   # avoid accumulating open figures across folds

  exp = Experiment(api_key=API_KEY,
                  project_name='tesis-experimentos', # Project where the experiments are recorded
                  auto_param_logging=False)
  exp.set_name(nombre) # Name of this experiment
  exp.add_tags(['baseline', 'gbm', 'sep_sub', 'choose_test', '50_50']) # Tags

  exp.log_metric("accuracy", accuracy)
  exp.log_metric("precision", precision)
  exp.log_metric("recall", recall)
  exp.log_metric("training_time", training_time)
  exp.log_confusion_matrix(y_test, y_pred)
  exp.log_parameter("partition_array", sujetos_array)   # Store the partition array in the experiment
  exp.log_text("El primer sujeto de partition_array es el de test, el resto son todos de train. Se usó la particion que se guarda en datos_baseline")   # Experiment note
  exp.log_model(name=nombre, file_or_folder=nombre + '.pkl')
  exp.end()

[LightGBM] [Info] Number of positive: 29475, number of negative: 29493
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.210547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50349
[LightGBM] [Info] Number of data points in the train set: 58968, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499847 -> initscore=-0.000611
[LightGBM] [Info] Start training from score -0.000611


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/b6377b95e6914f869f1b7dc869dfd6ad

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub1_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/b6377b95e6914f869f1b7dc869dfd6ad
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8924073407700611
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8945788245495759
[1;38;5;39mCOMET INFO:[0m     recall        : 0.8924073

[LightGBM] [Info] Number of positive: 29000, number of negative: 28963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.298524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50348
[LightGBM] [Info] Number of data points in the train set: 57963, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500319 -> initscore=0.001277
[LightGBM] [Info] Start training from score 0.001277


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/e205ad18272f45f6876bfb5e0fa89a1a

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub2_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/e205ad18272f45f6876bfb5e0fa89a1a
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.9032454670120372
[1;38;5;39mCOMET INFO:[0m     precision     : 0.9077125484253887
[1;38;5;39mCOMET INFO:[0m     recall        : 0.9032454

[LightGBM] [Info] Number of positive: 29531, number of negative: 29442
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.327145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50352
[LightGBM] [Info] Number of data points in the train set: 58973, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500755 -> initscore=0.003018
[LightGBM] [Info] Start training from score 0.003018


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/f3facf21fb534c09a5872910574f5f06

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub3_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/f3facf21fb534c09a5872910574f5f06
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.9101386637853413
[1;38;5;39mCOMET INFO:[0m     precision     : 0.9181751673178187
[1;38;5;39mCOMET INFO:[0m     recall        : 0.9101386

[LightGBM] [Info] Number of positive: 28808, number of negative: 28768
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.180330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50352
[LightGBM] [Info] Number of data points in the train set: 57576, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500347 -> initscore=0.001389
[LightGBM] [Info] Start training from score 0.001389


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/995e357608914196a088200af22c2551

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub4_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/995e357608914196a088200af22c2551
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8309352517985612
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8309877370827223
[1;38;5;39mCOMET INFO:[0m     recall        : 0.8309352

[LightGBM] [Info] Number of positive: 29035, number of negative: 28997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.184417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50352
[LightGBM] [Info] Number of data points in the train set: 58032, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500327 -> initscore=0.001310
[LightGBM] [Info] Start training from score 0.001310


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/4b4e6e862d134f4b9781a6085dbdca5a

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub5_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/4b4e6e862d134f4b9781a6085dbdca5a
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8475515860794579
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8565116160451874
[1;38;5;39mCOMET INFO:[0m     recall        : 0.8475515

[LightGBM] [Info] Number of positive: 28834, number of negative: 28817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.172917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50355
[LightGBM] [Info] Number of data points in the train set: 57651, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500147 -> initscore=0.000590
[LightGBM] [Info] Start training from score 0.000590


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/3f680a6a64504061ac70e1e151edb2a4

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub6_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/3f680a6a64504061ac70e1e151edb2a4
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8408727272727273
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8439809908788334
[1;38;5;39mCOMET INFO:[0m     recall        : 0.8408727

[LightGBM] [Info] Number of positive: 28867, number of negative: 28867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.173665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50351
[LightGBM] [Info] Number of data points in the train set: 57734, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/855177581d1c49c9a168c38b711ebaa8

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub7_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/855177581d1c49c9a168c38b711ebaa8
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8127208480565371
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8273344719211719
[1;38;5;39mCOMET INFO:[0m     recall        : 0.8127208

[LightGBM] [Info] Number of positive: 29064, number of negative: 28925
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.318039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50351
[LightGBM] [Info] Number of data points in the train set: 57989, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501199 -> initscore=0.004794
[LightGBM] [Info] Start training from score 0.004794


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/080c8a5090404107a22505b53a43eb8f

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub8_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/080c8a5090404107a22505b53a43eb8f
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.830656264341441
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8314540276761337
[1;38;5;39mCOMET INFO:[0m     recall        : 0.83065626

[LightGBM] [Info] Number of positive: 29003, number of negative: 29000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.172002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50342
[LightGBM] [Info] Number of data points in the train set: 58003, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500026 -> initscore=0.000103
[LightGBM] [Info] Start training from score 0.000103


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/33218e2a3f9b40009be95811d7143459

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub9_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/33218e2a3f9b40009be95811d7143459
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.7901272420665338
[1;38;5;39mCOMET INFO:[0m     precision     : 0.7939417011359163
[1;38;5;39mCOMET INFO:[0m     recall        : 0.7901272

[LightGBM] [Info] Number of positive: 28957, number of negative: 28888
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50351
[LightGBM] [Info] Number of data points in the train set: 57845, number of used features: 240
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500596 -> initscore=0.002386
[LightGBM] [Info] Start training from score 0.002386


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/f0a4277feb2c4e87a212cd5414e47024

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : baseline_gbm_sep_sub10_test_50_50
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/f0a4277feb2c4e87a212cd5414e47024
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.8480766352342464
[1;38;5;39mCOMET INFO:[0m     precision     : 0.8520604196479148
[1;38;5;39mCOMET INFO:[0m     recall        : 0.848076

<Figure size 640x480 with 0 Axes>

In [None]:
start_time = time.time()    # start timing the training

# Create the LightGBM classifier with its default settings
clf = LGBMClassifier()

# Train it on the current training split
clf.fit(X_train, y_train, eval_metric='logloss')

# Stop the timer and compute the training time in seconds
end_time = time.time()
training_time = end_time - start_time

# Validation-set evaluation (currently disabled)
# y_pred = clf.predict(X_val)
# accuracy = accuracy_score(y_val, y_pred)
# print(f'Accuracy: {accuracy}')

# Best iteration reported by the fitted model
best_iteration = clf.best_iteration_

# The previous code printed a made-up LightGBM "[Warning] No further splits..."
# line here, which is unrelated to this condition; report what was checked.
if best_iteration is None:
    print("clf.best_iteration_ is None (no early stopping was configured for this fit)")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.216863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50308
[LightGBM] [Info] Number of data points in the train set: 37655, number of used features: 240
[LightGBM] [Info] Start training from score -2.565481
[LightGBM] [Info] Start training from score -2.592793
[LightGBM] [Info] Start training from score -2.601708
[LightGBM] [Info] Start training from score -2.591020
[LightGBM] [Info] Start training from score -2.573805
[LightGBM] [Info] Start training from score -2.543614
[LightGBM] [Info] Start training from score -2.547339
[LightGBM] [Info] Start training from score -2.549036
[LightGBM] [Info] Start training from score -2.549036
[LightGBM] [Info] Start training from score -2.601708
[LightGBM] [Info] Start training from score -2.376846
[LightGBM] [Info] Start training from score -2.547339
[LightGBM] [Info] Start training from score -2.740986
Accurac

Voy a guardar el clasificador, y evaluar distintas métricas: accuracy, precision y recall. Voy a hacer una matriz de confusión.
Por otra parte, voy a guardar la partición de los datos para hacer reproducible el experimento.

In [None]:
# Save the trained classifier to a file.
# `bst` is only ever assigned clf.best_iteration_ in this notebook, so it is
# not a model: dump and predict with the fitted classifier `clf` instead.
joblib.dump(clf, 'baseline_gbm_sep_sub_r1.pkl')

# Predict on the test set
y_pred = clf.predict(X_test)

# Compute the performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Fix the class order explicitly so the tick labels match the matrix rows/columns
clases = np.unique(np.concatenate([y_test, y_pred]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=clases)

# Show the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", xticklabels=clases, yticklabels=clases)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')

# Save the image BEFORE showing it: after plt.show() the current figure is
# released, so a later savefig writes an empty image.
plt.savefig("confusion_matrix.png")
plt.show()

Voy a guardar las métricas calculadas en un experimento en Comet

In [None]:
# Comet credentials (the key is shown under Settings, top right in Comet).
# NOTE(review): this API key is committed in plain text; consider moving it
# to configuration outside the notebook and rotating it.
API_KEY = 'ehXeElNypcj7Knar5zTmyjwSO'

# Open an experiment in the project where the runs are recorded
exp = Experiment(
    api_key=API_KEY,
    project_name='tesis-experimentos',
    auto_param_logging=False,
)
exp.set_name('baseline_gbm_sep_sub_r1')  # name of this experiment
exp.add_tags(['baseline', 'gbm', 'sep_sub'])  # tags

# Log the scalar metrics computed in the previous cells
for metric_name, metric_value in (
    ("accuracy", accuracy),
    ("precision", precision),
    ("recall", recall),
    ("training_time", training_time),
):
    exp.log_metric(metric_name, metric_value)

exp.log_confusion_matrix(y_test, y_pred)
# Store the partition array in the experiment
exp.log_parameter("partition_array", sorteo)
# Free-text note attached to the experiment
exp.log_text("Primeros dos sujetos --> test, tercero y cuarto --> validación, resto --> train. \n Corresponde a la primera ronda que entreno con estos parámetros. ")

In [None]:
# Upload the saved model file to the experiment, then close the experiment
model_name = "baseline_gbm_sep_sub_r1"
exp.log_model(name=model_name, file_or_folder=f"{model_name}.pkl")
exp.end()

Junto todo en un bloque de código

In [None]:
%cd "/content/drive/My Drive/repo_tesis/archivos_generados_codigos"

for k in range(1,2):
  label = np.array(label)
  cantSujetos = np.max(label[:, 2])
  sujeto_test = k
  sorteo = np.random.permutation(cantSujetos) + 1
  sorteo_sin_test = np.delete(sorteo, np.where(sorteo == sujeto_test))
  indices_test = list(np.where(label[:, 2]==sujeto_test)[0])
  indices_val = list(np.where(label[:, 2]==sorteo_sin_test[0])[0])
  indices_val.extend(list(np.where(label[:, 2]==sorteo_sin_test[1])[0]))

  indices_train = []
  for j in sorteo_sin_test[2:]:
      indices_train.extend(np.where(label[:, 2]==j)[0])
  features = np.array(features)

  X_train = features[indices_train, :]
  y_train = label[indices_train, 1]
  X_val = features[indices_val, :]
  y_val = label[indices_val, 1]
  X_test =  features[indices_test, :]
  y_test = label[indices_test, 1]

  start_time = time.time()

  # implemento gbm
  clf = LGBMClassifier()
  clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='logloss')

  end_time = time.time()
  training_time = end_time - start_time
  # Obtener las iteraciones del mejor modelo
  bst = clf.best_iteration_

  nombre = 'baseline_gbm_sep_sub' + str(k) + '_testing'

  joblib.dump(clf, nombre + '.pkl')

  y_pred = clf.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')
  conf_matrix = confusion_matrix(y_test, y_pred)
  plt.savefig("confusion_matrix.png")

  API_KEY = 'ehXeElNypcj7Knar5zTmyjwSO'

  exp = Experiment(api_key=API_KEY,
                  project_name='tesis-experimentos', # Nombre del proyecto donde se registran los experimentos
                  auto_param_logging=False)
  exp.set_name(nombre) # Nombre de este experimento
  exp.add_tags(['baseline', 'gbm', 'sep_sub', 'choose_test']) # Tags

  exp.log_metric("accuracy", accuracy)
  exp.log_metric("precision", precision)
  exp.log_metric("recall", recall)
  exp.log_metric("training_time", training_time)
  exp.log_confusion_matrix(y_test, y_pred)
  exp.log_parameter("partition_array", sorteo)   # Guarda el arreglo en el experimento
  exp.log_text("Se fija el sujeto de test y se sortea el resto. Los primeros dos sujetos del sorteo son los de validación.")   # Comentario del experimento
  exp.log_model(name=nombre, file_or_folder=nombre + '.pkl')
  exp.end()

/content/drive/My Drive/repo_tesis/archivos_generados_codigos
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.541994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50345
[LightGBM] [Info] Number of data points in the train set: 45226, number of used features: 240
[LightGBM] [Info] Start training from score -2.565215
[LightGBM] [Info] Start training from score -2.600029
[LightGBM] [Info] Start training from score -2.605103
[LightGBM] [Info] Start training from score -2.578238
[LightGBM] [Info] Start training from score -2.572718
[LightGBM] [Info] Start training from score -2.544725
[LightGBM] [Info] Start training from score -2.544161
[LightGBM] [Info] Start training from score -2.547263
[LightGBM] [Info] Start training from score -2.552927
[LightGBM] [Info] Start training from score -2.595573
[LightGBM] [Info] Start training from score -2.382079
[LightGBM] [Info] Start training from score -2.553779

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/manuelmolinach99/tesis-experimentos/bd0cddec9e424fa4bd23561cf03d600f

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/manuelmolinach99/tesis-experimentos/bd0cddec9e424fa4bd23561cf03d600f
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy      : 0.25368837711406983
[1;38;5;39mCOMET INFO:[0m     precision     : 0.26482242684700047
[1;38;5;39mCOMET INFO:[0m     recall        : 0.25368837711406983
[1;38;5;39mCOMET INFO:[0m     training_time : 113.34423160552979
[1;38;5;

<Figure size 640x480 with 0 Axes>