In [1]:
import importlib

import training
import utils
import training_constants as tc
import preprocessing as pre
# Avaliar e selecionar o melhor seletor de variáveis
from sklearn.pipeline import Pipeline
import feature_selection as fs
import evaluation



In [2]:
# Re-import the project modules so edits made on disk are picked up without
# restarting the kernel.
# `training` used to be reloaded twice (first and second-to-last); the first
# reload ran before the helper modules were refreshed, so it was redundant.
# It is now reloaded once, after the helpers.
# NOTE(review): assumes `training` and `evaluation` import from the helper
# modules and not the other way round — confirm the dependency order.
for _module in (utils, tc, pre, fs, training):
    importlib.reload(_module)

# Kept as the cell's last expression so the reloaded module is still echoed.
importlib.reload(evaluation)


<module 'evaluation' from '/Users/patricia/Documents/code/python/behavior-detection/src/evaluation.py'>

In [3]:
import os
import warnings

# Configure the environment so the Intel MKL warning is not shown and the
# MKL / OpenMP runtimes are limited to a single thread.
# These variables are set BEFORE `import numpy` below: the native libraries
# are expected to read them when they are first loaded, so values set after
# the import may be ignored.
# NOTE(review): modules imported in earlier cells may already have imported
# numpy (not visible from here) — if so, this cell should be moved to the very
# top of the notebook for the settings to take effect.
os.environ.update(
    {
        "MKL_CBWR": "AUTO",
        "MKL_VERBOSE": "0",
        "NUMPY_MKL_ERROR": "IGNORE",
        "MKL_SERVICE_FORCE_INTEL": "1",
        "MKL_NUM_THREADS": "1",
        "MKL_DYNAMIC": "FALSE",
        "OMP_NUM_THREADS": "1",
        "KMP_WARNINGS": "FALSE",
    }
)

# Deliberately imported after the environment has been configured.
import numpy as np  # noqa: E402

# Suppress deprecation warnings.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
# Load the inputs (X) and the target labels (y) from the labelled log CSV.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run on another machine until it is made configurable.
labels_csv_path = "/Users/patricia/Documents/code/python/behavior-detection/data/new_logs_labels.csv"
X, y = pre.load_data(labels_csv_path)

# Preview the first few labels.
y.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5489 entries, 0 to 5524
Columns: 336 entries, id_log to misc_EqSec_OpFrac_MMC_MtNumerador_total
dtypes: float64(82), int64(253), object(1)
memory usage: 14.1+ MB
<class 'pandas.core.series.Series'>
Index: 5489 entries, 0 to 5524
Series name: comportamento
Non-Null Count  Dtype 
--------------  ----- 
5489 non-null   object
dtypes: object(1)
memory usage: 85.8+ KB


0    ON TASK
1    ON TASK
2    ON TASK
3    ON TASK
4    ON TASK
Name: comportamento, dtype: object

In [5]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
test_size = 0.2
X_train, X_test, y_train, y_test = pre.split_train_test_data(
    X,
    y,
    test_size,
    random_state=42,  # fixed seed passed to the splitter
)


Tamanho do conjunto de treino: 4391
Tamanho do conjunto de teste: 1098


In [6]:
import pandas as pd

# Print how many instances of each class ended up in the training split and
# in the test split, each preceded by its heading.
for _heading, _labels in (
    ("Nro de instancias de cada classe em y_train:\n", y_train),
    ("\n\nNro de instancias de cada classe em y_test:\n", y_test),
):
    print(_heading)
    print(pd.Series(_labels).value_counts())

Nro de instancias de cada classe em y_train:

ON TASK                 2553
ON SYSTEM                721
OFF TASK                 495
ON TASK CONVERSATION     321
ON TASK OUT              301
Name: count, dtype: int64


Nro de instancias de cada classe em y_test:

ON TASK                 606
ON SYSTEM               186
OFF TASK                134
ON TASK CONVERSATION     93
ON TASK OUT              79
Name: count, dtype: int64


In [7]:
# Encode the training labels. `pre.encode_labels` returns two values, unpacked
# here as the encoded labels (which replace `y_train`) and the encoder object.
# NOTE(review): this cell rebinds `y_train` and `y_test` in place, so running
# it a second time feeds the already-encoded labels back in. Re-run the split
# cell first if this cell needs to be executed again.
y_train, label_encoder = pre.encode_labels(y_train)
# Encode the test labels with the encoder returned above, so both splits are
# transformed by the same encoder object.
y_test = label_encoder.transform(y_test)

In [8]:
# Preprocess the data once: the preprocessor is fitted on the training split
# only and then applied, unchanged, to the test split.
preprocessor = pre.create_preprocessor(X_train)
X_train_preprocessed, X_test_preprocessed = (
    preprocessor.fit_transform(X_train),  # evaluated first: fits, then transforms
    preprocessor.transform(X_test),  # evaluated second: reuses the fitted state
)

In [9]:
# Evaluate the candidate feature selectors on the preprocessed training data
# and keep the best one together with its name.
best_selector, best_selector_name = fs.evaluate_feature_selectors(
    X_train_preprocessed,
    y_train,
    n_features_to_select=10,
    n_components=10,
)

# Report which selector was chosen.
print(f"O melhor seletor de variáveis é: {best_selector_name}")



testando os 3 seletores criados
O melhor seletor de variáveis é: RandomForest


In [10]:
# Build the base training pipeline with the chosen feature selector as its
# only step.
base_pipeline = Pipeline(steps=[("feature_selection", best_selector)])

In [11]:
# Bayesian optimization is not working correctly at the moment; use random
# search or grid search instead.
trained_models = training.train_model(
    X_train_preprocessed,
    y_train,
    tc.RANDOM_SEARCH,
    base_pipeline,
)



Training and evaluating Logistic Regression with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits




RandomSearch Best Result for Logistic Regression: 0.692488052108526

Training and evaluating Decision Tree with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch Best Result for Decision Tree: 0.7755671894585074

Training and evaluating Random Forest with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch Best Result for Random Forest: 0.8224043285950351

Training and evaluating Gradient Boosting with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits


115 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/patricia/anaconda3/envs/projetos_ML/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/patricia/anaconda3/envs/projetos_ML/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/patricia/anaconda3/envs/projetos_ML/lib/python3.10/site-packages/sklearn/ensemble/_gb.py", line 420, in fit
    self._validate_params()
  File "/Users/patricia/anaconda3/envs/projetos_ML/lib/

RandomSearch Best Result for Gradient Boosting: 0.8289604627194223

Training and evaluating SVM with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch Best Result for SVM: 0.7608152696386042

Training and evaluating KNN with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch Best Result for KNN: 0.7901479393656958

Training and evaluating XGBoost with RandomSearch:
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomSearch Best Result for XGBoost: 0.8307879163365086


In [12]:
# Directory passed to the report helpers below.
# NOTE(review): absolute, machine-specific path — make it configurable before
# sharing the notebook.
dirpath = "/Users/patricia/Documents/code/python/behavior-detection/output/"

# Build the reports for every trained model on both the training and the test
# split.
reports = evaluation.generate_reports(
    trained_models,
    X_train_preprocessed,
    y_train,
    X_test_preprocessed,
    y_test,
)

# Print the reports, then save them as CSV using the same directory.
print(evaluation.print_reports(reports, dirpath))
evaluation.save_reports_to_csv(reports, dirpath)


Evaluating Logistic Regression with RandomSearch:
Hiperparâmetros: {'memory': None, 'steps': [('classifier', LogisticRegression(max_iter=5000))], 'verbose': False, 'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 5000, 'classifier__multi_class': 'auto', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}

Training set report:
Class 0 - Precision: 0.79, Recall: 0.55, F1-Score: 0.65, Support: 495.0
Class 1 - Precision: 0.87, Recall: 0.79, F1-Score: 0.83, Support: 721.0
Class 2 - Precision: 0.86, Recall: 0.93, F1-Score: 0.9, Support: 2553.0
Class 3 - Precision: 0.81, Recall: 0.85, F1-Score: 0.83, Support: 321.0
Class 4 - Precision