In [1]:
import os
# Restrict the instruction set Intel MKL dispatches to (SSE4.2). This cell runs
# before the numerical libraries are imported, presumably so that MKL sees the
# setting when it is first loaded (TODO confirm).
os.environ.update(MKL_ENABLE_INSTRUCTIONS='SSE4_2')

In [2]:
import training
import training_constants as tc
import preprocessing as pre
from sklearn.pipeline import Pipeline
import feature_selection as fs
import evaluation



In [3]:
# Location of the labelled log CSV. It can be overridden through the
# BEHAVIOR_DETECTION_DATA_CSV environment variable so the notebook also runs on
# machines other than the original author's; when the variable is unset the
# original absolute path is used unchanged.
data_csv_path = os.environ.get(
    "BEHAVIOR_DETECTION_DATA_CSV",
    "/Users/patricia/Documents/code/python/behavior-detection/data/new_logs_labels.csv",
)

# Load the feature table (X) and the behaviour labels (y), then preview the
# first few labels as the cell's output.
X, y = pre.load_data(data_csv_path)
y.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5489 entries, 0 to 5524
Columns: 336 entries, id_log to misc_EqSec_OpFrac_MMC_MtNumerador_total
dtypes: float64(82), int64(253), object(1)
memory usage: 14.1+ MB
<class 'pandas.core.series.Series'>
Index: 5489 entries, 0 to 5524
Series name: comportamento
Non-Null Count  Dtype 
--------------  ----- 
5489 non-null   object
dtypes: object(1)
memory usage: 85.8+ KB


0    ON TASK
1    ON TASK
2    ON TASK
3    ON TASK
4    ON TASK
Name: comportamento, dtype: object

In [4]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training.
test_size = 0.2
# random_state is pinned to 42, presumably so the split is reproducible between
# runs (confirm in preprocessing.split_train_test_data).
X_train, X_test, y_train, y_test = pre.split_train_test_data(X, y, test_size, random_state=42)


Tamanho do conjunto de treino: 4391
Tamanho do conjunto de teste: 1098


In [5]:
import pandas as pd

# Show how many samples of each class ended up in the training and the test
# split, one header line followed by the per-class counts for each split.
for header, labels in (
    ("Nro de instancias de cada classe em y_train:\n", y_train),
    ("\n\nNro de instancias de cada classe em y_test:\n", y_test),
):
    print(header)
    print(pd.Series(labels).value_counts())

Nro de instancias de cada classe em y_train:

ON TASK                 2553
ON SYSTEM                721
OFF TASK                 495
ON TASK CONVERSATION     321
ON TASK OUT              301
Name: count, dtype: int64


Nro de instancias de cada classe em y_test:

ON TASK                 606
ON SYSTEM               186
OFF TASK                134
ON TASK CONVERSATION     93
ON TASK OUT              79
Name: count, dtype: int64


In [6]:
# Encode the training labels and keep the encoder returned by the helper, then
# apply that same encoder to the test labels so both splits share one mapping.
# NOTE(review): this cell rebinds y_train and y_test, so it is not idempotent.
# Re-running it without first re-running the split cell would pass the already
# encoded labels back into the encoder. Consider separate names for the encoded
# labels (this requires updating the later cells that read y_train / y_test).
y_train, label_encoder = pre.encode_labels(y_train)
y_test = label_encoder.transform(y_test)

In [7]:
# Preprocess the data once.
# The preprocessor is built from, and fitted on, the training features only;
# the test features are transformed with the already fitted preprocessor.
preprocessor = pre.create_preprocessor(X_train)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [8]:
# Sanity check: print the first five rows of the preprocessed training data.
print(X_train_preprocessed[:5])

[[0.65819387 0.75862069 0.77777778 ... 0.         0.         0.        ]
 [0.99729108 0.68965517 0.55555556 ... 1.         0.         0.        ]
 [0.99854326 0.89655172 0.         ... 1.         0.         0.        ]
 [0.99462421 0.20689655 0.88888889 ... 1.         0.         0.        ]
 [0.99504288 0.27586207 1.         ... 1.         0.         0.        ]]


In [9]:
# Build the candidate feature selectors for the preprocessed training data.
selectors = fs.get_feature_selectors(X_train_preprocessed, y_train)

# Evaluate every candidate selector with a parameter search and keep the best
# selector together with its parameters and score.
best_selector, best_params, best_score = fs.evaluate_multiple_selectors_with_search(
    X_train_preprocessed, y_train, selectors
)

print(f"Best selector: {best_selector}")
print(f"Best params: {best_params}")
print(f"Best score: {best_score}")

# Extract the best number of features. The keys are checked in order and the
# first one present in best_params wins; selectors that expose neither
# parameter yield None.
size_param_keys = (
    'feature_selection__n_features',
    'feature_selection__n_components',
)
n_features_to_select = next(
    (best_params[key] for key in size_param_keys if key in best_params),
    None,
)

print(f"Best number of features: {n_features_to_select}")


Evaluating selector: RFE
Evaluating selector: PCA
Evaluating selector: RF
Best selector: RF with params: {}
Best selector: SelectFromModel(estimator=RandomForestClassifier(random_state=0))
Best params: {}
Best score: 0.8218249958524122
Best number of features: None


In [10]:
# Put the best feature selector into the pipeline that is handed to training.
feature_selection_step = ('feature_selection', best_selector)
base_pipeline = Pipeline(steps=[feature_selection_step])

In [11]:
# Bayesian optimization is not working correctly. Use Random Search or Grid Search.
# NOTE(review): the call below still passes tc.BAYESIAN_OPTIMIZATION, which
# contradicts the note above. The captured output of this cell stops right after
# the first model header and the following cell was never executed. Switch to
# the random-search or grid-search constant from training_constants (its name is
# not visible from this notebook) or remove the note once the optimizer is fixed.
trained_models = training.train_model(X_train_preprocessed, y_train, tc.BAYESIAN_OPTIMIZATION, base_pipeline, n_iter=100, cv=5)



Training and evaluating Logistic Regression with Bayesian Optimization:




In [None]:
# Directory that receives the evaluation reports. It can be overridden through
# the BEHAVIOR_DETECTION_OUTPUT_DIR environment variable so the notebook is not
# tied to the original author's machine; when the variable is unset the
# original absolute path is used unchanged.
dirpath = os.path.join(
    os.environ.get(
        "BEHAVIOR_DETECTION_OUTPUT_DIR",
        "/Users/patricia/Documents/code/python/behavior-detection/output/",
    ),
    "",  # keeps/enforces the trailing separator that the original path ends with
)
# Make sure the target directory exists before anything is written into it.
os.makedirs(dirpath, exist_ok=True)

# Build the reports for every trained model from the train and test splits.
reports = evaluation.generate_reports(trained_models, X_train_preprocessed, y_train, X_test_preprocessed, y_test)
# NOTE(review): if evaluation.print_reports already prints and returns None,
# this outer print() emits a stray "None" (confirm against the helper).
print(evaluation.print_reports(reports, dirpath))
evaluation.save_reports_to_csv(reports, dirpath)