In [1]:
import preprocessing as pre

In [2]:
# No notebook
data_path = '../data/new_logs_labels.csv'
X, y = pre.load_data(data_path)
y.head()

0    ON TASK
1    ON TASK
2    ON TASK
3    ON TASK
4    ON TASK
Name: comportamento, dtype: object

In [3]:
X = X.fillna('missing')


In [4]:
test_size = 0.2  # 80% for training, 20% for testing
X_train, X_test, y_train, y_test = pre.split_train_test_data(X, y, test_size, random_state=42)


Tamanho do conjunto de treino: 4391
Tamanho do conjunto de teste: 1098


In [5]:
import pandas as pd

print("Nro de instancias de cada classe em y_train:\n")
print(pd.Series(y_train).value_counts())
print("\n\nNro de instancias de cada classe em y_test:\n")
print(pd.Series(y_test).value_counts())

Nro de instancias de cada classe em y_train:

ON TASK                 2553
ON SYSTEM                721
OFF TASK                 495
ON TASK CONVERSATION     321
ON TASK OUT              301
Name: count, dtype: int64


Nro de instancias de cada classe em y_test:

ON TASK                 606
ON SYSTEM               186
OFF TASK                134
ON TASK CONVERSATION     93
ON TASK OUT              79
Name: count, dtype: int64


In [6]:
non_numeric_cols_train = X_train.select_dtypes(exclude=['float', 'int']).columns
non_numeric_cols_test = X_test.select_dtypes(exclude=['float', 'int']).columns

print("Non-numeric columns in X_train:")
print(non_numeric_cols_train)

print("\nNon-numeric columns in X_test:")
print(non_numeric_cols_test)

Non-numeric columns in X_train:
Index(['log_type'], dtype='object')

Non-numeric columns in X_test:
Index(['log_type'], dtype='object')


In [7]:
X_train, label_encoders = pre.encode_categorical_columns(X_train)
X_test = pre.apply_encoders_to_test_data(X_test, label_encoders)

In [8]:
import pandas as pd

print("Nro de instancias de cada classe em y_train:\n")
print(pd.Series(y_train).value_counts())
print("\n\nNro de instancias de cada classe em y_test:\n")
print(pd.Series(y_test).value_counts())

Nro de instancias de cada classe em y_train:

ON TASK                 2553
ON SYSTEM                721
OFF TASK                 495
ON TASK CONVERSATION     321
ON TASK OUT              301
Name: count, dtype: int64


Nro de instancias de cada classe em y_test:

ON TASK                 606
ON SYSTEM               186
OFF TASK                134
ON TASK CONVERSATION     93
ON TASK OUT              79
Name: count, dtype: int64


In [9]:
y_train, label_encoder = pre.encode_single_column(y_train)
y_test = label_encoder.transform(y_test)

In [10]:
X_train_over, y_train_over = pre.apply_smote(X_train, y_train)

In [11]:
import pandas as pd

print("Nro de instancias de cada classe em y_train:\n")
print(pd.Series(y_train_over).value_counts())


Nro de instancias de cada classe em y_train:

1    2553
2    2553
0    2553
3    2553
4    2553
Name: count, dtype: int64


In [12]:
# Visualizar os tipos das colunas de X_train_over
print("Tipos das colunas de X_train_over:")
x_train_types = X_train_over.dtypes

# Visualizar os tipos das colunas de X_test
print("\nTipos das colunas de X_test:")
X_test_types = X_test.dtypes

Tipos das colunas de X_train_over:

Tipos das colunas de X_test:


In [13]:
# Pré-processar os dados uma vez
preprocessor = pre.create_preprocessor(X_train_over)
X_train_preprocessed = preprocessor.fit_transform(X_train_over)
X_test_preprocessed = preprocessor.transform(X_test)

In [14]:
print(X_train_preprocessed[:5])

[[0.65819387 0.75862069 0.77777778 ... 0.         0.         0.        ]
 [0.99729108 0.68965517 0.55555556 ... 0.         0.         0.        ]
 [0.99854326 0.89655172 0.         ... 0.         0.         0.        ]
 [0.99462421 0.20689655 0.88888889 ... 0.         0.         0.        ]
 [0.99504288 0.27586207 1.         ... 0.         0.         0.        ]]


In [15]:
print(X_train_preprocessed.shape)
import pandas as pd

# Supondo que 'X_train_preprocessed' seja seu numpy.ndarray
df = pd.DataFrame(X_train_preprocessed)

# Agora você pode chamar .describe() no DataFrame
print(df.describe())

(12765, 336)
                0             1             2             3             4    \
count  12765.000000  12765.000000  12765.000000  12765.000000  12765.000000   
mean       0.824813      0.541910      0.520999      0.546390      0.339597   
std        0.282219      0.291098      0.311387      0.310065      0.196976   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.660264      0.310345      0.222222      0.333333      0.174377   
50%        0.995366      0.551724      0.555556      0.555556      0.341637   
75%        0.997872      0.793103      0.777778      0.777778      0.487544   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

                5             6             7             8             9    \
count  12765.000000  12765.000000  12765.000000  12765.000000  12765.000000   
mean       0.653858      0.022170      0.001488      0.023188      0.069719   
std        0.254601      0.147242     

In [16]:
# Geração dos relatórios
feature_names = X_train.columns  # Assumindo que os nomes das características são as colunas
print("feature_names: ", feature_names)

feature_names:  Index(['id_log', 'aluno', 'grupo', 'num_dia', 'num_log', 'log_type',
       'ultimo_passo_correto', 'verificado_com_mouse',
       'verificado_com_teclado', 'idle_time_acumulado',
       ...
       'misc_OI_Mt_Plus_Sb_total', 'misc_OI_Dv_Plus_Sb_total',
       'misc_EqSec_Distrib_MtTerm_total', 'misc_OI_Mt_Minus_Mt_Plus_total',
       'misc_OI_Mt_Minus_Mt_Minus_total', 'misc_OI_Dv_Plus_Ad_total',
       'misc_EqPrim_Mt_Inc_total', 'misc_EqPrim_Dv_Inc_total',
       'misc_OI_Dv_Minus_Dv_Minus_total',
       'misc_EqSec_OpFrac_MMC_MtNumerador_total'],
      dtype='object', length=336)


In [17]:
# Verificação antes de chamar a função de treinamento
print(f"X_train_preprocessed shape: {X_train_preprocessed.shape}")
print(f"y_train shape: {y_train_over.shape}")

X_train_preprocessed shape: (12765, 336)
y_train shape: (12765,)


# Treinamento dos Modelos usando Otimização Bayesiana (BayesSearchCV)

In [18]:
from bayesian_optimization_training import BayesianOptimizationTraining
cv = 2
n_iter = 3

# Escolher a métrica de avaliação
scoring_metric = 'roc_auc_ovr'  # Pode ser 'f1_macro', 'balanced_accuracy', 'roc_auc_ovr', etc.

# Instanciar a classe BayesianOptimizationTraining
training = BayesianOptimizationTraining()


# Chamar o treinamento com otimização bayesiana
trained_models = training.train_model(
    X_train_preprocessed, y_train_over, n_iter=n_iter, cv=cv, scoring=scoring_metric
)

Training and evaluating SVM with BayesianOptimization and rfe:
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 2/2] END classifier__C=2.7364528220782454, classifier__gamma=0.00032780432870046914, classifier__kernel=rbf, feature_selection__n_features_to_select=40;, score=0.776 total time= 6.2min
[CV 1/2] END classifier__C=2.7364528220782454, classifier__gamma=0.00032780432870046914, classifier__kernel=rbf, feature_selection__n_features_to_select=40;, score=0.766 total time= 6.6min
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 2/2] END classifier__C=2.5041499136197736, classifier__gamma=0.002061045404501547, classifier__kernel=rbf, feature_selection__n_features_to_select=40;, score=0.826 total time= 6.1min
[CV 1/2] END classifier__C=2.5041499136197736, classifier__gamma=0.002061045404501547, classifier__kernel=rbf, feature_selection__n_features_to_select=40;, score=0.820 total time= 6.5min
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 2/2]

# Avaliação dos Modelos

In [19]:
import importlib
import evaluation  # Importa o módulo evaluation
importlib.reload(evaluation)  # Recarrega o módulo evaluation

<module 'evaluation' from '/Users/patricia/Documents/code/python-code/behavior-detection/src/evaluation.py'>

In [20]:
from evaluation import Evaluation 

# Geração dos relatórios
feature_names = X_train.columns  # Assumindo que os nomes das características são as colunas
evaluation_results = Evaluation.evaluate_all_models(trained_models, X_train_preprocessed, y_train_over, X_test_preprocessed, y_test, feature_names)



# Geração dos Relatórios

In [21]:
import importlib
import report_formatter  # Importa o módulo evaluation
importlib.reload(report_formatter)  # Recarrega o módulo evaluation

<module 'report_formatter' from '/Users/patricia/Documents/code/python-code/behavior-detection/src/report_formatter.py'>

In [22]:
from report_formatter import ReportFormatter
from file_manager import FileManager

directory = "../output/"

# Impressão dos relatórios

# Gerar relatório textual a partir dos resultados de avaliação
text_report = ReportFormatter.generate_text_report_from_dict(evaluation_results)

# Imprimir ou salvar o relatório
FileManager.save_text_file_with_timestamp(text_report, "bayesian_optimization_report.txt", directory)

# Opcional: Gerar DataFrame detalhado e resumido dos relatórios
detailed_df = ReportFormatter.generate_detailed_report_dataframe(evaluation_results)
summary_df = ReportFormatter.generate_summary_report_dataframe(evaluation_results)

# Salvar os DataFrames como arquivos CSV, se necessário
FileManager.save_csv_file_with_timestamp(detailed_df, "detailed_report.csv", directory)
FileManager.save_csv_file_with_timestamp(summary_df, "summary_report.csv", directory)


'../output/summary_report_20240722_1125.csv'

# Salvando os modelos em arquivos para recuperação

In [23]:
from model_manager import ModelManager

# Caminhos
model_dir = "../models/"

# Salvar todos os modelos
saved_models = ModelManager.dump_all_models(trained_models, model_dir)
print("Modelos salvos:", saved_models)

Modelo 'SVM_rfe' salvo em: ../models/model_SVM_rfe_20240722_1125.pkl
Modelo 'SVM_pca' salvo em: ../models/model_SVM_pca_20240722_1125.pkl
Modelo 'KNN_rfe' salvo em: ../models/model_KNN_rfe_20240722_1125.pkl
Modelo 'KNN_pca' salvo em: ../models/model_KNN_pca_20240722_1125.pkl
Modelo 'XGBoost_rfe' salvo em: ../models/model_XGBoost_rfe_20240722_1125.pkl
Modelo 'XGBoost_pca' salvo em: ../models/model_XGBoost_pca_20240722_1125.pkl
Modelos salvos: ['../models/model_SVM_rfe_20240722_1125.pkl', '../models/model_SVM_pca_20240722_1125.pkl', '../models/model_KNN_rfe_20240722_1125.pkl', '../models/model_KNN_pca_20240722_1125.pkl', '../models/model_XGBoost_rfe_20240722_1125.pkl', '../models/model_XGBoost_pca_20240722_1125.pkl']
