In [1]:
import os
path = "/Users/patricia/Documents/code/python-code/behavior-detection/src"
os.chdir(path)  # Muda o diretório para o nível anterior (a raiz do projeto)
print(os.getcwd())  # Verifique se agora está na raiz

/Users/patricia/Documents/code/python-code/behavior-detection/src


# Load data

In [2]:
from behavior.behavior_data_loader import BehaviorDataLoader

data_path = '../data/new_logs_labels.csv'

data = BehaviorDataLoader.load_data(data_path, delimiter=';')
print(data.shape)
data.head(5)

(5525, 372)


Unnamed: 0,id_log,aluno,grupo,num_dia,num_log,log_type,ultimo_passo_correto,verificado_com_mouse,verificado_com_teclado,idle_time_acumulado,...,comportamento_off_task,comportamento_on_system,comportamento_indefinido,ultimo_comportamento,ultimo_comportamento_on_task,ultimo_comportamento_on_task_conversation,ultimo_comportamento_on_task_out,ultimo_comportamento_off_task,ultimo_comportamento_on_system,ultimo_comportamento_indefinido
0,8224,1,2,1,1,step_verification,0,0,1,0,...,0,0,0,?,0,0,0,0,0,1
1,527786,1,2,1,2,user_idle,1,0,0,2,...,0,0,0,ON TASK,1,0,0,0,0,0
2,527787,1,2,1,3,user_idle,0,0,0,4,...,0,0,0,ON TASK,1,0,0,0,0,0
3,527788,1,2,1,4,user_idle,0,0,0,6,...,0,0,0,ON TASK,1,0,0,0,0,0
4,527789,1,2,1,5,user_idle,0,0,0,8,...,0,0,0,ON TASK,1,0,0,0,0,0


In [3]:
from core.preprocessors.data_cleaner import DataCleaner

print("Valores da coluna 'comportamento' antes da remoção:", data['comportamento'].value_counts())

# Remove instances where 'comportamento' is '?'
data = DataCleaner.remove_instances_with_value(data, 'comportamento', '?')

print("\nValores da coluna 'comportamento' depois da remoção:", data['comportamento'].value_counts())

Valores da coluna 'comportamento' antes da remoção: comportamento
ON TASK                 3159
ON SYSTEM                907
OFF TASK                 629
ON TASK CONVERSATION     414
ON TASK OUT              380
?                         36
Name: count, dtype: int64

Valores da coluna 'comportamento' depois da remoção: comportamento
ON TASK                 3159
ON SYSTEM                907
OFF TASK                 629
ON TASK CONVERSATION     414
ON TASK OUT              380
Name: count, dtype: int64


In [4]:
data.head(5)

Unnamed: 0,id_log,aluno,grupo,num_dia,num_log,log_type,ultimo_passo_correto,verificado_com_mouse,verificado_com_teclado,idle_time_acumulado,...,comportamento_off_task,comportamento_on_system,comportamento_indefinido,ultimo_comportamento,ultimo_comportamento_on_task,ultimo_comportamento_on_task_conversation,ultimo_comportamento_on_task_out,ultimo_comportamento_off_task,ultimo_comportamento_on_system,ultimo_comportamento_indefinido
0,8224,1,2,1,1,step_verification,0,0,1,0,...,0,0,0,?,0,0,0,0,0,1
1,527786,1,2,1,2,user_idle,1,0,0,2,...,0,0,0,ON TASK,1,0,0,0,0,0
2,527787,1,2,1,3,user_idle,0,0,0,4,...,0,0,0,ON TASK,1,0,0,0,0,0
3,527788,1,2,1,4,user_idle,0,0,0,6,...,0,0,0,ON TASK,1,0,0,0,0,0
4,527789,1,2,1,5,user_idle,0,0,0,8,...,0,0,0,ON TASK,1,0,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Select a subset of the data only for testing purposes

print("Tamanho do dataframe antes:", data.shape)
data, _ = train_test_split(data, test_size=0.8, stratify=data['comportamento'], random_state=42)
data.reset_index(drop=True, inplace=True)
print("Tamanho do dataframe após:", data.shape)

Tamanho do dataframe antes: (5489, 372)
Tamanho do dataframe após: (1097, 372)


# Pre-processing

## Remove unnecessary columns

In [6]:
# Removing columns related to IDs, emotions, personality and behaviors, because 
# we want to classify behaviors only by the students' interactions with the system
columns_to_remove_ids = ['id_log', 'grupo', 'num_dia', 'num_log']
columns_to_remove_emotions = [
    'estado_afetivo', 'estado_engajamento_concentrado', 
    'estado_confusao', 'estado_frustracao', 'estado_tedio', 'estado_indefinido', 
    'ultimo_estado_afetivo', 'ultimo_engajamento_concentrado', 'ultimo_confusao', 
    'ultimo_frustracao', 'ultimo_tedio', 'ultimo_estado_indefinido'
]
columns_to_remove_personality = [
    'traco_amabilidade_fator', 'traco_extrovercao_fator', 'traco_conscienciosidade_fator', 
    'traco_abertura_fator', 'traco_neuroticismo_fator', 'traco_amabilidade_cat', 
    'traco_extrovercao_cat', 'traco_conscienciosidade_cat', 'traco_abertura_cat', 
    'traco_neuroticismo_cat']

columns_to_remove_behaviors = [
    'comportamento_on_task', 'comportamento_on_task_conversation', 'comportamento_on_task_out',
    'comportamento_off_task', 'comportamento_on_system', 'comportamento_indefinido',
    'ultimo_comportamento', 'ultimo_comportamento_on_task', 'ultimo_comportamento_on_task_conversation',
    'ultimo_comportamento_on_task_out', 'ultimo_comportamento_off_task', 'ultimo_comportamento_on_system',
    'ultimo_comportamento_indefinido'
]

columns_to_remove = columns_to_remove_ids + \
        columns_to_remove_emotions + \
        columns_to_remove_personality + \
        columns_to_remove_behaviors

cleaned_data = DataCleaner.remove_columns(data, columns_to_remove)


In [7]:
cleaned_data.head(5)

Unnamed: 0,aluno,log_type,ultimo_passo_correto,verificado_com_mouse,verificado_com_teclado,idle_time_acumulado,num_click_acumulado,num_click_passo,num_click_eq,type_step_verification,...,misc_OI_Dv_Plus_Sb_total,misc_EqSec_Distrib_MtTerm_total,misc_OI_Mt_Minus_Mt_Plus_total,misc_OI_Mt_Minus_Mt_Minus_total,misc_OI_Dv_Plus_Ad_total,misc_EqPrim_Mt_Inc_total,misc_EqPrim_Dv_Inc_total,misc_OI_Dv_Minus_Dv_Minus_total,misc_EqSec_OpFrac_MMC_MtNumerador_total,comportamento
0,18,user_idle,1,0,0,2,0,0,2,0,...,0,0,0,0,0,0,0,0,0,ON TASK OUT
1,1,user_idle,0,0,0,2,0,2,2,0,...,0,0,0,0,0,0,0,0,0,ON TASK
2,27,mouse_stop,0,0,0,0,0,20,23,0,...,0,0,0,0,0,0,0,0,0,ON TASK
3,19,user_idle,0,0,0,14,0,1,1,0,...,0,0,0,0,0,0,0,0,0,ON TASK CONVERSATION
4,4,user_idle,0,0,0,6,0,0,1,0,...,0,0,0,0,0,0,1,0,0,ON TASK OUT


In [8]:
# Preenche valores ausentes no DataFrame X com a string 'missing'.

cleaned_data = cleaned_data.fillna('missing')

## Split data by student level into training and test datasets

In [9]:
from core.preprocessors.data_splitter import DataSplitter

train_data, test_data = DataSplitter.split_by_student_level(cleaned_data, test_size=0.2, column_name='aluno')

In [10]:
# removing the 'aluno' column from the data after splitting into train and test sets

# Remover 'aluno' do conjunto de treinamento
cleaned_data = DataCleaner.remove_columns(train_data, ['aluno'])

# Remover 'aluno' do conjunto de teste
cleaned_data = DataCleaner.remove_columns(test_data, ['aluno'])

## Split data into Features (X) and Target (y)

In [11]:
from core.preprocessors.data_splitter import DataSplitter

# Conjunto de treinamento
X_train, y_train = DataSplitter.split_into_x_y(train_data, 'comportamento')

# Conjunto de teste
X_test, y_test = DataSplitter.split_into_x_y(test_data, 'comportamento')

In [12]:
print("Primeiras 5 instâncias de y_train:")
print(y_train[:5])

print("\nPrimeiras 5 instâncias de y_test:")
print(y_test[:5])

Primeiras 5 instâncias de y_train:
0             ON TASK OUT
1                 ON TASK
2                 ON TASK
3    ON TASK CONVERSATION
4             ON TASK OUT
Name: comportamento, dtype: object

Primeiras 5 instâncias de y_test:
8         ON TASK
9     ON TASK OUT
15    ON TASK OUT
16       OFF TASK
24      ON SYSTEM
Name: comportamento, dtype: object


## Encoding variables

### Encoding true labels (y)

In [13]:
import importlib
from core.preprocessors import column_selector, data_encoder
from behavior import behavior_data_encoder

# Recarregar o módulo para garantir que as alterações sejam aplicadas
importlib.reload(column_selector)
importlib.reload(data_encoder)
importlib.reload(behavior_data_encoder)

<module 'behavior.behavior_data_encoder' from '/Users/patricia/Documents/code/python-code/behavior-detection/src/behavior/behavior_data_encoder.py'>

In [14]:
# Encoding y_train and y_test
from behavior.behavior_data_encoder import BehaviorDataEncoder

# Codificar y_train
y_train = BehaviorDataEncoder.encode_y(y_train)

# Codificar y_test
y_test = BehaviorDataEncoder.encode_y(y_test)

### Encoding features (X)

In [15]:
# Pré-processar X_train
X_encoder = BehaviorDataEncoder(num_classes=5)
X_encoder.fit(X_train)

X_train = X_encoder.transform(X_train)

# Pré-processar X_test usando o mesmo preprocessor
X_test = X_encoder.transform(X_test)

In [16]:
print(X_test.shape)
print(X_test.head(10))

(195, 0)
Empty DataFrame
Columns: []
Index: [8, 9, 15, 16, 24, 25, 26, 27, 28, 39]


# Balanceamento dos dados

In [17]:
from core.preprocessors.data_balancer import DataBalancer

data_balancer = DataBalancer()
X_train, y_train = data_balancer.apply_smote(X_train, y_train)

ValueError: at least one array or dtype is required

In [None]:
from collections import Counter

print(f"Resampled dataset shape: {Counter(y_train)}")

# Treinamento dos Modelos

## Definindo parametros

In [19]:
# Definir quais modelos e seletores utilizar
selected_models = [ # None to use all models
    # 'Logistic Regression',
    'Decision Tree',
    # 'Random Forest',
    # 'Gradient Boosting',
    # 'SVM',
    # 'KNN',
    # 'XGBoost'
]
selected_selectors = ['pca']

cv = 5  # Number of folds in the cross-validation
n_iter = 100
n_jobs = 4  # Number of processors to be used in the execution: -1 to use all processors

# Choose a scoring metric
scoring_metric = 'balanced_accuracy'  # Possible values: 'f1_macro', 'balanced_accuracy', 'roc_auc_ovr', etc.

## Usando Otimização Bayesiana (BayesSearchCV)

In [28]:
from core.training.skopt_bayesian_optimization_training import SkoptBayesianOptimizationTraining

training = SkoptBayesianOptimizationTraining()

#### Executar o treinamento
trained_models = training.train_model(
    X_train=X_train,
    y_train=y_train,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs
)

#### Exemplo de acesso aos modelos treinados
for model_key, model_info in trained_models.items():
    print(f"Modelo: {model_key}")
    print(f"Melhores Hiperparâmetros: {model_info['hyperparameters']}")
    print(f"Resultado CV: {model_info['cv_result']}\n") 

## Usando Otimização Bayesiana (Optuna)

In [21]:
# Importação da nova classe OptunaBayesianOptimizationTraining
from core.training.optuna_bayesian_optimization_training import OptunaBayesianOptimizationTraining

# Instanciação da classe de treinamento com Otimização Bayesiana via Optuna
training = OptunaBayesianOptimizationTraining()

# Executar o treinamento
trained_models = training.train_model(
    X_train=X_train,
    y_train=y_train,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,  # Será mapeado para n_trials na classe OptunaBayesianOptimizationTraining
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs
)

# Exemplo de acesso aos modelos treinados
for model_key, model_info in trained_models.items():
    print(f"Modelo: {model_key}")
    print(f"Melhores Hiperparâmetros: {model_info['hyperparameters']}")
    print(f"Resultado CV: {model_info['cv_result']}\n")


## Using GridSearchCV

In [None]:
from core.training.grid_search_training import GridSearchTraining

# Instantiate the GridSearchCV training class
training = GridSearchTraining()

# Execute the training
trained_models = training.train_model(
    X_train=X_train,
    y_train=y_train,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,  # This parameter is not used in GridSearchCV but kept for consistency
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs
)

# Example of accessing the trained models
for model_key, model_info in trained_models.items():
    print(f"Model: {model_key}")
    print(f"Best Hyperparameters: {model_info['hyperparameters']}")
    print(f"CV Result: {model_info['cv_result']}\n")

### Treinando com RandomSearchCV

In [None]:
# src/notebooks/model_training_behavior_multiclassification_by_student_level.ipynb

from core.training.random_search_training import RandomSearchTraining

# Instantiate the RandomizedSearchCV training class
training = RandomSearchTraining()

# Execute the training
trained_models = training.train_model(
    X_train=X_train,
    y_train=y_train,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs
)

# Example of accessing the trained models
for model_key, model_info in trained_models.items():
    print(f"Model: {model_key}")
    print(f"Best Hyperparameters: {model_info['hyperparameters']}")
    print(f"CV Result: {model_info['cv_result']}\n")

# Avaliação dos Modelos

In [None]:
from core.evaluation.evaluation import Evaluation  

feature_names = X_train.columns  # Assumindo que os nomes das características são as colunas
class_metrics_results, avg_metrics_results = Evaluation.evaluate_all_models(trained_models, X_train, y_train, X_test, y_test, feature_names)

# Geração dos Relatórios

In [None]:
from core.logging.report_formatter import ReportFormatter
from core.logging.file_utils import FileUtils

directory = "../output/"

# Gerar relatório textual a partir dos resultados de avaliação
text_report = ReportFormatter.generate_text_report(class_metrics_results, avg_metrics_results)

# Imprimir ou salvar o relatório
FileUtils.save_file_with_timestamp(text_report, "general_report.txt", directory)

# Gerar DataFrame detalhado dos relatórios por classe
class_report_df = ReportFormatter.generate_class_report_dataframe(class_metrics_results)

# Gerar DataFrame resumido dos relatórios de métricas médias
avg_metrics_report_df = ReportFormatter.generate_avg_metrics_report_dataframe(avg_metrics_results)

# Salvar os DataFrames como arquivos CSV, se necessário
FileUtils.save_file_with_timestamp(class_report_df, "class_report.csv", directory, is_csv=True)
FileUtils.save_file_with_timestamp(avg_metrics_report_df, "avg_metrics_report.csv", directory, is_csv=True)


# Salvando os modelos em arquivos para recuperação

In [None]:
from core.logging.model_manager import ModelManager

# Caminhos
model_dir = "../models/"

# Salvar todos os modelos
saved_models = ModelManager.save_all_models(trained_models, model_dir)
print("Modelos salvos:", saved_models)