## Imports

In [1]:
import os

# Directory the notebook switches into before running anything else.
# The later cells import project packages (`core.*`, `utils`); presumably they are
# resolved from this working directory — confirm against the project layout.
# The BEHAVIOR_DETECTION_SRC environment variable overrides the default, so the
# notebook is not tied to a single machine; when it is unset, the original
# hard-coded location is used.
path = os.environ.get(
    "BEHAVIOR_DETECTION_SRC",
    "/Users/patricia/Documents/code/python-code/behavior-detection/src",
)
os.chdir(path)  # Switch the working directory to the project's src folder
print(os.getcwd())  # Show the resulting working directory as a check

/Users/patricia/Documents/code/python-code/behavior-detection/src


In [2]:
from sklearn import datasets
from time import time
import pandas as pd

## Data

In [3]:
# Load the digits dataset through scikit-learn's `datasets` module.
digits = datasets.load_digits()

y = digits.target
n_samples = len(digits.images)

# Flatten every image into one row (one row per sample) and wrap the
# resulting 2-D array in a DataFrame.
X = pd.DataFrame(digits.images.reshape((n_samples, -1)))

In [4]:
# Sanity check of the loaded data: sample count, container type,
# the first five feature rows and the first five labels.
for summary_item in (n_samples, type(X), X.head(5), y[:5]):
    print(summary_item)

1797
<class 'pandas.core.frame.DataFrame'>
    0    1    2     3     4     5    6    7    8    9   ...   54   55   56  \
0  0.0  0.0  5.0  13.0   9.0   1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1  0.0  0.0  0.0  12.0  13.0   5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2  0.0  0.0  0.0   4.0  15.0  12.0  0.0  0.0  0.0  0.0  ...  5.0  0.0  0.0   
3  0.0  0.0  7.0  15.0  13.0   1.0  0.0  0.0  0.0  8.0  ...  9.0  0.0  0.0   
4  0.0  0.0  0.0   1.0  11.0   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

    57   58    59    60    61   62   63  
0  0.0  6.0  13.0  10.0   0.0  0.0  0.0  
1  0.0  0.0  11.0  16.0  10.0  0.0  0.0  
2  0.0  0.0   3.0  11.0  16.0  9.0  0.0  
3  0.0  7.0  13.0  13.0   9.0  0.0  0.0  
4  0.0  0.0   2.0  16.0   4.0  0.0  0.0  

[5 rows x 64 columns]
[0 1 2 3 4]


In [5]:
from sklearn.model_selection import train_test_split

# Split X and y into training and test sets: 20% of the samples are held out
# for testing, and the fixed seed keeps the split reproducible.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Preprocessing

In [6]:
import pandas as pd
from core.preprocessors.data_encoder import DataEncoder

# NOTE(review): this cell overwrites y_train / y_test / X_train / X_test with
# their encoded versions, so running it a second time feeds already-encoded
# data back into the encoder. Re-run the split cell first when re-executing.

# Encode the labels: training labels first, then test labels.
y_train, y_test = DataEncoder.encode_y(y_train), DataEncoder.encode_y(y_test)

# Fit the feature encoder on the training features only ...
X_encoder = DataEncoder(num_classes=0, select_numerical=True)
X_encoder.fit(X_train)

# ... then apply that same fitted encoder to the training and the test features.
X_train, X_test = X_encoder.transform(X_train), X_encoder.transform(X_test)

In [7]:
# Preview the first 15 rows of the encoded training features.
print(X_train.head(15))

      num_standard__x0  num_standard__x1  num_standard__x2  num_standard__x3  \
1734               0.0         -0.341698         -0.463360          0.508365   
855                0.0         -0.341698          0.784716         -0.658600   
1642               0.0         -0.341698         -1.087399         -0.425207   
175                0.0          0.759243          0.992729          0.975151   
925                0.0         -0.341698          0.160678          0.508365   
548                0.0         -0.341698          1.616768          0.975151   
1615               0.0         -0.341698         -1.087399         -0.658600   
334                0.0         -0.341698         -0.463360         -0.191814   
756                0.0         -0.341698         -1.087399         -0.658600   
433                0.0         -0.341698         -1.087399         -1.592172   
889                0.0          5.163004          2.240806          0.041579   
1675               0.0          0.759243

## Classifier

In [8]:
# Choose which models and feature selectors to run.
# To restrict the run, replace the values assigned below with explicit lists, e.g.
#   selected_models = ['Decision Tree', 'Naive Bayes']
#   selected_selectors = ['pca', 'none']
# Model names listed by the original author:
#   'Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting',
#   'SVM', 'KNN', 'XGBoost', 'Naive Bayes', 'MLP'
# Feature-selector names listed by the original author:
#   'rfe'  (Recursive Feature Elimination)
#   'pca'  (Principal Component Analysis)
#   'rf'   (Random Forest feature selector)
#   'mi'   (Mutual Information feature selector)
#   'none' (no feature selection)

from core.models.multiclass.digits_model_params import DigitsModelParams

# Parameter set specific to the digits problem.
model_params = DigitsModelParams()

# Use every model the parameter set reports as available (or assign a specific list).
selected_models = model_params.get_available_models()

# None means "use all selectors".
selected_selectors = None

from sklearn.model_selection import StratifiedKFold

# Number of folds in the cross-validation. This integer is what the training
# cells receive through `cv=cv`.
cv = 5

# Seeded, shuffled stratified splitter with the same number of folds.
# It used to be assigned to `cv` and then immediately overwritten by the integer
# above, so it never reached the training code. It now has its own name so the
# dead store is no longer hidden.
# NOTE(review): to actually train with shuffled stratified folds, pass
# `cv=stratified_cv` to `train_model` — confirm first that the training classes
# accept a splitter object rather than only an int.
stratified_cv = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

n_iter = 100
n_jobs = 6  # Number of processors to be used in the execution: -1 to use all processors

# Choose a scoring metric
scoring_metric = 'balanced_accuracy'  # Possible values: 'f1_macro', 'balanced_accuracy', 'roc_auc_ovr', etc.

## Usando Optuna (Otimização Bayesiana)

In [None]:
# Bayesian hyper-parameter optimisation through the project's Optuna training class.
from core.training.optuna_bayesian_optimization_training import OptunaBayesianOptimizationTraining

training = OptunaBayesianOptimizationTraining()

# Gather the training arguments in one place, then run the training.
# Per the original author's note, n_iter is mapped to n_trials inside
# OptunaBayesianOptimizationTraining.
optuna_train_args = dict(
    X_train=X_train,
    y_train=y_train,
    model_params=model_params,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs,
)
trained_models = training.train_model(**optuna_train_args)

# Example of reading the trained models: one entry per key, each exposing
# 'hyperparameters' and 'cv_result'.
for name, result in trained_models.items():
    print(f"Modelo: {name}")
    print(f"Melhores Hiperparâmetros: {result['hyperparameters']}")
    print(f"Resultado CV: {result['cv_result']}\n")


2024-11-26 16:13:30,317 | optuna_training | INFO | Training and evaluating Logistic Regression with Optuna Optimization and rfe
INFO:optuna_training:Training and evaluating Logistic Regression with Optuna Optimization and rfe


Inside OptunaBayesianOptimizationTraining.optimize_model


### Avaliação e logging

In [None]:
from utils import notebook_utils as nb_utils

# Prefix passed to both the report files and the saved model files of this run.
report_prefix = "_Optuna_"

# Model evaluation
class_metrics_results, avg_metrics_results = nb_utils.evaluate_models(
    trained_models, X_train, y_train, X_test, y_test
)

# Report generation
nb_utils.generate_reports(
    class_metrics_results,
    avg_metrics_results,
    filename_prefix=report_prefix,
)

# Save the models to files so they can be retrieved later
nb_utils.save_models(trained_models, filename_prefix=report_prefix)

In [None]:
# End-of-section marker for the Optuna training part of the notebook.
print ("Fim do treinamento com OptunaBayesianOptimizationTraining")

## Treinando com RandomizedSearchCV

In [None]:
# src/notebooks/model_training_behavior_multiclassification_by_student_level.ipynb
# NOTE(review): the path above came with the original cell and may be a leftover
# from another notebook — confirm or remove.

from core.training.random_search_training import RandomSearchTraining

# Training class that, per the original comment, is backed by RandomizedSearchCV.
training = RandomSearchTraining()

# Gather the training arguments in one place, then run the training.
random_search_args = dict(
    X_train=X_train,
    y_train=y_train,
    model_params=model_params,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs,
)
trained_models = training.train_model(**random_search_args)

# Example of reading the trained models: one entry per key, each exposing
# 'hyperparameters' and 'cv_result'.
for name, result in trained_models.items():
    print(f"Model: {name}")
    print(f"Best Hyperparameters: {result['hyperparameters']}")
    print(f"CV Result: {result['cv_result']}\n")

### Avaliação e logging

In [None]:
from utils import notebook_utils as nb_utils

# Prefix passed to both the report files and the saved model files of this run.
report_prefix = "_RandomSearch_"

# Model evaluation
class_metrics_results, avg_metrics_results = nb_utils.evaluate_models(
    trained_models, X_train, y_train, X_test, y_test
)

# Report generation
nb_utils.generate_reports(
    class_metrics_results,
    avg_metrics_results,
    filename_prefix=report_prefix,
)

# Save the models to files so they can be retrieved later
nb_utils.save_models(trained_models, filename_prefix=report_prefix)

## Using GridSearchCV

In [None]:
from core.training.grid_search_training import GridSearchTraining

# Training class that, per the original comment, is backed by GridSearchCV.
training = GridSearchTraining()

# Gather the training arguments in one place, then run the training.
# Per the original author's note, n_iter is not used by GridSearchCV and is
# only passed to keep the call consistent with the other training classes.
grid_search_args = dict(
    X_train=X_train,
    y_train=y_train,
    model_params=model_params,
    selected_models=selected_models,
    selected_selectors=selected_selectors,
    n_iter=n_iter,
    cv=cv,
    scoring=scoring_metric,
    n_jobs=n_jobs,
)
trained_models = training.train_model(**grid_search_args)

# Example of reading the trained models: one entry per key, each exposing
# 'hyperparameters' and 'cv_result'.
for name, result in trained_models.items():
    print(f"Model: {name}")
    print(f"Best Hyperparameters: {result['hyperparameters']}")
    print(f"CV Result: {result['cv_result']}\n")

### Avaliação e logging

In [None]:
from utils import notebook_utils as nb_utils

# Prefix passed to both the report files and the saved model files of this run.
report_prefix = "_GridSearch_"

# Model evaluation
class_metrics_results, avg_metrics_results = nb_utils.evaluate_models(
    trained_models, X_train, y_train, X_test, y_test
)

# Report generation
nb_utils.generate_reports(
    class_metrics_results,
    avg_metrics_results,
    filename_prefix=report_prefix,
)

# Save the models to files so they can be retrieved later
nb_utils.save_models(trained_models, filename_prefix=report_prefix)