In [1]:
import pandas as pd

import evaluation
import preprocessing as pre
import training

In [2]:
# Load the labelled log data from CSV into features (X) and target labels (y).
# NOTE(review): the DataFrame/Series info summaries shown in this cell's
# output appear to be printed by pre.load_data itself — confirm.
data_path = '../data/new_logs_labels.csv'
X, y = pre.load_data(data_path)
# Preview the first few labels.
y.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5489 entries, 0 to 5524
Columns: 336 entries, id_log to misc_EqSec_OpFrac_MMC_MtNumerador_total
dtypes: float64(82), int64(253), object(1)
memory usage: 14.1+ MB
<class 'pandas.core.series.Series'>
Index: 5489 entries, 0 to 5524
Series name: comportamento
Non-Null Count  Dtype 
--------------  ----- 
5489 non-null   object
dtypes: object(1)
memory usage: 85.8+ KB


0    ON TASK
1    ON TASK
2    ON TASK
3    ON TASK
4    ON TASK
Name: comportamento, dtype: object

In [3]:
# Hold out part of the data for final evaluation.
test_size = 0.2  # 80% for training, 20% for testing
# random_state=42 is passed to the project helper, presumably to make the
# split reproducible.
# NOTE(review): the per-class counts printed in the next cell are not exactly
# proportional between train and test, which suggests the split is not
# stratified — confirm whether split_train_test_data should stratify on y.
X_train, X_test, y_train, y_test = pre.split_train_test_data(X, y, test_size, random_state=42)


Tamanho do conjunto de treino: 4391
Tamanho do conjunto de teste: 1098


In [4]:
# Show how many samples of each class ended up in the train and test splits,
# so the class balance of both sets can be inspected.
# pandas is imported once in the notebook's top import cell rather than here.
# (The printed messages are Portuguese for "number of instances of each class".)
print("Nro de instancias de cada classe em y_train:\n")
print(pd.Series(y_train).value_counts())
print("\n\nNro de instancias de cada classe em y_test:\n")
print(pd.Series(y_test).value_counts())

Nro de instancias de cada classe em y_train:

ON TASK                 2553
ON SYSTEM                721
OFF TASK                 495
ON TASK CONVERSATION     321
ON TASK OUT              301
Name: count, dtype: int64


Nro de instancias de cada classe em y_test:

ON TASK                 606
ON SYSTEM               186
OFF TASK                134
ON TASK CONVERSATION     93
ON TASK OUT              79
Name: count, dtype: int64


In [5]:
# Encode the string class labels as integers. The encoder returned for the
# training labels is reused on the test labels so both share one mapping.
# NOTE(review): y_train and y_test are overwritten here, so re-running this
# cell would feed already-encoded labels back into the encoder — re-run the
# split cell first (or restart the kernel) before executing it again.
y_train, label_encoder = pre.encode_labels(y_train)
y_test = label_encoder.transform(y_test)

In [6]:
# Preprocess the data once, up front.
# The preprocessor is fitted on the training set only (fit_transform) and is
# then applied to the test set (transform).
preprocessor = pre.create_preprocessor(X_train)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [7]:
# Peek at the first five preprocessed training rows.
print(X_train_preprocessed[:5])

[[0.65819387 0.75862069 0.77777778 ... 0.         0.         0.        ]
 [0.99729108 0.68965517 0.55555556 ... 1.         0.         0.        ]
 [0.99854326 0.89655172 0.         ... 1.         0.         0.        ]
 [0.99462421 0.20689655 0.88888889 ... 1.         0.         0.        ]
 [0.99504288 0.27586207 1.         ... 1.         0.         0.        ]]


In [8]:
# Shape of the preprocessed training matrix: (n_samples, n_features).
print(X_train_preprocessed.shape)

# X_train_preprocessed is assumed to be a numpy.ndarray; wrap it in a
# DataFrame so that per-column summary statistics are available.
# pandas is already imported earlier in the notebook, so it is not
# re-imported in this cell.
df = pd.DataFrame(X_train_preprocessed)

# Summary statistics (count, mean, std, min, quartiles, max) for every column.
print(df.describe())

(4391, 346)
               0            1            2            3            4    \
count  4391.000000  4391.000000  4391.000000  4391.000000  4391.000000   
mean      0.781210     0.495080     0.498823     0.494901     0.329507   
std       0.311243     0.296815     0.322054     0.316759     0.199229   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.576877     0.241379     0.222222     0.222222     0.160142   
50%       0.994437     0.482759     0.444444     0.444444     0.323843   
75%       0.997089     0.758621     0.777778     0.777778     0.485765   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

               5            6            7            8            9    ...  \
count  4391.000000  4391.000000  4391.000000  4391.000000  4391.000000  ...   
mean      0.053974     0.003872     0.066044     0.044669     0.028543  ...   
std       0.225992     0.062108     0.248388     0.099459     0.088200  ...   
min  

In [9]:
# Peek at the first five encoded training labels.
print("y_train: ", y_train[:5])

y_train:  [1 2 2 0 2]


In [10]:
# Collect the feature names that are later handed to the report generation.
# NOTE(review): X_train has 336 columns while X_train_preprocessed has 346, so
# these names may not line up one-to-one with the preprocessed features —
# confirm how the downstream report code uses them.
feature_names = X_train.columns  # Assuming the feature names are the columns
print("feature_names: ", feature_names)

feature_names:  Index(['id_log', 'aluno', 'grupo', 'num_dia', 'num_log', 'log_type',
       'ultimo_passo_correto', 'verificado_com_mouse',
       'verificado_com_teclado', 'idle_time_acumulado',
       ...
       'misc_OI_Mt_Plus_Sb_total', 'misc_OI_Dv_Plus_Sb_total',
       'misc_EqSec_Distrib_MtTerm_total', 'misc_OI_Mt_Minus_Mt_Plus_total',
       'misc_OI_Mt_Minus_Mt_Minus_total', 'misc_OI_Dv_Plus_Ad_total',
       'misc_EqPrim_Mt_Inc_total', 'misc_EqPrim_Dv_Inc_total',
       'misc_OI_Dv_Minus_Dv_Minus_total',
       'misc_EqSec_OpFrac_MMC_MtNumerador_total'],
      dtype='object', length=336)


In [11]:
# Sanity check before calling the training function: the feature matrix and
# the label vector should have the same number of rows.
print(f"X_train_preprocessed shape: {X_train_preprocessed.shape}")
print(f"y_train shape: {y_train.shape}")

X_train_preprocessed shape: (4391, 346)
y_train shape: (4391,)


In [12]:
# Search settings forwarded to training.train_model below as cv= and n_iter=.
cv = 10
n_iter = 100
# NOTE(review): classifier_params is defined here but is not passed to
# train_model in this cell — confirm whether it is still needed.
classifier_params = {'n_estimators': 100, 'random_state': 42}

# Choose the evaluation metric
scoring_metric = 'roc_auc_ovr'  # Can be 'f1_macro', 'balanced_accuracy', 'roc_auc_ovr', etc.

# Run training with Bayesian optimization
trained_models = training.train_model(
    X_train_preprocessed, y_train, training.BAYESIAN_OPTIMIZATION, n_iter=n_iter, cv=cv, scoring=scoring_metric
)


Training and evaluating Decision Tree with Bayesian Optimization and rfe:


In [None]:
# Output locations for the reports and the serialized models.
# NOTE(review): assumes both directories already exist or are created by the
# evaluation helpers — confirm.
dirpath = "../output/"
model_dir = "../models/"

# Feature names for the reports.
# The models are trained on X_train_preprocessed, whose column count can
# differ from the raw X_train (e.g. when preprocessing expands columns). When
# the counts differ, take the names from the fitted preprocessor instead so
# that each name refers to the column the model actually saw.
# NOTE(review): relies on the preprocessor exposing scikit-learn's
# get_feature_names_out(); if it does not, the raw column names are kept.
n_features = X_train_preprocessed.shape[1]
feature_names = X_train.columns
if len(feature_names) != n_features and hasattr(preprocessor, "get_feature_names_out"):
    # Keep the same container type (pandas Index) as X_train.columns.
    feature_names = pd.Index(preprocessor.get_feature_names_out())
if len(feature_names) != n_features:
    print(
        f"Warning: {len(feature_names)} feature names for {n_features} "
        "preprocessed features; feature names in the reports may be misaligned."
    )

# Report generation
reports = evaluation.generate_reports(trained_models, X_train_preprocessed, y_train, X_test_preprocessed, y_test, feature_names)

# Print the reports
# NOTE(review): the return value of print_reports is itself printed; if the
# helper only prints and returns None this emits a stray "None" — confirm.
print(evaluation.print_reports(reports, dirpath))

evaluation.save_reports_to_csv(reports, dirpath)

# Save all models
saved_models = evaluation.dump_all_models(trained_models, model_dir)
print("Modelos salvos:", saved_models)