In [75]:
import pandas as pd
import numpy as np
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_curve, auc

In [76]:
# Connect to an H2O server on local port 12345; the log below shows one being
# started when none is already running there.
h2o.init(port=12345)

Checking whether there is an H2O instance running at http://localhost:12345..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.5" 2024-10-15; OpenJDK Runtime Environment (build 21.0.5+11-Ubuntu-1ubuntu124.04); OpenJDK 64-Bit Server VM (build 21.0.5+11-Ubuntu-1ubuntu124.04, mixed mode, sharing)
  Starting server from /home/lucasmalheiros/Documents/Pesquisa Operacional/tcc-relax-and-fix/venv/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpxa073qfg
  JVM stdout: /tmp/tmpxa073qfg/h2o_lucasmalheiros_started_from_python.out
  JVM stderr: /tmp/tmpxa073qfg/h2o_lucasmalheiros_started_from_python.err
  Server is running at http://127.0.0.1:12345
Connecting to H2O server at http://127.0.0.1:12345 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,H2O_from_python_lucasmalheiros_n6s9g3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.868 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [77]:
def convert_to_h2o_frame(data: pd.DataFrame) -> h2o.H2OFrame:
    """Convert a pandas DataFrame into an H2OFrame with explicit column types.

    Each column is mapped to an H2O column type from its pandas dtype:

    * any datetime64 dtype (naive or timezone-aware, any resolution) -> ``'time'``
    * object, categorical and pandas string dtypes -> ``'enum'``
    * every other dtype -> ``'numeric'``

    Args:
        data: Frame to upload to the connected H2O cluster.

    Returns:
        The resulting ``h2o.H2OFrame``.
    """
    print('Performing conversion from pd.DataFrame to h2o.H2OFrame...')

    def _h2o_type(dtype) -> str:
        """Return the H2O column type name for a single pandas dtype."""
        # is_datetime64_any_dtype also matches tz-aware and non-nanosecond
        # datetimes; an equality test against 'datetime64[ns]' misses them and
        # they would fall through to 'numeric'.
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return 'time'
        # Text-like columns (object, category, pandas string dtype) become
        # categorical factors.
        if (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        ):
            return 'enum'
        return 'numeric'

    col_dtypes = {col: _h2o_type(dtype) for col, dtype in data.dtypes.items()}

    # The strings in na_strings are treated as missing values during parsing.
    # A separate name is used for the result so the `data` argument is not
    # rebound to a different type.
    h2o_frame = h2o.H2OFrame(
        data,
        column_types=col_dtypes,
        na_strings=['NA', 'none', 'None', 'nan', 'NaN', '<NA>'],
    )
    print('Successful conversion from pd.DataFrame to h2o.H2OFrame.')
    return h2o_frame

# Data preparation

In [78]:
# Solver results: keep the instance identifier plus every objective ("Obj_")
# and runtime ("Time_") column, then discard infeasible instances, i.e. rows
# whose RF_T_0 objective is infinite.
resultados_raw = pd.read_csv('resultados_instancias_tcc.csv')
metric_columns = [
    name for name in resultados_raw.columns
    if name.startswith(('Obj_', 'Time_'))
]
df_resultados = resultados_raw[['Instancias'] + metric_columns]
is_feasible = df_resultados['Obj_RF_T_0'] != np.inf
df_resultados = df_resultados[is_feasible]

In [79]:
# Instance features generated by create_features_dataset.py
df_features = pd.read_csv('multi_plant_instance_features.csv')

In [80]:
# Inner-join features with solver results on the instance identifier, then
# drop both key columns so only features and result metrics remain.
df = df_features.merge(
    df_resultados,
    how='inner',
    left_on='instance',
    right_on='Instancias',
).drop(columns=['instance', 'Instancias'])

In [81]:
# Preview the first three rows of the merged frame.
df.head(3)

Unnamed: 0,num_products,num_plants,num_periods,binary_vars,total_demand,avg_demand_per_product,variance_demand_per_product,std_demand_per_product,mean_utilization,max_utilization,...,Obj_RF_4_2,Time_RF_4_2,Obj_RF_4_3,Time_RF_4_3,Obj_RF_6_0,Time_RF_6_0,Obj_RF_6_1,Time_RF_6_1,Obj_RF_T_0,Time_RF_T_0
0,120,2,12,2880,263136,91.366667,2807.950972,52.990103,1.000118,1.067356,...,,,,,829221.1,1800.13,811118.49,1200.63,809910.65,1800.1
1,60,20,12,14400,1305703,90.673819,2699.046453,51.952348,1.000154,1.143479,...,,,,,2404264.97,1069.06,2404219.27,1018.31,2404240.72,952.5
2,120,6,12,8640,788211,91.228125,2765.05363,52.583777,0.999516,1.092181,...,,,,,2000140.81,1800.32,1991086.02,1214.94,2106419.81,1800.2


# Construção de targets

In [82]:
# Names of the per-method objective ("Obj_") and runtime ("Time_") columns,
# in the order they appear in the frame.
obj_columns = list(filter(lambda name: name.startswith('Obj_'), df.columns))
time_columns = list(filter(lambda name: name.startswith('Time_'), df.columns))

## Target em função objetivo (BEST)

In [83]:
# For each instance, the method whose objective value is smallest becomes the
# BEST label (the winning column name with its 'Obj_' prefix stripped).
best_obj_column = df[obj_columns].idxmin(axis=1)
df['BEST'] = best_obj_column.str.replace('Obj_', '')

In [84]:
# Class distribution of the BEST target.
df['BEST'].value_counts()

BEST
RF_T_0    221
RF_6_0     73
RF_3_1     60
RF_2_0     52
RF_1_0     48
RF_6_1     45
RF_2_1     36
RF_3_0     33
RF_4_0     30
Name: count, dtype: int64

## Target com compromisso de tempo (BEST_TIME)

In [85]:
# Time-penalised objective: each method's objective is multiplied by
# log(runtime) / log(1000) whenever that factor exceeds 1 (runtime above 1000);
# faster runs keep their objective unchanged.
# The runtime column is looked up by method name rather than by list position,
# so a missing or re-ordered Time_ column raises a KeyError instead of silently
# pairing an objective with another method's runtime.
for obj_col in obj_columns:
    time_col = 'Time_' + obj_col.removeprefix('Obj_')
    time_penalty = np.maximum(np.log(df[time_col]) / np.log(1000), 1)
    df[f'Adjusted_{obj_col}'] = df[obj_col] * time_penalty
adjusted_columns = [col for col in df.columns if col.startswith('Adjusted_')]

In [86]:
# For each instance, the method with the smallest time-adjusted objective
# becomes the BEST_TIME label (the winning column name with its
# 'Adjusted_Obj_' prefix stripped).
best_adjusted_column = df[adjusted_columns].idxmin(axis=1)
df['BEST_TIME'] = best_adjusted_column.str.replace('Adjusted_Obj_', '')

In [87]:
# Class distribution of the BEST_TIME target.
df['BEST_TIME'].value_counts()

BEST_TIME
RF_T_0    119
RF_1_0    115
RF_3_1     72
RF_6_0     72
RF_6_1     69
RF_2_0     58
RF_2_1     35
RF_3_0     30
RF_4_0     28
Name: count, dtype: int64

# Preprocessamento de dados

## Drop Obj_ e Time_

In [88]:
# Remove the raw objective, runtime and adjusted-objective columns: the targets
# were derived from them, so they must not remain as predictors.
target_source_columns = [*obj_columns, *time_columns, *adjusted_columns]
df = df.drop(columns=target_source_columns)

In [89]:
# Shuffle the rows. The seed is fixed (same value as the split and model seeds
# used elsewhere in this notebook) so that the row order, and therefore the
# seeded train/test split, is reproducible across runs.
df = df.sample(frac=1, random_state=2112)

In [90]:
# Transformar em classificação binária
# df['BEST'] = df['BEST'].apply(lambda x: 'Gurobi' if x == 'RF_T_0' else 'RF')
# df['BEST_TIME'] = df['BEST_TIME'].apply(lambda x: 'Gurobi' if x == 'RF_T_0' else 'RF')

## Conversão para H2O frame

In [91]:
# Convert the prepared pandas frame to an H2OFrame
hf = convert_to_h2o_frame(df)

Performing conversion from pd.DataFrame to h2o.H2OFrame...
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Successful conversion from pd.DataFrame to h2o.H2OFrame.


In [None]:
# Train/test split: ratio 0.8 for the first frame, remainder for the second,
# with a fixed seed.
hf_train, hf_test = hf.split_frame(ratios=[.8], seed=2112)

# Treino (GBM)

In [93]:
# Response column to model, and the predictor list: every training column
# except the two label columns.
target = 'BEST_TIME'
label_columns = ('BEST', 'BEST_TIME')
predictors = [name for name in hf_train.columns if name not in label_columns]

In [94]:
# Base GBM configuration: 20-fold cross-validation with the hold-out
# predictions kept (they are read back later for evaluation), early stopping
# settings, a fixed seed, and class balancing switched off.
gbm_params = dict(
    nfolds=20,
    keep_cross_validation_predictions=True,
    seed=2112,
    stopping_rounds=10,
    stopping_metric="AUTO",
    stopping_tolerance=0.001,
    balance_classes=False,
)
gbm_model = H2OGradientBoostingEstimator(**gbm_params)

In [95]:
# Alternative XGBoost estimator (DART booster, tree normalisation) with
# 10-fold cross-validation and kept hold-out predictions.
# NOTE(review): no later cell visible here references `xgboost_model`.
xgboost_model = H2OXGBoostEstimator(
    nfolds=10,
    keep_cross_validation_predictions=True,
    seed=2112,
    booster='dart',
    normalize_type="tree",
)

In [None]:
# Hyper-parameter search space for the grid search.
# Every range ends with ``.tolist()`` so the lists hold native Python ints and
# floats rather than NumPy scalars; the numeric values are unchanged.
# Each "previously" note records the older range kept in the original comments.
hyper_params = {
    # Narrowed learning-rate range (previously np.arange(0.05, 0.3, 0.05)).
    "learn_rate": np.round(np.arange(0.01, 0.1, 0.01), 3).tolist(),
    # Reduced maximum depth (previously np.arange(5, 15, 1)).
    "max_depth": np.arange(5, 10, 1).tolist(),
    # Increased number of trees (previously np.arange(20, 70, 10)).
    "ntrees": np.arange(50, 150, 20).tolist(),
    # Adjusted row sample rate (previously np.arange(0.5, 1.0, 0.1)).
    "sample_rate": np.round(np.arange(0.5, 0.9, 0.1), 3).tolist(),
    # Slightly adjusted column sample rate (previously np.arange(0.3, 1.0, 0.1)).
    "col_sample_rate": np.round(np.arange(0.5, 1.0, 0.1), 3).tolist(),
}
# Grid search strategy. With RandomDiscrete, the search runs until `max_models`
# models have been built.
search_criteria = {
    "strategy": "RandomDiscrete",
    "max_models": 10,
    # "max_runtime_secs": 180,
    "seed": 1
}

In [None]:
# Build the grid search from the base GBM, the hyper-parameter space and the
# search criteria defined above.
grid = H2OGridSearch(
    model=gbm_model,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)
# Train the grid on the training frame.
grid.train(x=predictors, y=target, training_frame=hf_train)

In [98]:
# gbm_grid = grid.get_grid(sort_by="rmse", decreasing=False)
# model = gbm_grid.models[0]

In [99]:
# Use the base GBM directly (the grid-search model selection is commented out).
# `model` is an alias of `gbm_model`, not a copy.
model = gbm_model
# The trailing semicolon suppresses the notebook's echo of the return value.
model.train(x=predictors, y=target, training_frame=hf_train);

gbm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


# Avaliação das métricas de treino (previsões de validação cruzada)

In [100]:
# Variable importances of the trained model, as a pandas DataFrame
importancia_var = model.varimp(use_pandas=True)
importancia_var

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,max_utilization,108.475243,1.0,0.080883
1,demand_to_cost_interaction,102.533195,0.945222,0.076452
2,time_per_unit_of_cost,77.153641,0.711256,0.057528
3,min_utilization,67.361542,0.620985,0.050227
4,kurt_capacity,65.257256,0.601587,0.048658
5,avg_demand_to_setup_cost_ratio,64.747795,0.59689,0.048278
6,demand_to_capacity_ratio,62.153107,0.57297,0.046343
7,skew_transportation_cost,50.027954,0.461192,0.037303
8,avg_setup_cost,49.348167,0.454926,0.036796
9,kurt_transportation_cost,48.113979,0.443548,0.035875


In [101]:
# model.partial_plot(frame=hf_train, cols=['total_transportation_cost'], figsize=(5, 5), targets=['RF_T_0', 'RF_1_0']);

In [102]:
# Predictions
# True labels of the training frame, read back into pandas.
actual = hf_train[target].as_data_frame(use_multi_thread=True)[target]
# Cross-validation hold-out predictions (the model was configured with
# keep_cross_validation_predictions=True).
predictions = model.cross_validation_holdout_predictions()
# First column of the prediction frame, read back as the 'predict' column.
predict = predictions[0].as_data_frame(use_multi_thread=True)['predict']





In [103]:
# Confusion matrix of the cross-validation hold-out predictions against the
# training labels (scikit-learn convention: rows = true class, columns =
# predicted class).
# NOTE(review): the name `cm` is reassigned later for the test set.
cm = confusion_matrix(actual, predict)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[80  8  1  1  2  0  0  1  0]
 [16 14  4  3  2  0  2  3  0]
 [ 3  3 16  1  3  1  0  4  0]
 [ 1  5  1  7  2  2  3  3  0]
 [ 3  1  3  4 33  4  4  5  3]
 [ 4  0  1  1  6  5  5  3  1]
 [ 0  2  0  2  2  4 23  9 14]
 [ 2  0  3  2  8  3 12 18  9]
 [ 4  1  0  1  1  1 13  8 71]]


In [105]:
# Accuracy (not precision): correctly classified instances, i.e. the diagonal
# of `cm`, divided by the total number of instances.
cm.trace() / cm.sum()

np.float64(0.5437881873727087)

In [120]:
# Macro-averaged recall of the cross-validation hold-out predictions.
# The confusion matrix is rebuilt here from `actual`/`predict` instead of
# reading the shared `cm` variable: a later cell reassigns `cm` to the test-set
# matrix, so running this cell out of order would report the test-set recall
# under this heading.
cm_cv = confusion_matrix(actual, predict)
# Rows are true classes, so diagonal / row sum is the recall of each class.
recall_per_class = np.diag(cm_cv) / np.sum(cm_cv, axis=1)
macro_average_recall = np.mean(recall_per_class)
print(f"Macro-Average Recall: {macro_average_recall:.4f}")

Macro-Average Recall: 0.4251


In [106]:
# Weighted-average F1 score of the cross-validation hold-out predictions
f1 = f1_score(actual, predict, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.5353


# Avaliação no dataset de teste

In [107]:
# Predictions on the test set
# True labels of the held-out test frame, read back into pandas.
actual_test = hf_test[target].as_data_frame(use_multi_thread=True)[target]
# Score the test frame with the trained model.
predictions_test = model.predict(hf_test)
# First column of the prediction frame, read back as the 'predict' column.
predict_test = predictions_test[0].as_data_frame(use_multi_thread=True)['predict']




gbm prediction progress: |

███████████████████████████████████████████████████████| (done) 100%





In [108]:
# Confusion matrix on the test set (scikit-learn convention: rows = true class,
# columns = predicted class).
# NOTE(review): this overwrites the `cm` computed earlier for the
# cross-validation predictions.
cm = confusion_matrix(actual_test, predict_test)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[14  5  1  0  0  1  0  0  1]
 [ 5  5  0  3  1  0  0  0  0]
 [ 0  1  1  0  2  0  0  0  0]
 [ 1  0  0  1  0  1  1  1  1]
 [ 0  0  0  2  9  0  0  0  1]
 [ 0  0  0  0  1  0  0  0  1]
 [ 0  0  0  2  1  0  7  2  4]
 [ 0  0  0  2  3  0  0  4  3]
 [ 0  0  0  0  0  0  1  1 17]]


In [109]:
# Accuracy (not precision) on the test set: diagonal of `cm` divided by the
# total number of instances.
cm.trace() / cm.sum()

np.float64(0.5420560747663551)

In [110]:
# Weighted-average F1 score on the test set
# NOTE(review): this overwrites the `f1` computed earlier for the
# cross-validation predictions.
f1 = f1_score(actual_test, predict_test, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.5355


In [111]:
# Shut down the H2O cluster used by this notebook.
h2o.cluster().shutdown()

H2O session _sid_8a5b closed.
