In [176]:
import pandas as pd
import numpy as np
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_curve, auc

In [177]:
# Connect to the H2O instance on localhost port 12345
# (per the H2O docs, h2o.init starts a local instance when none is reachable).
h2o.init(port=12345)

Checking whether there is an H2O instance running at http://localhost:12345. connected.


0,1
H2O_cluster_uptime:,2 days 3 hours 27 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_lucasmalheiros_6hh0ls
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.470 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [178]:
def convert_to_h2o_frame(data: pd.DataFrame) -> h2o.H2OFrame:
    """Convert a pandas DataFrame into an H2OFrame with explicit column types.

    The H2O type of each column is derived from its pandas dtype:

    * any datetime dtype (any resolution, tz-naive or tz-aware) -> ``'time'``
    * ``object``, ``category`` and pandas ``string`` dtypes -> ``'enum'``
    * everything else -> ``'numeric'``

    Dtype predicates are used instead of comparing against the literal strings
    ``'datetime64[ns]'`` / ``'object'``, so that non-nanosecond or tz-aware
    datetimes and pandas ``string`` columns are not silently sent as numeric.

    Args:
        data: Frame to upload to the H2O cluster.

    Returns:
        The H2OFrame built from ``data``.
    """
    print('Performing conversion from pd.DataFrame to h2o.H2OFrame...')
    # Map every column name to the H2O column type it must be parsed as.
    col_dtypes = {}
    for col, dtype in data.dtypes.items():
        if pd.api.types.is_datetime64_any_dtype(dtype):
            col_dtypes[col] = 'time'
        elif (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ):
            col_dtypes[col] = 'enum'
        else:
            col_dtypes[col] = 'numeric'
    # Build the H2O frame; the listed tokens are parsed as missing values.
    frame = h2o.H2OFrame(data, column_types=col_dtypes, na_strings=['NA', 'none', 'None', 'nan', 'NaN', '<NA>'])
    print('Successful conversion from pd.DataFrame to h2o.H2OFrame.')
    return frame

# Data preparation

In [179]:
# Solver results: keep the instance id plus every objective (Obj_*) and
# runtime (Time_*) column.
df_resultados = pd.read_csv('resultados_instancias_tcc.csv')
df_resultados = df_resultados.loc[
    :, ['Instancias', *(c for c in df_resultados.columns if c.startswith(('Obj_', 'Time_')))]
]
# Remove infeasible instances (those whose Obj_RF_T_0 is +inf).
df_resultados = df_resultados.loc[df_resultados['Obj_RF_T_0'].ne(np.inf)]

In [None]:
# Instance features generated by create_features_dataset.py
df_features = pd.read_csv('multi_plant_instance_features.csv')

In [181]:
# Join features with solver results on the instance id (inner join: only
# instances present in both tables survive), then drop both id columns.
df = pd.merge(
    df_features,
    df_resultados,
    left_on='instance',
    right_on='Instancias',
    how='inner',
).drop(columns=['Instancias', 'instance'])

In [182]:
# Preview the first rows of the merged features + results table
df.head(3)

Unnamed: 0,num_products,num_plants,num_periods,binary_vars,total_demand,avg_demand_per_product,variance_demand_per_product,std_demand_per_product,mean_utilization,max_utilization,...,Obj_RF_4_2,Time_RF_4_2,Obj_RF_4_3,Time_RF_4_3,Obj_RF_6_0,Time_RF_6_0,Obj_RF_6_1,Time_RF_6_1,Obj_RF_T_0,Time_RF_T_0
0,120,2,12,2880,263136,91.366667,2807.950972,52.990103,1.000118,1.067356,...,,,,,829221.1,1800.13,811118.49,1200.63,809910.65,1800.1
1,60,20,12,14400,1305703,90.673819,2699.046453,51.952348,1.000154,1.143479,...,,,,,2404264.97,1069.06,2404219.27,1018.31,2404240.72,952.5
2,120,6,12,8640,788211,91.228125,2765.05363,52.583777,0.999516,1.092181,...,,,,,2000140.81,1800.32,1991086.02,1214.94,2106419.81,1800.2


# Construção de targets

In [183]:
# Names of the per-method objective-value and runtime columns, in frame order.
obj_columns = df.columns[df.columns.str.startswith('Obj_')].tolist()
time_columns = df.columns[df.columns.str.startswith('Time_')].tolist()

## Target em função objetivo (BEST)

In [184]:
# BEST = for each instance, the method with the smallest objective value
# (the winning column label with its 'Obj_' prefix removed).
df['BEST'] = df[obj_columns].idxmin(axis=1).str.replace('Obj_', '')

In [185]:
# Number of instances won by each method (objective value only)
df['BEST'].value_counts()

BEST
RF_T_0    221
RF_6_0     73
RF_3_1     60
RF_2_0     52
RF_1_0     48
RF_6_1     45
RF_2_1     36
RF_3_0     33
RF_4_0     30
Name: count, dtype: int64

## Target com compromisso de tempo (BEST_TIME)

In [186]:
# Time-aware objective: multiply each objective by max(log_1000(time), 1).
# Runs of up to 1000 s keep their objective unchanged; longer runs are
# inflated by the base-1000 log of their runtime (e.g. 1800 s -> ~1.085x),
# so a slightly better objective obtained much more slowly can lose.
for obj_col in obj_columns:
    # Pair each objective with its runtime column by method name instead of by
    # list position: a missing Time_ column now raises a KeyError rather than
    # being silently mispaired or truncated by zip().
    time_col = obj_col.replace('Obj_', 'Time_', 1)
    time_penalty = np.maximum(np.log(df[time_col]) / np.log(1000), 1)
    df[f'Adjusted_{obj_col}'] = df[obj_col] * time_penalty
adjusted_columns = [col for col in df.columns if col.startswith('Adjusted_')]

In [187]:
# BEST_TIME = for each instance, the method with the smallest time-adjusted
# objective (the winning column label with its 'Adjusted_Obj_' prefix removed).
df['BEST_TIME'] = df[adjusted_columns].idxmin(axis=1).str.replace('Adjusted_Obj_', '')

In [188]:
# Number of instances won by each method once runtime is taken into account
df['BEST_TIME'].value_counts()

BEST_TIME
RF_T_0    119
RF_1_0    115
RF_3_1     72
RF_6_0     72
RF_6_1     69
RF_2_0     58
RF_2_1     35
RF_3_0     30
RF_4_0     28
Name: count, dtype: int64

# Preprocessamento de dados

## Drop Obj_ e Time_

In [189]:
# The raw objective, runtime and adjusted-objective columns were only needed
# to derive the labels; remove them so only features and labels remain.
df = df.drop(columns=[*obj_columns, *time_columns, *adjusted_columns])

In [190]:
# Shuffle the dataset. The shuffle is seeded (same seed as the split and the
# models) so that row order, and therefore the downstream train/test split and
# every reported metric, is reproducible across kernel restarts.
df = df.sample(frac=1, random_state=2112)

In [191]:
# Turn both targets into a binary classification: 'Gurobi' when the winning
# method is RF_T_0, 'RF' for every other method.
# 'Gurobi' is accepted as an input value too, so re-running this cell is a
# no-op instead of relabelling every row as 'RF'.
for label_col in ('BEST', 'BEST_TIME'):
    df[label_col] = df[label_col].isin(['RF_T_0', 'Gurobi']).map({True: 'Gurobi', False: 'RF'})

## Conversão para H2O frame

In [192]:
# Convert the prepared pandas frame (features + both labels) to an H2OFrame
hf = convert_to_h2o_frame(df)

Performing conversion from pd.DataFrame to h2o.H2OFrame...
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Successful conversion from pd.DataFrame to h2o.H2OFrame.


In [193]:
# Train/test split with a requested 80/20 ratio and a fixed seed.
# NOTE(review): the resulting sizes are approximate, not exactly 80/20 —
# the confusion matrices below total 491 train rows vs 107 test rows.
hf_train, hf_test = hf.split_frame(ratios=[.8], seed=2112)

# Treino (GBM)

In [194]:
# Model the time-aware label. Both label columns are withheld from the
# predictors so that neither target can leak into the features.
target = 'BEST_TIME'
predictors = [name for name in hf_train.columns if name not in {'BEST', target}]

In [195]:
# Gradient boosting classifier configuration.
gbm_model = H2OGradientBoostingEstimator(
    nfolds=20,  # 20-fold cross-validation
    keep_cross_validation_predictions=True,  # required to read the CV holdout predictions after training
    seed=2112,  # same seed as the train/test split
    # Early stopping settings; "AUTO" lets H2O choose the stopping metric.
    stopping_rounds=10,
    stopping_metric="AUTO",
    stopping_tolerance=0.001,
    balance_classes=False  # classes are left at their natural proportions
)

In [196]:
# XGBoost (DART booster) classifier configuration with 10-fold cross-validation.
# NOTE(review): in the cells visible here only gbm_model is trained; this
# estimator is configured but never fitted — confirm whether it is still needed.
xgboost_model = H2OXGBoostEstimator(booster='dart',
                                    normalize_type="tree",
                                    seed=2112,
                                    nfolds=10,
                                    keep_cross_validation_predictions=True)

In [197]:
# # Hiper Parâmetros
# hyper_params = {
#     "learn_rate": list(
#         np.round(np.arange(0.01, 0.1, 0.01), 3)
#     ),  # Narrowed range for learning rate list(np.round(np.arange(0.05, 0.3, 0.05), 3)),
#     "max_depth": list(
#         np.arange(5, 10, 1)
#     ),  # Reduced maximum depth list(np.arange(5, 15, 1)),
#     "ntrees": list(
#         np.arange(50, 150, 20)
#     ),  # Increased number of trees list(np.arange(20, 70, 10)),
#     "sample_rate": list(
#         np.round(np.arange(0.5, 0.9, 0.1), 3)
#     ),  # Adjusted sample rate range list(np.round(np.arange(0.5, 1.0, 0.1), 3)),
#     "col_sample_rate": list(
#         np.round(np.arange(0.5, 1.0, 0.1), 3)
#     ),  # Slightly adjusted col sample rate range list(np.round(np.arange(0.3, 1.0, 0.1), 3)),
# }
# # Tipo de pesquisa da grid. Em caso de RandomDiscrete, corre até max_models(n modelos)
# search_criteria = {
#     "strategy": "RandomDiscrete",
#     "max_models": 10,
#     # "max_runtime_secs": 180,
#     "seed": 1
# }

In [198]:
# # Construir grid com parâmetros e critérios de procura de melhor modelo
# grid = H2OGridSearch(model=gbm_model, 
#                      search_criteria=search_criteria, 
#                      hyper_params=hyper_params)
# # Treinar grid
# grid.train(x=predictors, y=target, training_frame=hf_train);

In [199]:
# gbm_grid = grid.get_grid(sort_by="rmse", decreasing=False)
# model = gbm_grid.models[0]

In [200]:
# Use the GBM as the model under evaluation and fit it on the training frame.
# `model` is an alias of gbm_model (same object), not a copy.
model = gbm_model
model.train(x=predictors, y=target, training_frame=hf_train);

gbm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


# Avaliação das métricas de treino (validação cruzada)

In [201]:
# Variable importances of the trained model, as a pandas DataFrame
importancia_var = model.varimp(use_pandas=True)
importancia_var

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,total_transportation_cost,64.402725,1.0,0.192732
1,max_utilization,63.911087,0.992366,0.191261
2,avg_demand_to_setup_cost_ratio,30.042149,0.466473,0.089904
3,total_cost_to_demand_ratio,23.898941,0.371086,0.07152
4,demand_to_capacity_ratio,20.993773,0.325976,0.062826
5,num_plants,18.067631,0.280541,0.054069
6,std_setup_cost,13.769557,0.213804,0.041207
7,variance_setup_cost,11.323841,0.175829,0.033888
8,std_utilization,11.30448,0.175528,0.03383
9,avg_setup_cost,10.781529,0.167408,0.032265


In [202]:
# model.partial_plot(frame=hf_train, cols=['total_transportation_cost'], figsize=(5, 5), targets=['RF_T_0', 'RF_1_0']);

In [203]:
# Predictions on the training frame, taken from the cross-validation holdouts
# (available because the model keeps its cross-validation predictions).
# True labels of the training frame, as a pandas Series.
actual = hf_train[target].as_data_frame(use_multi_thread=True)[target]
predictions = model.cross_validation_holdout_predictions()
# The first column of the prediction frame holds the predicted class ('predict').
predict = predictions[0].as_data_frame(use_multi_thread=True)['predict']





In [204]:
# Confusion matrix of the cross-validation holdout predictions.
# Rows are true classes and columns are predicted classes; scikit-learn sorts
# the labels, so index 0 is 'Gurobi' and index 1 is 'RF'.
cm = confusion_matrix(actual, predict)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[ 73  25]
 [ 13 380]]


In [205]:
# Accuracy (not precision): correctly classified rows (the diagonal) over all rows
cm.trace() / cm.sum()

np.float64(0.9226069246435845)

In [206]:
# F1 score, averaged over both classes weighted by each class's support
f1 = f1_score(actual, predict, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.9207


# Avaliação no dataset de teste

In [207]:
# Predictions on the held-out test frame.
# True labels of the test frame, as a pandas Series.
actual_test = hf_test[target].as_data_frame(use_multi_thread=True)[target]
predictions_test = model.predict(hf_test)
# The first column of the prediction frame holds the predicted class ('predict').
predict_test = predictions_test[0].as_data_frame(use_multi_thread=True)['predict']

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%






In [208]:
# Confusion matrix on the test frame (rows: true class, columns: predicted
# class; label order is sorted, i.e. 'Gurobi' then 'RF').
# NOTE: this rebinds `cm`, replacing the cross-validation matrix computed earlier.
cm = confusion_matrix(actual_test, predict_test)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[16  5]
 [ 2 84]]


In [209]:
# Test accuracy (not precision): correctly classified rows (the diagonal) over all rows
cm.trace() / cm.sum()

np.float64(0.9345794392523364)

In [210]:
# F1 score on the test frame, averaged over both classes weighted by support
f1 = f1_score(actual_test, predict_test, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.9326
