In [162]:
import pandas as pd
import numpy as np
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_curve, auc

In [163]:
h2o.init(port=12345)

Checking whether there is an H2O instance running at http://localhost:12345. connected.


0,1
H2O_cluster_uptime:,2 hours 26 mins
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_lucasmalheiros_ikp0js
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.747 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [164]:
def convert_to_h2o_frame(data: pd.DataFrame) -> h2o.H2OFrame:
    """Converte um pd.DataFrame em um h2o.H2OFrame"""
    print('Performing conversion from pd.DataFrame to h2o.H2OFrame...')
    # Colunas de datas
    cols_to_date = [col for col in data.dtypes[data.dtypes == 'datetime64[ns]'].index]
    # Colunas a converter para categórica
    cols_to_factor = [col for col in data.dtypes[(data.dtypes == 'object') | (data.dtypes == 'category')].index]
    # Colunas a converter a numeric
    cols_numerics = [col for col in data.dtypes[(data.dtypes != 'object') & (data.dtypes != 'category') & (data.dtypes != 'datetime64[ns]')].index]
    # Adicionar os tipos de colunas ao dicionário col_dtypes
    col_dtypes = {}
    for col in cols_to_date:
        col_dtypes[col] = 'time'
    for col in cols_to_factor:
        col_dtypes[col] = 'enum'
    for col in cols_numerics:
        col_dtypes[col] = 'numeric'
    # Converte a h2o
    data = h2o.H2OFrame(data, column_types=col_dtypes, na_strings=['NA', 'none', 'None', 'nan', 'NaN', '<NA>'])
    print('Successful conversion from pd.DataFrame to h2o.H2OFrame.')
    return data

# Data preparation

In [165]:
# Resultados
df_resultados = pd.read_csv('resultados_instancias_tcc.csv')
df_resultados = df_resultados[['Instancias'] + [col for col in df_resultados.columns if col.startswith('Obj_') or col.startswith('Time_')]]
df_resultados = df_resultados[df_resultados['Obj_RF_T_0'] != np.inf]  # Remove infactíveis

In [166]:
# Features criadas de create_features_dataset.py
df_features = pd.read_csv('multi_plant_instance_features.csv')

In [167]:
df = df_features.merge(df_resultados, left_on='instance', right_on='Instancias', how='inner').drop(columns=['Instancias', 'instance'])

In [168]:
df.head(3)

Unnamed: 0,num_products,num_plants,num_periods,binary_vars,total_demand,min_demand,avg_demand,max_demand,std_demand,skew_demand,...,Obj_RF_6_2,Time_RF_6_2,Obj_RF_6_3,Time_RF_6_3,Obj_RF_6_4,Time_RF_6_4,Obj_RF_6_5,Time_RF_6_5,Obj_RF_T_0,Time_RF_T_0
0,120,2,12,2880,263136,1,91.366667,180,52.990103,-0.024016,...,810455.47,1202.5,,,,,,,809910.65,1800.1
1,60,20,12,14400,1305703,1,90.673819,180,51.952348,-0.010231,...,2404144.93,1320.7,,,,,,,2404240.72,952.5
2,120,6,12,8640,788211,1,91.228125,180,52.583777,-0.013395,...,1976005.9,1800.7,,,,,,,2106419.81,1800.2


# Construção de targets

In [169]:
obj_columns = [col for col in df.columns if col.startswith('Obj_')]
time_columns = [col for col in df.columns if col.startswith('Time_')]

## Target em função objetivo (BEST)

In [170]:
# Seleciona para cada instância o método com menor função objetivo para a coluna BEST
df['BEST'] = df[obj_columns].idxmin(axis=1)
df['BEST'] = df['BEST'].str.replace('Obj_', '')

In [171]:
df['BEST'].value_counts()

BEST
RF_T_0    190
RF_2_0     50
RF_1_0     47
RF_6_0     42
RF_3_1     40
RF_4_2     34
RF_3_0     31
RF_6_2     29
RF_4_1     29
RF_2_1     29
RF_6_1     23
RF_3_2     21
RF_4_3     17
RF_4_0     16
Name: count, dtype: int64

## Target com compromisso de tempo (BEST_TIME)

In [172]:
for obj_col, time_col in zip(obj_columns, time_columns):
    df[f'Adjusted_{obj_col}'] = (
        df[obj_col] * np.maximum(np.log(df[time_col]) / np.log(1000), 1)
    )
adjusted_columns = [col for col in df.columns if col.startswith('Adjusted_')]

In [173]:
# Seleciona para cada instância o método com menor função objetivo para a coluna BEST
df['BEST_TIME'] = df[adjusted_columns].idxmin(axis=1)
df['BEST_TIME'] = df['BEST_TIME'].str.replace('Adjusted_Obj_', '')

In [174]:
df['BEST_TIME'].value_counts()

BEST_TIME
RF_1_0    111
RF_T_0    106
RF_2_0     57
RF_6_0     41
RF_6_2     39
RF_4_2     35
RF_4_3     31
RF_2_1     31
RF_3_1     29
RF_3_2     28
RF_6_1     28
RF_3_0     26
RF_4_1     20
RF_4_0     16
Name: count, dtype: int64

# Preprocessamento de dados

## Drop Obj_ e Time_

In [175]:
df = df.drop(columns=obj_columns + time_columns + adjusted_columns)

In [176]:
# Embaralhar dataset
df = df.sample(frac=1)

In [177]:
# Transformar em classificação binária
# df['BEST'] = df['BEST'].apply(lambda x: 'Gurobi' if x == 'RF_T_0' else 'RF')
# df['BEST_TIME'] = df['BEST_TIME'].apply(lambda x: 'Gurobi' if x == 'RF_T_0' else 'RF')

## Convert para H2O frame

In [178]:
# Conversão para H2O frame
hf = convert_to_h2o_frame(df)

Performing conversion from pd.DataFrame to h2o.H2OFrame...
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Successful conversion from pd.DataFrame to h2o.H2OFrame.


In [179]:
# Train test split
hf_train, hf_test = hf.split_frame(ratios=[.8], seed=2112)

# Treino (GBM)

In [180]:
target = 'BEST_TIME'
predictors = [c for c in hf_train.columns if c not in ['BEST', 'BEST_TIME']]

In [181]:
gbm_model = H2OGradientBoostingEstimator(
    nfolds=20,
    keep_cross_validation_predictions=True,
    seed=2112,
    stopping_rounds=10,
    stopping_metric="AUTO",
    stopping_tolerance=0.001,
    balance_classes=False
)

In [182]:
xgboost_model = H2OXGBoostEstimator(booster='dart',
                                    normalize_type="tree",
                                    seed=2112,
                                    nfolds=10,
                                    keep_cross_validation_predictions=True)

In [183]:
# Hiper Parâmetros
hyper_params = {
    "learn_rate": list(
        np.round(np.arange(0.01, 0.1, 0.01), 3)
    ),  # Narrowed range for learning rate list(np.round(np.arange(0.05, 0.3, 0.05), 3)),
    "max_depth": list(
        np.arange(5, 10, 1)
    ),  # Reduced maximum depth list(np.arange(5, 15, 1)),
    "ntrees": list(
        np.arange(50, 150, 20)
    ),  # Increased number of trees list(np.arange(20, 70, 10)),
    "sample_rate": list(
        np.round(np.arange(0.5, 0.9, 0.1), 3)
    ),  # Adjusted sample rate range list(np.round(np.arange(0.5, 1.0, 0.1), 3)),
    "col_sample_rate": list(
        np.round(np.arange(0.5, 1.0, 0.1), 3)
    ),  # Slightly adjusted col sample rate range list(np.round(np.arange(0.3, 1.0, 0.1), 3)),
}
# Tipo de pesquisa da grid. Em caso de RandomDiscrete, corre até max_models(n modelos)
search_criteria = {
    "strategy": "RandomDiscrete",
    "max_models": 10,
    # "max_runtime_secs": 180,
    "seed": 1
}

In [184]:
# # Construir grid com parâmetros e critérios de procura de melhor modelo
# grid = H2OGridSearch(model=gbm_model, 
#                      search_criteria=search_criteria, 
#                      hyper_params=hyper_params)
# # Treinar grid
# grid.train(x=predictors, y=target, training_frame=hf_train)

In [185]:
# gbm_grid = grid.get_grid(sort_by="rmse", decreasing=False)
# model = gbm_grid.models[0]

In [186]:
model = gbm_model
model.train(x=predictors, y=target, training_frame=hf_train);

gbm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


# Avaliação das métricas de treino

In [187]:
# Variáveis mais importantes na escolha
importancia_var = model.varimp(use_pandas=True)
importancia_var

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,total_setup_cost,155.342667,1.000000,0.124708
1,total_utilization,50.985378,0.328212,0.040931
2,p25_setup_cost,49.838528,0.320830,0.040010
3,max_utilization,40.991787,0.263880,0.032908
4,min_capacity,39.677860,0.255422,0.031853
...,...,...,...,...
110,p75_inventory_cost,0.003626,0.000023,0.000003
111,num_products,0.000000,0.000000,0.000000
112,min_demand,0.000000,0.000000,0.000000
113,min_production_time,0.000000,0.000000,0.000000


In [190]:
# model.partial_plot(frame=hf_train, cols=['max_utilization'], figsize=(5, 5))#, targets=['RF_T_0', 'RF_1_0']);

In [191]:
# Previsões
actual = hf_train[target].as_data_frame(use_multi_thread=True)[target]
predictions = model.cross_validation_holdout_predictions()
predict = predictions[0].as_data_frame(use_multi_thread=True)['predict']





In [192]:
# Confusion matrix
cm = confusion_matrix(actual, predict)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[79  8  0  2  0  1  0  0  0  0  1  0  0  0]
 [16 16  3  4  1  0  2  1  0  0  3  0  0  0]
 [ 2  4 16  0  1  0  0  0  1  0  0  0  1  0]
 [ 2  7  0  5  1  0  0  1  2  0  2  1  1  0]
 [ 2  4  0  2  5  2  0  3  1  2  1  0  0  1]
 [ 0  0  0  0  2  7  1  1  2  5  2  1  2  3]
 [ 2  0  1  1  0  0  1  0  1  0  2  0  1  1]
 [ 3  1  1  0  3  3  0  0  3  1  1  0  1  1]
 [ 0  0  0  1  1  0  0  4  3  6  2  2  4  3]
 [ 0  0  0  0  2  3  0  1  4  3  1  0  6  2]
 [ 0  5  0  2  0  1  3  1  1  1  5  1  6  8]
 [ 0  2  1  1  0  0  0  1  4  0  2  3  6  4]
 [ 1  0  0  0  0  3  1  1  1  5  6  1 11  7]
 [ 4  0  0  0  0  3  0  0  0  3  8  3  4 62]]


In [193]:
# Precisão
cm.trace() / cm.sum()

np.float64(0.439918533604888)

In [194]:
recall_per_class = np.diag(cm) / np.sum(cm, axis=1)
macro_average_recall = np.mean(recall_per_class)
print(f"Macro-Average Recall: {macro_average_recall:.4f}")

Macro-Average Recall: 0.3003


In [195]:
# F1 score
f1 = f1_score(actual, predict, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.4261


# Avaliação no dataset de teste

In [196]:
# Previsões sobre teste
actual_test = hf_test[target].as_data_frame(use_multi_thread=True)[target]
predictions_test = model.predict(hf_test)
predict_test = predictions_test[0].as_data_frame(use_multi_thread=True)['predict']

gbm prediction progress: |




███████████████████████████████████████████████████████| (done) 100%





In [197]:
# Confusion matrix
cm = confusion_matrix(actual_test, predict_test)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[16  3  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 6  1  2  0  0  0  1  0  0  0  0  1  0  0]
 [ 1  0  3  0  0  0  0  1  0  0  0  1  0  0]
 [ 2  0  1  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  1  4  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 1  0  0  1  1  0  0  1  0  0  1  1  0  0]
 [ 0  0  0  1  0  0  0  1  0  0  0  0  0  0]
 [ 0  1  0  0  2  0  1  0  1  2  0  1  0  1]
 [ 0  0  0  0  1  0  0  0  1  1  0  0  3  3]
 [ 0  0  0  0  1  2  0  0  0  0  1  1  1  1]
 [ 0  0  0  1  0  0  0  0  0  0  2  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  1  0]
 [ 0  0  0  0  1  1  1  0  0  0  1  0  3 12]]


In [198]:
# Precisão
cm.trace() / cm.sum()

np.float64(0.40186915887850466)

In [199]:
# F1 score
f1 = f1_score(actual_test, predict_test, average='weighted')
print(f'\nF1 Score: {f1:.4f}')


F1 Score: 0.3730


In [200]:
# h2o.cluster().shutdown()