# 1. Load Packages

In [None]:
from gandalf_doe.experiment import Experiment
from gandalf_doe.domain import Domain, Variable
import matplotlib.pylab as plt
from matplotlib import font_manager
from matplotlib.font_manager import FontProperties
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold, RepeatedStratifiedKFold, GridSearchCV
import pickle
import pandas as pd

# 2. Data Splitting

In [None]:
# Training partition: the "Training" sheet of the workbook, with the first
# spreadsheet column used as the row index.
df_train = pd.read_excel(
    "../test_data/dataset.xlsx",
    sheet_name="Training",
    index_col=0,
)
df_train.head(1)  # show the first row as a quick check of the columns

In [None]:
# Test partition: the "Test" sheet of the same workbook, with the first
# spreadsheet column used as the row index.
df_test = pd.read_excel(
    "../test_data/dataset.xlsx",
    sheet_name="Test",
    index_col=0,
)
df_test.head(1)  # show the first row as a quick check of the columns

In [None]:
# Stack the training rows on top of the test rows and renumber them
# 0..n-1, discarding the original spreadsheet indices. The integer labels
# produced here are the ones used to pick the interpolation hold-out.
all_samples = pd.concat([df_train, df_test], ignore_index=True)
all_samples.head(1)

In [None]:
# Determined by Mahalanobis distance
# (the selection is not computed in this notebook; only its result is
# recorded here).
#
# Row labels of ``all_samples`` that form the interpolation test set. The
# list is defined once so the train and test partitions are guaranteed to
# be exact complements of each other.
inter_test_index = [5, 9, 12, 13, 16, 17, 21, 31, 37, 43]

inter_train = all_samples.drop(inter_test_index)  # everything except the hold-out
inter_test = all_samples.loc[inter_test_index]    # the hold-out rows, in list order

# 3. Data Normalization

In [None]:
# Names of the seven input columns, in the order used for every feature
# matrix in this notebook. The same name is re-derived from the domain
# definition further down.
variables = [
    'Temperature',
    'Pressure',
    'GHSV',
    'Ni',
    'Co',
    'Calcination',
    'Reduction',
]

In [None]:
# Per-feature upper and lower bounds (seven entries each), presumably in the
# same order as ``variables`` — TODO confirm.
# NOTE(review): the temperature maximum here is 763, while the domain grid
# defined below (np.arange(523, 783, 10)) extends to 773 — confirm which
# bound is intended.
scale_max = np.array([763.0, 10.0, 26400.0, 25.0, 10.0, 923.0, 923.0])
scale_min = np.array([523.0, 1.0, 3300.0, 0.0, 0.0, 623.0, 623.0])

In [None]:
# Search space of the experiment: seven discrete variables, each restricted
# to an evenly spaced grid (np.arange excludes the stop value).
domain = Domain()
for _spec in (
    ("Temperature", "reaction_temperature", np.arange(523, 783, 10)),        # 523 ... 773
    ("Pressure", "reactor_pressure", np.arange(1, 11, 1)),                   # 1 ... 10
    ("GHSV", "gas_hourly_space_velocity", np.arange(3300, 28050, 1650)),     # 3300 ... 26400
    ("Ni", "nickel_load", np.arange(0, 26, 1)),                              # 0 ... 25
    ("Co", "cobalt_load", np.arange(0, 11, 1)),                              # 0 ... 10
    ("Calcination", "calcination_temperature", np.arange(623, 973, 50)),     # 623 ... 923
    ("Reduction", "reduction_temperature", np.arange(623, 973, 50)),         # 623 ... 923
):
    # One add_variable call per variable, each with a single-element list,
    # in the same order as the columns of the feature matrices.
    domain.add_variable([Variable(*_spec, "discrete")])
domain.setup_space()

# Column names taken from the domain itself so they always match its order.
variables = [var.name for var in domain.variables]

# 4. Repeated k-fold Cross-Validation

In [None]:
def cross_validation_conversion(X, y, n_splits=8, n_repeats=4):
    """Score the surrogate model with repeated stratified k-fold CV.

    For every fold a fresh ``Experiment`` is created on the module-level
    ``domain``, given the training rows as previous observations, and asked
    to predict the held-out rows. Predictions are clipped to [0, 100],
    pooled over all folds and repeats, and scored once at the end.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample, with one column per entry of
        ``domain.variables`` (in that order).
    y : numpy.ndarray
        Target value for each row of ``X``.
    n_splits : int, optional
        Number of folds per repetition.
    n_repeats : int, optional
        Number of times the k-fold split is repeated.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` computed on the pooled held-out predictions.
        The same values are also printed.
    """
    feature_names = [v.name for v in domain.variables]

    # The target is continuous, so stratify on bins whose edges are the
    # 0th, 10th, ..., 90th percentiles of y.
    bin_edges = np.percentile(y, np.arange(0, 100, 10))
    strata = np.digitize(y, bin_edges)

    # Fixed seed keeps the fold assignment reproducible between runs.
    splitter = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=120897)

    pooled = {"INDEX": [], "PRED": [], "TRUE": []}
    for fit_idx, holdout_idx in splitter.split(X, strata):
        model = Experiment(domain, n_init=3, mode="EMOC", clustering=True, scaling=20000)

        # Training rows in the tabular form passed as ``previous``.
        observed = pd.DataFrame([])
        observed[feature_names] = X[fit_idx]
        observed["output"] = y[fit_idx]

        outcome = model.predict_outcome(x_new=X[holdout_idx], previous=observed)
        fold_pred = np.clip(outcome["mean"].to_numpy(), 0, 100)

        # Store the results
        pooled["INDEX"].extend(holdout_idx)
        pooled["PRED"].extend(fold_pred)
        pooled["TRUE"].extend(y[holdout_idx])

    # Compute final metrics on the pooled held-out predictions
    pooled_frame = pd.DataFrame(pooled)
    observed_values = pooled_frame["TRUE"].to_numpy()
    predicted_values = pooled_frame["PRED"].to_numpy()

    mae = mean_absolute_error(observed_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(observed_values, predicted_values))
    r2 = r2_score(observed_values, predicted_values)

    print(f"CV Performance: MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

    return mae, rmse, r2

In [None]:
def cross_validation_sty(X, y, ghsv, n_splits=8, n_repeats=4):
    """Cross-validate the surrogate model and score it after STY conversion.

    The model is trained and evaluated on ``y`` exactly as in the plain
    cross-validation: a fresh ``Experiment`` per fold on the module-level
    ``domain``, with predictions clipped to [0, 100]. Both the predictions
    and the held-out targets are then converted with

        value / 100 * ghsv * (12.011 + 4*1.0079) / (5*22400)

    and the metrics are computed on the converted, pooled values.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample, with one column per entry of
        ``domain.variables`` (in that order).
    y : numpy.ndarray
        Target for each row of ``X``. It is treated as a percentage: model
        output is clipped to [0, 100] and divided by 100 in the conversion.
    ghsv : numpy.ndarray
        GHSV value for each row of ``X``, used in the conversion.
    n_splits : int, optional
        Number of folds per repetition.
    n_repeats : int, optional
        Number of times the k-fold split is repeated.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` computed on the pooled, converted held-out
        values. The same values are also printed.
    """
    # Conversion constants, kept as separate factors so the arithmetic is
    # evaluated in the same order as the inline formula used elsewhere.
    molar_mass = 12.011 + 4 * 1.0079  # C + 4 H; presumably CH4 in g/mol — TODO confirm
    volume_term = 5 * 22400           # presumably feed factor x molar gas volume — TODO confirm

    feature_names = [v.name for v in domain.variables]

    # The target is continuous, so stratify on bins whose edges are the
    # 0th, 10th, ..., 90th percentiles of y.
    strata = np.digitize(y, np.percentile(y, np.arange(0, 100, 10)))

    # Fixed seed keeps the fold assignment reproducible between runs.
    splitter = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=120897)

    pooled = {"INDEX": [], "PRED": [], "TRUE": []}
    for fit_idx, holdout_idx in splitter.split(X, strata):
        fold_ghsv = ghsv[holdout_idx]
        fold_true = y[holdout_idx]

        model = Experiment(domain, n_init=3, mode="EMOC", clustering=True, scaling=20000)

        # Training rows in the tabular form passed as ``previous``.
        observed = pd.DataFrame([])
        observed[feature_names] = X[fit_idx]
        observed["output"] = y[fit_idx]

        outcome = model.predict_outcome(x_new=X[holdout_idx], previous=observed)
        # Bound the raw model output in percent space *before* converting.
        fold_pred = np.clip(outcome["mean"].to_numpy(), 0, 100)

        pooled["INDEX"].extend(holdout_idx)
        pooled["PRED"].extend((fold_pred / 100) * fold_ghsv * molar_mass / volume_term)
        pooled["TRUE"].extend((fold_true / 100) * fold_ghsv * molar_mass / volume_term)

    pooled_frame = pd.DataFrame(pooled)
    true_data = pooled_frame["TRUE"].to_numpy()
    # No second clip here: the pooled values are already in converted (STY)
    # units, so the percentage bounds 0-100 do not apply to them, and the
    # model output was already bounded above before the conversion.
    pred_data = pooled_frame["PRED"].to_numpy()

    mae = mean_absolute_error(true_data, pred_data)
    rmse = np.sqrt(mean_squared_error(true_data, pred_data))
    r2 = r2_score(true_data, pred_data)

    print(f"CV Performance: MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

    return mae, rmse, r2

In [None]:
# Cross-validate on the training sheet only, with "Conversion" as target.
# The returned (mae, rmse, r2) tuple is the cell's displayed value.
cross_validation_conversion(
    X=df_train[variables].to_numpy(),
    y=df_train["Conversion"].to_numpy(),
)

In [None]:
# Cross-validate on the training sheet only and score after STY conversion.
# NOTE(review): cross_validation_sty treats ``y`` as a percentage (it clips
# predictions to 0-100 and divides by 100 before multiplying by GHSV), while
# the STY prediction cells further down train on the "Yield" column —
# confirm that "STY" is the intended target column here.
cross_validation_sty(
    X=df_train[variables].to_numpy(),
    y=df_train["STY"].to_numpy(),
    ghsv=df_train["GHSV"].to_numpy(),
)

# 5. Predict Interpolation and Extrapolation Sets

## Conversion

In [None]:
# Experiment object used for both conversion predictions that follow,
# configured with the same settings as inside the cross-validation functions.
exp = Experiment(
    domain,
    n_init=3,
    mode="EMOC",
    clustering=True,
    scaling=20000,
)

In [None]:
# Interpolation
# Train on every pooled sample outside the hold-out list and predict the
# conversion of the held-out rows.

# Previous observations: the seven inputs plus the measured conversion.
sug = pd.DataFrame([])
sug[variables] = inter_train[variables].to_numpy()
sug["output"] = inter_train["Conversion"].to_numpy()

pred = exp.predict_outcome(
    x_new=inter_test[variables].to_numpy(),
    previous=sug,
)["mean"].to_numpy()
inter_preds = np.clip(pred, 0, 100)  # keep predictions inside 0-100

# Measured values of the held-out rows, converted once for all three metrics.
inter_true = inter_test["Conversion"].to_numpy()
print("MAE:", mean_absolute_error(inter_true, inter_preds))
print("RMSE:", np.sqrt(mean_squared_error(inter_true, inter_preds)))
print("R2:", r2_score(inter_true, inter_preds))

In [None]:
# Extrapolation
# Train on the "Training" sheet and predict the conversion of the rows in
# the "Test" sheet.

# Previous observations: the seven inputs plus the measured conversion.
sug = pd.DataFrame([])
sug[variables] = df_train[variables].to_numpy()
sug["output"] = df_train["Conversion"].to_numpy()

pred = exp.predict_outcome(
    x_new=df_test[variables].to_numpy(),
    previous=sug,
)["mean"].to_numpy()
extra_preds = np.clip(pred, 0, 100)  # keep predictions inside 0-100

# Measured values of the test rows, converted once for all three metrics.
extra_true = df_test["Conversion"].to_numpy()
print("MAE:", mean_absolute_error(extra_true, extra_preds))
print("RMSE:", np.sqrt(mean_squared_error(extra_true, extra_preds)))
print("R2:", r2_score(extra_true, extra_preds))

## STY

In [None]:
# Experiment object used for both STY predictions that follow, configured
# with the same settings as inside the cross-validation functions.
exp = Experiment(
    domain,
    n_init=3,
    mode="EMOC",
    clustering=True,
    scaling=20000,
)

In [None]:
# Interpolation
# Train on every pooled sample outside the hold-out list, predict the
# held-out rows, and compare against their "STY" column after conversion.

# Previous observations: the seven inputs plus "Yield" multiplied by 100.
sug = pd.DataFrame([])
sug[variables] = inter_train[variables].to_numpy()
sug["output"] = inter_train["Yield"].to_numpy() * 100

pred = exp.predict_outcome(
    x_new=inter_test[variables].to_numpy(),
    previous=sug,
)["mean"].to_numpy()

# Clip to the 0-10000 range of the scaled target, divide by 10000, then
# apply the GHSV-based conversion factor.
inter_preds = (
    np.clip(pred, 0, 10000) / 10000
    * inter_test["GHSV"].to_numpy()
    * (12.011 + 4*1.0079)
    / (5*22400)
)

# Reference values of the held-out rows, converted once for all three metrics.
inter_true = inter_test["STY"].to_numpy()
print("MAE:", mean_absolute_error(inter_true, inter_preds))
print("RMSE:", np.sqrt(mean_squared_error(inter_true, inter_preds)))
print("R2:", r2_score(inter_true, inter_preds))

In [None]:
# Extrapolation
# Train on the "Training" sheet, predict the rows of the "Test" sheet, and
# compare against their "STY" column after conversion.

# Previous observations: the seven inputs plus "Yield" multiplied by 100.
sug = pd.DataFrame([])
sug[variables] = df_train[variables].to_numpy()
sug["output"] = df_train["Yield"].to_numpy() * 100

pred = exp.predict_outcome(
    x_new=df_test[variables].to_numpy(),
    previous=sug,
)["mean"].to_numpy()

# Clip to the 0-10000 range of the scaled target, divide by 10000, then
# apply the GHSV-based conversion factor.
extra_preds = (
    np.clip(pred, 0, 10000) / 10000
    * df_test["GHSV"].to_numpy()
    * (12.011 + 4*1.0079)
    / (5*22400)
)

# Reference values of the test rows, converted once for all three metrics.
extra_true = df_test["STY"].to_numpy()
print("MAE:", mean_absolute_error(extra_true, extra_preds))
print("RMSE:", np.sqrt(mean_squared_error(extra_true, extra_preds)))
print("R2:", r2_score(extra_true, extra_preds))