In [1]:
import math
import pickle
import optuna

import numpy as np

from collections import defaultdict

#scikit-learn
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from optuna.samplers import RandomSampler


In [2]:
# Load the pickled descriptor data. Iterating the loaded object yields catalyst
# names; each name maps to a sequence whose element 0 is the target value and
# element 1 is the feature vector.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open("racs_and_desc/data_mc3_lc0.p", "rb") as data_file:
    # The context manager closes the file handle even if unpickling fails.
    oer_desc_data = pickle.load(data_file)

oer_desc_vals = []
racs = []
catalyst_names = []

for name in oer_desc_data:
    entry = oer_desc_data[name]  # look the record up once per catalyst
    oer_desc_vals.append(entry[0])
    racs.append(entry[1])
    catalyst_names.append(name)
# Convert to an array so the rows can be selected with index arrays later on.
racs = np.asarray(racs)

In [3]:
from sklearn.model_selection import KFold
# Step 1. Define the objective function to be minimized (it returns an RMSE, and the study uses direction="minimize").
def objective(trial):
    """Return the leave-one-out RMSE of the regressor configured by ``trial``.

    The trial first picks one of four regressor families ("SVR",
    "RandomForest", "KRR_RBF", "KRR_Linear") and then the hyperparameters of
    that family. The resulting estimator is evaluated with leave-one-out
    cross-validation on the module-level ``racs`` (features) and
    ``oer_desc_vals`` (targets); features are standardized with a scaler fitted
    on the training part of each fold only.

    Args:
        trial: Object exposing ``suggest_categorical``, ``suggest_float`` and
            ``suggest_int`` (an optuna trial).

    Returns:
        float: Square root of the mean squared error between the held-out
        target values and their predictions, pooled over all folds.

    Raises:
        ValueError: If the suggested regressor name is not one of the four
            supported families.
    """
    # Define set of models we will study.
    # NOTE(review): keep the order of the suggest_* calls unchanged; with a
    # seeded sampler, reordering them presumably changes the sampled values.
    regressor_name = trial.suggest_categorical(
        "regressor", ["SVR", "RandomForest", "KRR_RBF", "KRR_Linear"]
    )

    # Step 2. Setup values for the hyperparameters:
    if regressor_name == "SVR":
        epsilon = trial.suggest_float("epsilon", 1e-3, 1, log=True)
        regularization = trial.suggest_float("reg_svr", 1e-3, 10, log=True)
        regressor_obj = SVR(epsilon=epsilon, C=regularization)
    elif regressor_name == "RandomForest":
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        regressor_obj = RandomForestRegressor(
            max_depth=rf_max_depth, n_estimators=rf_n_estimators, random_state=42
        )
    elif regressor_name == "KRR_RBF":
        krr_alpha = trial.suggest_float("krr_alpha", 0.01, 1000)
        regressor_obj = KernelRidge(kernel="rbf", alpha=krr_alpha)
    elif regressor_name == "KRR_Linear":
        krr_l_alpha = trial.suggest_float("krr_l_alpha", 0.01, 1000)
        regressor_obj = KernelRidge(kernel="linear", alpha=krr_l_alpha)
    else:
        # Fail loudly instead of hitting an unbound ``regressor_obj`` below.
        raise ValueError(f"Unknown regressor: {regressor_name!r}")

    # Step 3: Scoring method:
    # Convert the targets once; they do not change between folds.
    targets = np.asarray(oer_desc_vals)
    preds = []
    test_vals = []
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(racs, oer_desc_vals):
        X_train, X_test = racs[train_index], racs[test_index]
        y_train, y_test = targets[train_index], targets[test_index]

        # Fit the scaler on the training fold only so the held-out sample does
        # not leak into the standardization statistics.
        sc = StandardScaler()
        scaled_X_train = sc.fit_transform(X_train)
        scaled_X_test = sc.transform(X_test)

        # The same estimator instance is refit on every fold.
        # NOTE(review): relies on scikit-learn's fit() starting from scratch on
        # each call, as its estimator conventions document.
        model = regressor_obj
        model.fit(scaled_X_train, y_train)

        # Each leave-one-out fold holds out exactly one sample.
        pred = model.predict(scaled_X_test)
        preds.append(pred[0])
        test_vals.append(y_test[0])

    accuracy_rmse = math.sqrt(mean_squared_error(np.asarray(test_vals), np.asarray(preds)))
    return accuracy_rmse

# Step 4: Run the search. The RandomSampler is given a fixed seed so that the run is deterministic and
# reproduces the values reported in the communication; other seeds were found to give the same errors
# (although with different hyperparameters).
study = optuna.create_study(sampler=RandomSampler(seed=42), direction="minimize")
study.optimize(objective, n_trials=100)

[32m[I 2021-09-22 15:36:31,711][0m A new study created in memory with name: no-name-d74f2715-009e-46f5-9c0a-92bd178676e9[0m
[32m[I 2021-09-22 15:37:15,524][0m Trial 0 finished with value: 0.10061614066038176 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 164, 'rf_max_depth': 2}. Best is trial 0 with value: 0.10061614066038176.[0m
[32m[I 2021-09-22 15:37:30,945][0m Trial 1 finished with value: 0.08407157432575921 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 30, 'rf_max_depth': 30}. Best is trial 1 with value: 0.08407157432575921.[0m
[32m[I 2021-09-22 15:37:32,565][0m Trial 2 finished with value: 0.10894694794638676 and parameters: {'regressor': 'SVR', 'epsilon': 0.008179499475211672, 'reg_svr': 0.12561043700013558}. Best is trial 1 with value: 0.08407157432575921.[0m
[32m[I 2021-09-22 15:37:34,306][0m Trial 3 finished with value: 1.273562937072672 and parameters: {'regressor': 'KRR_RBF', 'krr_alpha': 292.1517270887328}. Best is trial 1

[32m[I 2021-09-22 15:53:50,017][0m Trial 36 finished with value: 0.28547498405378946 and parameters: {'regressor': 'SVR', 'epsilon': 0.49620663324183856, 'reg_svr': 0.2665116749055355}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 15:53:51,050][0m Trial 37 finished with value: 1.1177404498494945 and parameters: {'regressor': 'KRR_RBF', 'krr_alpha': 160.81644333698446}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 15:57:22,035][0m Trial 38 finished with value: 0.09141002383734594 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 715, 'rf_max_depth': 3}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 15:57:22,704][0m Trial 39 finished with value: 1.5326124173361586 and parameters: {'regressor': 'KRR_Linear', 'krr_l_alpha': 657.6163161714204}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 15:57:25,354][0m Trial 40 finished with value: 0.08185311057932566 and parame

[32m[I 2021-09-22 16:25:38,915][0m Trial 73 finished with value: 1.5317147287908155 and parameters: {'regressor': 'KRR_Linear', 'krr_l_alpha': 389.20778671737577}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 16:32:35,605][0m Trial 74 finished with value: 0.08256144576524091 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 951, 'rf_max_depth': 28}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 16:35:07,584][0m Trial 75 finished with value: 0.08250369710945027 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 335, 'rf_max_depth': 12}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 16:37:10,057][0m Trial 76 finished with value: 0.10067101620216297 and parameters: {'regressor': 'RandomForest', 'rf_n_estimators': 499, 'rf_max_depth': 2}. Best is trial 9 with value: 0.07881403855665949.[0m
[32m[I 2021-09-22 16:37:10,918][0m Trial 77 finished with value: 1.0189917665012787 and