In [1]:
%load_ext autoreload
%autoreload 2
from condition_modeling import train
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import json
import pickle as pkl
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import gaussian_kde
import matplotlib.cm as cm
import time

In [None]:
# Sweep every (objective, model, featurization) combination: call `train`,
# score each (y_true, y_pred) pair it returns for the test split, and
# checkpoint the aggregate metrics and raw outputs to disk after every run.

# Objectives currently left out of the sweep (add them to the list to include):
#   "anneal_temp", "dry_temp", "anneal_time", "dry_time"
objectives = [
    "calcine_temp",
    "sinter_temp",
    "calcine_time",
    "sinter_time"
]

models = ["xgb", "lr", "rf", "nn"]
featurizations = ["mp_fraction", "pca"]

start = time.time()

results = []
for objective in objectives:
    for model in models:
        for featurization in featurizations:
            best_params, best_estimators, y_pred_train, y_pred_test, X_train_k, X_test_k, y_train_k, y_test_k, n_pts_train_k, n_pts_test_k = train(model=model, objective=objective, featurization=featurization)

            # One metric value per element of y_test_k / y_pred_test
            # (presumably one per cross-validation split -- confirm in `train`).
            maes, rmses, r2s, mres = [], [], [], []
            for y_true, y_pred in zip(y_test_k, y_pred_test):
                maes.append(mean_absolute_error(y_true, y_pred))
                # Take the square root explicitly: the `squared=False` keyword
                # was deprecated in scikit-learn 1.4 and removed in 1.6.
                rmses.append(np.sqrt(mean_squared_error(y_true, y_pred)))
                # Mean relative error in percent, relative to the true value.
                mres.append(np.mean((np.abs(y_pred - y_true) / y_true) * 100))
                r2s.append(r2_score(y_true, y_pred))

            # Values are stored as strings, matching the existing log format.
            result = {
                "objective": objective,
                "model": model,
                "featurization": featurization,
                "MAE": str(np.mean(maes)),
                "MAE_std": str(np.std(maes)),
                "RMSE": str(np.mean(rmses)),
                "RMSE_std": str(np.std(rmses)),
                "MRE": str(np.mean(mres)),
                "MRE_std": str(np.std(mres)),
                "R2": str(np.mean(r2s)),
                "R2_std": str(np.std(r2s))
            }
            results.append(result)

            # Rewrite the full log each iteration so a crash mid-sweep still
            # leaves every completed run on disk.
            with open('data/rxn_condition_log_full.json', 'w') as f:
                json.dump(results, f, indent=4)
            # Append elapsed wall-clock minutes since the sweep started.
            with open('data/time_log.txt', 'a') as f:
                f.write(str((time.time() - start)/60) + "\n")

            # Persist everything `train` returned for this combination.
            all_results = [best_params, best_estimators, y_pred_train, y_pred_test, X_train_k, X_test_k, y_train_k, y_test_k, n_pts_train_k, n_pts_test_k]
            with open(f'data/{model}_{featurization}_{objective}_data.pkl', 'wb') as f:
                pkl.dump(all_results, f)

Returning extracted data of 51574/31782 reactions.


HBox(children=(HTML(value='StrToComposition'), FloatProgress(value=0.0, max=12228.0), HTML(value='')))




HBox(children=(HTML(value='MultipleFeaturizer'), FloatProgress(value=0.0, max=12228.0), HTML(value='')))


Shape of X: (12228, 102)
Shape of y: (12228,)
100%|██████████| 100/100 [00:49<00:00,  2.01trial/s, best loss: 114.98240395033112]
100%|██████████| 100/100 [00:53<00:00,  1.87trial/s, best loss: 114.91136131047223]
100%|██████████| 100/100 [00:56<00:00,  1.78trial/s, best loss: 116.51989624088218]
100%|██████████| 100/100 [00:50<00:00,  1.99trial/s, best loss: 113.94643335644182]
100%|██████████| 100/100 [00:56<00:00,  1.78trial/s, best loss: 115.23764069520432]
100%|██████████| 100/100 [00:50<00:00,  1.96trial/s, best loss: 116.56743221315699]
100%|██████████| 100/100 [00:55<00:00,  1.81trial/s, best loss: 113.9735924179674]
100%|██████████| 100/100 [00:50<00:00,  1.97trial/s, best loss: 114.88502220889825]
100%|██████████| 100/100 [00:53<00:00,  1.89trial/s, best loss: 114.92161213520589]
100%|██████████| 100/100 [00:52<00:00,  1.90trial/s, best loss: 112.67867093937193]
Returning extracted data of 51574/31782 reactions.


HBox(children=(HTML(value='StrToComposition'), FloatProgress(value=0.0, max=12228.0), HTML(value='')))




HBox(children=(HTML(value='MultipleFeaturizer'), FloatProgress(value=0.0, max=12228.0), HTML(value='')))


Shape of X: (12228, 77)
Shape of y: (12228,)
100%|██████████| 100/100 [02:03<00:00,  1.23s/trial, best loss: 117.02490308278394]
 20%|██        | 20/100 [00:18<01:25,  1.07s/trial, best loss: 117.7716989271907]

In [31]:
# Recompute test metrics for whatever y_test_k / y_pred_test are currently in
# the kernel (i.e. the outputs of the most recent `train` call).
maes, rmses, r2s, mres = [], [], [], []
for y_true, y_pred in zip(y_test_k, y_pred_test):
    maes.append(mean_absolute_error(y_true, y_pred))
    # Take the square root explicitly: the `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmses.append(np.sqrt(mean_squared_error(y_true, y_pred)))
    # Mean relative error in percent, relative to the true value.
    mres.append(np.mean((np.abs(y_pred - y_true) / y_true) * 100))
    r2s.append(r2_score(y_true, y_pred))

# Each line prints "mean std" over the collected values, in the order:
# MAE, RMSE, R2, MRE (%).
print(np.mean(maes), np.std(maes))
print(np.mean(rmses), np.std(rmses))
print(np.mean(r2s), np.std(r2s))
print(np.mean(mres), np.std(mres))

85.23843617451693 4.981680215544352
169.20423282351908 8.377821322858164
0.5600349498779509 0.0580263994536382
58.03733803912053 3.247330779799726


In [1]:
# Parity plot (true vs. predicted) for one entry of y_test_k / y_pred_test,
# with points colored by their Gaussian-KDE density.
# Requires the import cell and a `train` run to have been executed first.
fig, ax = plt.subplots()

# Which entry of y_test_k / y_pred_test to plot.
# Alternative: np.argmin(rmses) picks the entry with the lowest RMSE.
split_idx = 8
# np.asarray keeps the `[idx]` reordering below positional even if the
# entries are not plain ndarrays (their exact type is not visible here).
to_plot_x = np.asarray(y_test_k[split_idx])
to_plot_y = np.asarray(y_pred_test[split_idx])

# Calculate the point density
xy = np.vstack([to_plot_x, to_plot_y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
to_plot_x, to_plot_y, z = to_plot_x[idx], to_plot_y[idx], z[idx]

scatter = ax.scatter(to_plot_x, to_plot_y, c=z, cmap=cm.jet, s=25)
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label("KDE Density", labelpad=15, fontsize=14)

# Fixed square axis range with a dashed y = x reference line.
ax.set_xlim(0, 2000)
ax.set_ylim(0, 2000)
ax.plot([0, 2000], [0, 2000], color='black', linestyle='dashed')
ax.set_title("Sintering", fontsize=16)
# Raw strings: "\c" is an invalid escape sequence in a normal string literal.
ax.set_xlabel(r"True Temperature ($^{\circ}$C)", fontsize=14)
ax.set_ylabel(r"Predicted Temperature ($^{\circ}$C)", fontsize=14)
fig.tight_layout()
plt.show()
# fig.savefig("figures/xgb_sinter_parity.png", dpi=150)

NameError: name 'plt' is not defined