In [None]:
import commonutils

def _load_and_report(dataset_dir, labels_path, n_difs):
    """Read one dataset through ``commonutils.read_dataset`` and print its sizes.

    Returns the three objects produced by ``commonutils.read_dataset`` in the
    same order (samples, basic PBE descriptors, basic HF descriptors).
    """
    samples, pbe_descriptors, hf_descriptors = commonutils.read_dataset(
        dataset_dir, labels_path, n_difs)

    print("Number of samples: ", len(samples))
    print("Number of basic PBE descriptors: ", len(pbe_descriptors))
    print("Number of basic  HF descriptors: ", len(hf_descriptors))

    return samples, pbe_descriptors, hf_descriptors


# First dataset: W4-11
rootdir = "../datasets/ML_data/W4-11"
labelsfilename = "../datasets/ML_data/W4-11/labels.txt"
howmanydifs = 3

(allvalues1,
 basic_pbedescriptors1,
 basic_hfdescriptors1) = _load_and_report(rootdir, labelsfilename, howmanydifs)

# Second dataset: MB16-43
rootdir = "../datasets/ML_data/MB16-43"
labelsfilename = "../datasets/ML_data/MB16-43/labels.txt"
howmanydifs = 3

(allvalues2,
 basic_pbedescriptors2,
 basic_hfdescriptors2) = _load_and_report(rootdir, labelsfilename, howmanydifs)

# Every later cell works on the concatenation of both datasets.
allvalues = allvalues1 + allvalues2

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def _rmse(y_true, y_hat):
    """Root-mean-squared error between two equally long sequences of scalars.

    Computed as the square root of ``mean_squared_error`` instead of passing
    ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6, so relying on it breaks the cell on current releases.
    For single-output targets both forms give the same value.
    """
    return mean_squared_error(y_true, y_hat) ** 0.5


# The reference value of every sample; shared by all comparisons below.
labels = [val["label"] for val in allvalues]

# Score each of the `howmanydifs` reference methods. The prediction of a
# method is rebuilt as label + difs[methodid].
for methodid in range(howmanydifs):
    y_pred = [val["label"] + val["difs"][methodid] for val in allvalues]

    print("Method", methodid+1, \
          "R2 score  : %9.3f"%(r2_score(labels, y_pred)))
    print("Method", methodid+1, \
          "RMSE score: %9.3f"%(_rmse(labels, y_pred)))

print("Our data")

# PBE: compare the PBE final single point energy entry against the label.
y_pred = [val["pbeenergydiff"]["PBE_FINAL_SINGLE_POINT_ENERGY"]
          for val in allvalues]

print("PBE R2 score  : %9.3f"%(r2_score(labels, y_pred)))
print("PBE RMSE score: %9.3f"%(_rmse(labels, y_pred)))

# HF: same comparison using the HF final single point energy entry.
y_pred = [val["hfenergydiff"]["HF_FINAL_SINGLE_POINT_ENERGY"]
          for val in allvalues]

print(" HF R2 score  : %9.3f"%(r2_score(labels, y_pred)))
print(" HF RMSE score: %9.3f"%(_rmse(labels, y_pred)))


In [None]:
# Build the feature correlations and print the top ones

import pandas as pd

# One merged descriptor dictionary per sample: PBE entries first, then HF
# entries (HF wins if a key appears in both).
fulldescriptors = []
labels = []
for val in allvalues:
    merged = {}
    for source_key in ("pbeenergydiff", "hfenergydiff"):
        merged.update(val[source_key])
    fulldescriptors.append(merged)
    labels.append(val["label"])

moldescriptors_featues, Y, features_names = commonutils.build_XY_matrix(
    fulldescriptors, labels)

df = pd.DataFrame(moldescriptors_featues, columns=features_names)

# 0.85 is presumably the correlation threshold -- confirm against
# commonutils.get_top_correlations_blog.
top_corr = commonutils.get_top_correlations_blog(df, 0.85)
for tc in top_corr:
    first_name, second_name, corr_value = tc[0], tc[1], tc[2]
    print("%35s %35s %9.3f" % (first_name, second_name, corr_value))

In [None]:
# Quick PCA to get a first look at the data
from sklearn.decomposition import PCA
import numpy as np

# Rebuild the merged PBE + HF descriptor dictionaries and the label list so
# this cell does not depend on the correlation cell having been run.
fulldescriptors = []
labels = []
for val in allvalues:
    merged = dict(val["pbeenergydiff"])
    merged.update(val["hfenergydiff"])
    fulldescriptors.append(merged)
    labels.append(val["label"])

moldescriptors_featues, Y, features_names = commonutils.build_XY_matrix(
    fulldescriptors, labels)

# Three-component PCA on the feature matrix.
pca = PCA(n_components=3)
fit = pca.fit(moldescriptors_featues)

# Fraction of the variance captured by each component.
print("Explained Variance: %s" % fit.explained_variance_ratio_)

# Loadings: one row per feature, one column per principal component.
print("Loadings")
pc_columns = ['PC1', 'PC2', 'PC3']
loadings = pd.DataFrame(
    pca.components_.T,
    columns=pc_columns,
    index=features_names,
)
print(loadings)

In [None]:
#Test using PLS 
import models

# Rebuild the merged PBE + HF descriptor dictionaries and the label list so
# this cell only depends on `allvalues`.
fulldescriptors = []
labels = []
for val in allvalues:
    merged = dict(val["pbeenergydiff"])
    merged.update(val["hfenergydiff"])
    fulldescriptors.append(merged)
    labels.append(val["label"])

moldescriptors_featues, Y, features_names = \
    commonutils.build_XY_matrix (fulldescriptors, labels)

# Upper bound for the scan: one component per feature column.
maxcomp = moldescriptors_featues.shape[1]

# Search for the best number of components, then build the final model.
# `perc_split` is passed to both calls so the split is configured in a single
# place (previously the variable was set but a literal 0.2 was passed).
perc_split = 0.2
ncomps, rmses_test, rmses_train, r2s_test, r2s_train = \
    models.pls_model (perc_split, moldescriptors_featues, Y, \
                      ncomp_start = 1, ncomp_max = maxcomp)

# +1 turns the 0-based position into a component count; this matches the scan
# starting at ncomp_start = 1 (assumes the returned scores are ordered by
# increasing component count -- confirm in models.pls_model).
r2max_comps = np.argmax(r2s_test)+1
rmsemin_comps = np.argmin(rmses_test)+1
# When the two criteria disagree, use the smaller number of components.
compstouse = min(rmsemin_comps, r2max_comps)

# Final model with the selected number of components and the same split.
rmse_train, rmse_test, r2_train, r2_test, rmse_full, r2_full , \
        plsmodel, X_train, X_test, y_train, y_test  = \
        models.pls_model (perc_split, moldescriptors_featues, Y, False, compstouse)

# Leave-one-out evaluation; called with a split of 0.0.
perc_split = 0.0
rmse, r2 = models.pls_model (perc_split, moldescriptors_featues, Y, False, \
                  compstouse, leaveoneout=True)

print("PLS model with %d components"%(compstouse))
print("Train RMSE: %9.3f"%(rmse_train))
print("Test  RMSE: %9.3f"%(rmse_test))
print("Full  RMSE: %9.3f"%(rmse_full))
print("Train R2  : %9.3f"%(r2_train))
print("Test  R2  : %9.3f"%(r2_test))
print("Full  R2  : %9.3f"%(r2_full))
print("LOO   RMSE: %9.3f"%(rmse))
print("LOO   R2  : %9.3f"%(r2))
    

In [None]:
# Test using permutation feature importance
from sklearn.inspection import permutation_importance

# Metrics evaluated in a single permutation_importance call.
scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error']

r_multi = permutation_importance(
    plsmodel, X_test, y_test,
    n_repeats=30, random_state=0, scoring=scoring,
)

# One report per metric, features listed from most to least important.
for metric, result in r_multi.items():
    print(f"{metric} Used")
    means = result.importances_mean
    stds = result.importances_std
    for i in means.argsort()[::-1]:
        # Skip features whose mean importance is within two standard
        # deviations of zero.
        if means[i] - 2 * stds[i] <= 0:
            continue
        print(f"{features_names[i]:<30}{means[i]:.3e} +/- {stds[i]:.3e}")
    print("")