In [None]:
import commonutils

# Set to True to enable the verbose diagnostics guarded by `if DEBUG:` in
# the cells below.
DEBUG = False

# Names of the datasets to load; each one is read from
# ../datasets/ML_data/<setname>.
setnames = ["W4-11", "MB16-43"]
# Forwarded to commonutils.read_dataset; a later cell indexes val["difs"]
# with range(howmanydifs).
howmanydifs = 3
# Per-method list of strings forwarded to commonutils.read_dataset
# (presumably the output-file lines to extract -- TODO confirm in commonutils).
methods = {
    "PBE": [
        "Nuclear Repulsion  :",
        "One Electron Energy:",
        "Two Electron Energy:",
        "Potential Energy   :",
        "Kinetic Energy     :",
        "E(X)               :",
        "E(C)               :",
        "Dispersion correction",
        "FINAL SINGLE POINT ENERGY",
    ],
    "HF": [
        "Nuclear Repulsion  :",
        "One Electron Energy:",
        "Two Electron Energy:",
        "Potential Energy   :",
        "Kinetic Energy     :",
        "Dispersion correction",
        "FINAL SINGLE POINT ENERGY",
    ],
}

# setname -> samples returned by read_dataset, plus a "Full" entry that
# concatenates every set.
allvalues_perset = {}

allvalues = []
for setname in setnames:
    print("Reading dataset: ", setname)
    rootdir = "../datasets/ML_data/" + setname
    labelsfilename = "../datasets/ML_data/"+setname+"/labels.txt"

    allvalues_perset[setname] = \
        commonutils.read_dataset(rootdir, labelsfilename, howmanydifs, methods)
    samples = allvalues_perset[setname]

    print("Number of samples: ", len(samples))
    for method in methods:
        # Report the number of descriptors, not the number of samples: the
        # later cells of this notebook read each sample's descriptors from
        # the mapping stored under "<method>_energydiff". The count is taken
        # from the first sample (assumes all samples carry the same
        # descriptors -- TODO confirm); an empty set reports 0.
        if len(samples) > 0:
            ndescriptors = len(samples[0][method + "_energydiff"])
        else:
            ndescriptors = 0
        # "%3s" right-aligns the method name so the output labels stay
        # "PBE" and " HF".
        print("Number of basic %3s descriptors: " % method, ndescriptors)

    allvalues += samples
    print("")

allvalues_perset["Full"] = allvalues

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def _print_score_row(rowname, setname, labels, y_pred):
    """Print one table row: row name, set name, R2 and RMSE of y_pred vs. labels.

    rowname  -- text for the first column (method index or "Our_<method>")
    setname  -- name of the dataset the values belong to
    labels   -- reference values
    y_pred   -- predicted values, same length as labels
    """
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
    # that keyword is deprecated and later removed in newer scikit-learn
    # releases, while the square root works with every version.
    rmse = mean_squared_error(labels, y_pred) ** 0.5
    print("%10s, %10s , "%(rowname, setname), end="")
    print("%9.3f , "%(r2_score(labels, y_pred)), end="")
    print("%9.3f "%(rmse), end="")
    print("")


print("")
# Rows for the pre-computed differences: the prediction is the label plus
# the stored difference for that method index.
for methodid in range(howmanydifs):
    for setname in setnames + ["Full"]:
        y_pred = []
        labels = []
        for val in allvalues_perset[setname]:
            y_pred.append(val["label"] + val["difs"][methodid])
            labels.append(val["label"])
        _print_score_row(str(methodid), setname, labels, y_pred)

# Rows for the values read by this notebook: the prediction is the
# "<method>_FINAL_SINGLE_POINT_ENERGY" entry of the method's descriptors.
for method in methods:
    for setname in setnames + ["Full"]:
        y_pred = []
        labels = []
        for val in allvalues_perset[setname]:
            y_pred.append(val[method + "_energydiff"][method+"_FINAL_SINGLE_POINT_ENERGY"])
            labels.append(val["label"])
        _print_score_row("Our_"+str(method), setname, labels, y_pred)

In [None]:
# Build the per-set descriptor tables, compute the top correlations and
# (in DEBUG mode) print them.

import pandas as pd

# All three dictionaries are keyed by set name (including "Full").
fulldescriptors = {}
labels = {}
top_correlation_perset = {}

for setname in setnames + ["Full"]:
    merged_rows = []
    set_labels = []
    for val in allvalues_perset[setname]:
        # One flat dict per sample, merging the descriptors of every method.
        row = {}
        for method in methods:
            row.update(val[method+"_energydiff"])
        merged_rows.append(row)
        set_labels.append(val["label"])
    fulldescriptors[setname] = merged_rows
    labels[setname] = set_labels

    moldescriptors_featues, Y, features_names = commonutils.build_XY_matrix(
        fulldescriptors[setname], labels[setname])

    df = pd.DataFrame(moldescriptors_featues, columns=features_names)

    # 0.95 is the value handed to the helper (presumably the correlation
    # threshold -- TODO confirm in commonutils).
    top_corr = commonutils.get_top_correlations_blog(df, 0.95)
    top_correlation_perset[setname] = top_corr

    if not DEBUG:
        continue

    print("Top correlations for set: ", setname)
    for tc in top_corr:
        print("%35s %35s %9.3f"%(tc[0], tc[1], tc[2]))
    print("")

In [None]:
# Quick 3-component PCA of every set, just to look at the data
# (only executed in DEBUG mode).

if DEBUG:
    from sklearn.decomposition import PCA
    import numpy as np

    for setname in setnames + ["Full"]:
        moldescriptors_featues, Y, features_names = commonutils.build_XY_matrix(
            fulldescriptors[setname], labels[setname])

        pca = PCA(n_components=3)
        fit = pca.fit(moldescriptors_featues)

        # Summarize the components: explained variance first, then the
        # loadings of every feature on the three components.
        print("PCA for set: ", setname)
        print("Explained Variance: %s" % fit.explained_variance_ratio_)
        loadings = pd.DataFrame(
            pca.components_.T,
            columns=['PC1', 'PC2', 'PC3'],
            index=features_names,
        )
        print(loadings)
        print("")

In [None]:
# PLS test for every set: choose the number of components, fit the final
# model, print its scores and rank the features by permutation importance.
from sklearn.inspection import permutation_importance
import numpy as np
import models

# setname -> feature names sorted from most to least important
mostimportantefeatures_persetname = {}

print("SetName , Comp. , RMSE Train, RMSE Test, RMSE Full, R2 Train, "
      "R2 Test, R2 Full, RMSE LOO, R2 LOO")
for setname in setnames + ["Full"]:
    moldescriptors_featues, Y, features_names = commonutils.build_XY_matrix(
        fulldescriptors[setname], labels[setname])

    # Search for the best number of components, scanning from 1 up to the
    # number of feature columns.
    maxcomp = moldescriptors_featues.shape[1]
    perc_split = 0.2
    ncomps, rmses_test, rmses_train, r2s_test, r2s_train = models.pls_model(
        perc_split, moldescriptors_featues, Y,
        ncomp_start=1, ncomp_max=maxcomp)
    # argmax/argmin are 0-based while the scan starts at one component,
    # hence the +1 (assumes the scores are ordered by component count --
    # TODO confirm in models.pls_model).
    r2max_comps = np.argmax(r2s_test)+1
    rmsemin_comps = np.argmin(rmses_test)+1
    # Keep the smaller of the two candidates.
    compstouse = min(rmsemin_comps, r2max_comps)

    # Build the final model with the chosen number of components.
    perc_split = 0.2
    (rmse_train, rmse_test, r2_train, r2_test, rmse_full, r2_full,
     plsmodel, X_train, X_test, y_train, y_test) = models.pls_model(
        perc_split, moldescriptors_featues, Y, False, compstouse)

    # Leave-one-out evaluation with the same number of components.
    perc_split = 0.0
    rmse, r2 = models.pls_model(
        perc_split, moldescriptors_featues, Y, False,
        compstouse, leaveoneout=True)

    print("%10s, %4d , %9.3f , %9.3f , %9.3f , %9.3f , %9.3f , %9.3f , %9.3f , %9.3f"%(
        setname, compstouse,
        rmse_train, rmse_test, rmse_full,
        r2_train, r2_test, r2_full,
        rmse, r2))

    # Rank the features by permutation importance on the test split.
    scoring = 'neg_mean_squared_error'
    r = permutation_importance(plsmodel, X_test, y_test, n_repeats=30,
                               random_state=0, scoring=scoring)
    mostimportantefeatures_persetname[setname] = [
        features_names[i] for i in r.importances_mean.argsort()[::-1]
    ]

    if DEBUG:
        # Repeat the importance analysis with several metrics and print the
        # features whose mean importance exceeds two standard deviations.
        scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
        r_multi = permutation_importance(plsmodel, X_test, y_test, n_repeats=30,
                                         random_state=0, scoring=scoring)

        for metric in r_multi:
            print(f"{metric} Used")
            r = r_multi[metric]
            for i in r.importances_mean.argsort()[::-1]:
                imp_mean = r.importances_mean[i]
                imp_std = r.importances_std[i]
                if imp_mean - 2 * imp_std > 0:
                    print(f"{features_names[i]:<30}"
                          f"{imp_mean:.3e}"
                          f" +/- {imp_std:.3e}")
            print("")

In [None]:
# For every set, list the features ranked by permutation importance
# followed by the highly correlated feature pairs found earlier.
for setname in setnames + ["Full"]:
    print("Most important features for set: ", setname)
    for tc in mostimportantefeatures_persetname[setname]:
        print("%35s"%(tc))
    print("")
    # Each entry is printed as two names and one number.
    for tc in top_correlation_perset[setname]:
        print("%35s %35s %9.3f"%(tc[0], tc[1], tc[2]))

# Disabled step: remove some features based on importance and correlation.
# It is kept as comments rather than a bare triple-quoted string, because a
# string literal as the last expression of a notebook cell is echoed as the
# cell output.
# NOTE(review): the feature-set key "hfenergydiff" below differs from the
# "<method>_energydiff" keys the other cells of this notebook read -- confirm
# the key before re-enabling.
#
# features_to_remove = ["HF_Nuclear_Repulsion", \
#                       "HF_Two_Electron_Energy", \
#                       "HF_One_Electron_Energy"]
# featureset = ["hfenergydiff"]
#
# commonutils.remove_features(allvalues, features_to_remove, featureset)

In [None]:
#Test using PLS on the concatenation of all sets (allvalues)
import models

# NOTE: this cell rebinds `fulldescriptors` and `labels` from the per-set
# dictionaries built earlier to plain lists, so the earlier per-set cells
# cannot be re-run after it without rebuilding those dictionaries first.
fulldescriptors = []
labels = []

for val in allvalues:
    # Merge the descriptors of every method into one flat dict per sample.
    # The keys follow the "<method>_energydiff" naming that the other cells
    # of this notebook read; iterating `methods` keeps the PBE-then-HF order.
    merged = {}
    for method in methods:
        merged.update(val[method + "_energydiff"])
    fulldescriptors.append(merged)

    labels.append(val["label"])

moldescriptors_featues, Y, features_names = \
    commonutils.build_XY_matrix (fulldescriptors, labels)

maxcomp = moldescriptors_featues.shape[1]
# Search for the best number of components and build the final model.
perc_split = 0.2
ncomps, rmses_test, rmses_train, r2s_test, r2s_train = \
    models.pls_model (perc_split, moldescriptors_featues, Y, \
                      ncomp_start = 1, ncomp_max = maxcomp)
# argmax/argmin are 0-based while the scan starts at one component, hence
# the +1 (assumes the scores are ordered by component count -- TODO confirm
# in models.pls_model).
r2max_comps = np.argmax(r2s_test)+1
rmsemin_comps = np.argmin(rmses_test)+1
# Keep the smaller of the two candidates.
compstouse = min(rmsemin_comps, r2max_comps)

perc_split = 0.2
rmse_train, rmse_test, r2_train, r2_test, rmse_full, r2_full , \
        plsmodel, X_train, X_test, y_train, y_test  = \
        models.pls_model (perc_split, moldescriptors_featues, Y, False, compstouse)
# Leave-one-out evaluation with the same number of components.
perc_split = 0.0
rmse, r2 = models.pls_model (perc_split, moldescriptors_featues, Y, False, \
                  compstouse, leaveoneout=True)

print("PLS model with %d components"%(compstouse))
print("Train RMSE: %9.3f"%(rmse_train))
print("Test  RMSE: %9.3f"%(rmse_test))
print("Full  RMSE: %9.3f"%(rmse_full))
print("Train R2  : %9.3f"%(r2_train))
print("Test  R2  : %9.3f"%(r2_test))
print("Full  R2  : %9.3f"%(r2_full))
print("LOO   RMSE: %9.3f"%(rmse))
print("LOO   R2  : %9.3f"%(r2))