In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import numpy as np 
import re

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Define basic data structures and config

In [None]:
# Passed to commonutils.readandcheckdata and later used as the number of
# methods iterated over in diffs_toothermethods.
howmanydifs = 3
# Root directories handed to commonutils.readandcheckdata (relative paths).
# NOTE(review): presumably "qdata" holds the quantum-chemistry outputs and
# "data" the reference tables - confirm against commonutils.
rootdirqdata = '../datasets/qdata/'
rootdirdata = '../datasets/data/'

In [None]:
import models
import commonutils

# Load everything the notebook works on with a single helper call; the helper
# returns ten objects that are unpacked positionally.
(
    molnames,
    labels,
    diffs_toothermethods,
    chemical_reacts,
    stechio_ceofs,
    moldescriptors,
    chemicals_descriptors,
    pbe_hf_nonenergy_descriptors,
    pbe_diff_energy_descriptors,
    hf_diff_energy_descriptors,
) = commonutils.readandcheckdata(rootdirqdata, rootdirdata, howmanydifs)

# Size report: a quick sanity check that the per-molecule collections line up.
print("")
print("Number of descriptors:", len(moldescriptors[0]))
print("")
for caption, collection in (
    ("Number of labels         :", labels),
    ("Number of names          :", molnames),
    ("Number of differences    :", diffs_toothermethods),
    ("Number of chemicals      :", chemical_reacts),
    ("Number of stechio ceofs  :", stechio_ceofs),
    ("Number of moldescriptors :", moldescriptors),
):
    print(caption, len(collection))
print("")
print("Number of chemicals descriptors:", len(chemicals_descriptors))
print("")
for caption, collection in (
    ("Number of pbe_hf_nonenergy_descriptors:", pbe_hf_nonenergy_descriptors),
    ("Number of pbe_diff_energy_descriptors :", pbe_diff_energy_descriptors),
    ("Number of hf_diff_energy_descriptors  :", hf_diff_energy_descriptors),
):
    print(caption, len(collection))

Check the error with respect to the QM methods

In [None]:
# Baseline: score every reference method against the labels.
# The value attributed to method k for molecule i is labels[i] plus the stored
# difference diffs_toothermethods[i][k].
for methodid in range(howmanydifs):
    y_pred = [labels[i] + diffs_toothermethods[i][methodid]
              for i in range(len(molnames))]

    # plot a scatterplot of the true vs predicted values
    #plt.figure(figsize=(10,10))
    #plt.scatter(labels, y_pred, c='crimson')
    #plt.xlabel('True')
    #plt.ylabel('Predicted Values')

    print("Method", methodid+1, "R2 score  :", r2_score(labels, y_pred))
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so this form works on both old and new releases.
    print("Method", methodid+1, "RMSE score:",
          float(np.sqrt(mean_squared_error(labels, y_pred))))

# Baseline from our own PBE single-point energies: for every molecule, sum the
# stoichiometry-weighted energies of its listed chemicals, subtract the
# molecule's own energy and convert the difference with `autokcalmol`.
y_pred = []
autokcalmol = 627.5096080305927  # Hartree -> kcal/mol conversion factor
for mi, molname in enumerate(molnames):
    oury = moldescriptors[mi]["PBE_FINAL_SINGLE_POINT_ENERGY"]
    tosub = 0.0
    # Coefficients for the chemicals start at index 1 of stechio_ceofs[mi];
    # index 0 is not used here.
    # NOTE(review): presumably index 0 is the molecule's own coefficient - confirm.
    for si, chem in enumerate(chemical_reacts[mi], start=1):
        stecchio = stechio_ceofs[mi][si]
        tosub += stecchio*chemicals_descriptors[chem]["PBE_FINAL_SINGLE_POINT_ENERGY"]

    y_pred.append(autokcalmol*(tosub-oury))
    #print(molname, oury, tosub, 627.51* (tosub-oury), labels[mi])

# plot a scatterplot of the true vs predicted values
#plt.figure(figsize=(10,10))
#plt.scatter(labels, y_pred, c='crimson')
#plt.xlabel('True')
#plt.ylabel('Predicted Values')

print("")
print("Our PBE R2 score  :", r2_score(labels, y_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Our PBE RMSE score:", float(np.sqrt(mean_squared_error(labels, y_pred))))
     

Some correlation

In [None]:
# One merged descriptor dict per molecule. Families are applied in this order,
# so a later family overwrites an earlier one when keys clash.
fulldescriptors = []
for idx in range(len(molnames)):
    merged = {}
    for family in (pbe_hf_nonenergy_descriptors,
                   hf_diff_energy_descriptors,
                   pbe_diff_energy_descriptors):
        merged.update(family[idx])
    fulldescriptors.append(merged)

# Turn the per-molecule dicts into a feature matrix and label vector.
moldescriptors_featues, Y, features_names = commonutils.build_features_matrix_and_labels(
    molnames, fulldescriptors, labels
)

df = pd.DataFrame(moldescriptors_featues, columns=features_names)

# Print whatever the helper reports for the 0.8 threshold, one entry per line.
top_corr = commonutils.get_top_correlations_blog(df, 0.8)
for entry in top_corr:
    print(entry)


PLS models 

In [None]:

def run_all_pls(moldescriptors_featues, Y, perc_split=0.2):
    """Select a PLS component count, fit the final model and score it.

    The function makes three calls to ``models.pls_model``:

    1. a scan over 1..n_features components, from which the number of
       components is chosen as the smaller of "best test R2" and
       "lowest test RMSE";
    2. a fit with that number of components on a train/test split;
    3. a leave-one-out evaluation with the same number of components.

    Parameters
    ----------
    moldescriptors_featues : array-like with a ``.shape`` attribute
        Feature matrix; ``shape[1]`` is used as the maximum component count.
    Y : array-like
        Target values, forwarded unchanged to ``models.pls_model``.
    perc_split : float, default 0.2
        Test fraction forwarded to the scan and to the final fit. The
        leave-one-out call always receives 0.0.

    Returns
    -------
    tuple
        ``(compstouse, rmse, r2, rmse_full, r2_full, rmse_test, r2_test,
        rmse_train, r2_train, plsmodel, X_train, X_test, y_train, y_test)``
        where ``rmse``/``r2`` come from the leave-one-out call and the rest
        from the final fit.
    """
    maxcomp = moldescriptors_featues.shape[1]

    # Search for the best number of components.
    _ncomps, rmses_test, _rmses_train, r2s_test, _r2s_train = \
        models.pls_model(perc_split, moldescriptors_featues, Y,
                         ncomp_start=1, ncomp_max=maxcomp)
    # +1 converts a list position into a component count; this assumes the
    # scores are ordered by increasing component count starting at
    # ncomp_start=1.
    r2max_comps = np.argmax(r2s_test) + 1
    rmsemin_comps = np.argmin(rmses_test) + 1
    # When the two criteria disagree, prefer the smaller model.
    compstouse = min(rmsemin_comps, r2max_comps)

    # Final model on the train/test split (fourth positional argument False:
    # presumably disables the component search - see models.pls_model).
    (rmse_train, rmse_test, r2_train, r2_test, rmse_full, r2_full,
     plsmodel, X_train, X_test, y_train, y_test) = \
        models.pls_model(perc_split, moldescriptors_featues, Y, False, compstouse)

    # Leave-one-out evaluation; no hold-out split is requested here.
    rmse, r2 = models.pls_model(0.0, moldescriptors_featues, Y, False,
                                compstouse, leaveoneout=True)

    return compstouse, rmse, r2, rmse_full, r2_full, rmse_test, r2_test, rmse_train, r2_train, \
        plsmodel, X_train, X_test, y_train, y_test

# All descriptor families merged per molecule (list of dicts, later families
# overwrite earlier ones on key clashes).
fulldescriptors = []
for idx in range(len(molnames)):
    merged = {}
    for family in (pbe_hf_nonenergy_descriptors,
                   hf_diff_energy_descriptors,
                   pbe_diff_energy_descriptors):
        merged.update(family[idx])
    fulldescriptors.append(merged)

# Energy-difference descriptors only; stored as a dict keyed by molecule index.
fullenergydescriptors = {}
for idx in range(len(molnames)):
    energy_only = {}
    energy_only.update(hf_diff_energy_descriptors[idx])
    energy_only.update(pbe_diff_energy_descriptors[idx])
    fullenergydescriptors[idx] = energy_only

# One comma-separated result row per descriptor set, preceded by a header row.
print("Num. of Comp , LoO RMSE , LoO R2 , RMSE , R2 , RMSE Test , R2 Test , RMSE Train , R2 Train")
descriptor_sets = [pbe_diff_energy_descriptors, hf_diff_energy_descriptors,
                   fulldescriptors, fullenergydescriptors]
for desctouse in descriptor_sets:
    moldescriptors_featues, Y, features_names = commonutils.build_features_matrix_and_labels(
        molnames, desctouse, labels
    )
    (compstouse, rmse, r2, rmse_full, r2_full, rmse_test, r2_test,
     rmse_train, r2_train, plsmodel, X_train, X_test, y_train, y_test) = run_all_pls(
        moldescriptors_featues, Y
    )
    # sep reproduces the original "value  ,  value" spacing exactly.
    print(compstouse, rmse, r2, rmse_full, r2_full,
          rmse_test, r2_test, rmse_train, r2_train, sep="  ,  ")


PCA to see how many components are needed

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on each descriptor set and report the fraction of
# variance captured by each component.
for desctouse in (pbe_diff_energy_descriptors, hf_diff_energy_descriptors,
                  fulldescriptors, fullenergydescriptors):
    X, Y, features_names = commonutils.build_features_matrix_and_labels(
        molnames, desctouse, labels
    )
    pca = PCA(n_components=3)
    fit = pca.fit(X)
    # summarize components
    print(f"Explained Variance: {fit.explained_variance_ratio_}")
    #print(fit.components_)

Try using Permutation feature importance

In [None]:
from sklearn.inspection import permutation_importance

# For every descriptor set: refit the PLS model, then rank the features by
# permutation importance on the held-out split under three scoring metrics.
for desctouse in (pbe_diff_energy_descriptors, hf_diff_energy_descriptors,
                  fulldescriptors, fullenergydescriptors):
    moldescriptors_featues, Y, features_names = commonutils.build_features_matrix_and_labels(
        molnames, desctouse, labels
    )
    (compstouse, rmse, r2, rmse_full, r2_full, rmse_test, r2_test,
     rmse_train, r2_train, plsmodel, X_train, X_test, y_train, y_test) = run_all_pls(
        moldescriptors_featues, Y
    )

    scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error']

    r_multi = permutation_importance(plsmodel, X_test, y_test, scoring=scoring,
                                     n_repeats=30, random_state=0)

    for metric, r in r_multi.items():
        print(f"{metric}")
        # Most important feature first.
        ranking = r.importances_mean.argsort()[::-1]
        for i in ranking:
            # Report only features whose mean importance stays positive after
            # subtracting two standard deviations.
            if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
                print(f"{features_names[i]:<30}"
                      f"{r.importances_mean[i]:.3e}"
                      f" +/- {r.importances_std[i]:.3e}")
        print("")


Focus on the full descriptor set restricted to energy descriptors only (fullenergydescriptors)

In [None]:
# Energy-difference descriptors only; stored as a dict keyed by molecule index.
fullenergydescriptors = {}
for idx in range(len(molnames)):
    energy_only = {}
    energy_only.update(hf_diff_energy_descriptors[idx])
    energy_only.update(pbe_diff_energy_descriptors[idx])
    fullenergydescriptors[idx] = energy_only

print("Num. of Comp , LoO RMSE , LoO R2 , RMSE , R2 , RMSE Test , R2 Test , RMSE Train , R2 Train")

moldescriptors_featues, Y, features_names = commonutils.build_features_matrix_and_labels(
    molnames, fullenergydescriptors, labels
)
(compstouse, rmse, r2, rmse_full, r2_full, rmse_test, r2_test,
 rmse_train, r2_train, plsmodel, X_train, X_test, y_train, y_test) = run_all_pls(
    moldescriptors_featues, Y
)
# sep reproduces the original "value  ,  value" spacing exactly.
print(compstouse, rmse, r2, rmse_full, r2_full,
      rmse_test, r2_test, rmse_train, r2_train, sep="  ,  ")

scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error']

# Permutation importance of the fitted model on the held-out split.
r_multi = permutation_importance(plsmodel, X_test, y_test, scoring=scoring,
                                 n_repeats=30, random_state=0)

for metric, r in r_multi.items():
    print(f"{metric}")
    # Most important feature first.
    ranking = r.importances_mean.argsort()[::-1]
    for i in ranking:
        # Report only features whose mean importance stays positive after
        # subtracting two standard deviations.
        if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
            print(f"{features_names[i]:<30}"
                  f"{r.importances_mean[i]:.3e}"
                  f" +/- {r.importances_std[i]:.3e}")
    print("")

Manual prediction 

In [None]:
# Recompute the model's prediction for X_test by hand from the fitted model's
# attributes (centre, scale, multiply by the coefficients, add the target
# mean), so it can be compared with plsmodel.predict(X_test) below.
# NOTE(review): _x_mean, _x_std and _y_mean are private scikit-learn
# attributes, and the shape/scaling of coef_ has changed between scikit-learn
# releases - confirm this recipe against the installed version.
X = X_test.copy()  # copy first: the in-place operations below modify X
X -= plsmodel._x_mean
X /= plsmodel._x_std
Ypred = np.dot(X, plsmodel.coef_)
# Alternative with transposed coefficients, kept for reference
# (presumably for releases where coef_ is stored as (n_targets, n_features)).
#Ypred = np.dot(X, plsmodel.coef_.T)
Ypred += plsmodel._y_mean

print(Ypred)
# Last expression of the cell: shown as the cell output for comparison.
plsmodel.predict(X_test)

Test using NN

In [None]:
# Energy-difference descriptors only; stored as a dict keyed by molecule index.
fullenergydescriptors = {}
for idx in range(len(molnames)):
    energy_only = {}
    energy_only.update(hf_diff_energy_descriptors[idx])
    energy_only.update(pbe_diff_energy_descriptors[idx])
    fullenergydescriptors[idx] = energy_only

moldescriptors_featues, Y, features_names = commonutils.build_features_matrix_and_labels(
    molnames, fullenergydescriptors, labels
)
perc_split = 0.20

# Candidate architectures handed to the search.
# NOTE(review): each inner list presumably gives the units per hidden layer -
# confirm against models.nn_model.
handcrafted_shapes = [
    [2, 32, 64, 128, 32],
    [2, 16, 32, 64, 128, 32],
    [2, 16, 32, 64, 128, 32, 16],
    [2, 8, 16, 32, 64, 32, 16, 8],
]
# Constant-width stacks: depths 5, 4, 3 and, within each depth, widths 8..128.
uniform_shapes = [[width] * depth
                  for depth in (5, 4, 3)
                  for width in (8, 16, 32, 64, 128)]
modelshapes = handcrafted_shapes + uniform_shapes

min_rmsemodel, maxr2_model = models.nn_model(
    perc_split,
    moldescriptors_featues,
    Y,
    nepochs=[10, 20],
    modelshapes=modelshapes,
    batch_sizes=[8, 16, 32],
    inputshape=-1,
    search=True,
)

Test using RF

In [None]:
# Energy-difference descriptors only; stored as a dict keyed by molecule index.
fullenergydescriptors = {}
for idx, _ in enumerate(molnames):
    fullenergydescriptors[idx] = {}
    fullenergydescriptors[idx].update(hf_diff_energy_descriptors[idx])
    fullenergydescriptors[idx].update(pbe_diff_energy_descriptors[idx])

# The helper lives in commonutils; calling it unqualified raised NameError on
# a fresh kernel, so it is now referenced the same way as in every other cell.
moldescriptors_featues, Y, features_names = \
        commonutils.build_features_matrix_and_labels (molnames, fullenergydescriptors , labels)
perc_split = 0.20
# Hyper-parameter search; returns four hyper-parameter sets, named after the
# criterion each one optimises.
min_train_rmse_hyper, min_test_rmse_hyper, max_train_r2_hyper, max_test_r2_hyper = \
    models.rf_model (perc_split, moldescriptors_featues, Y, search = True)

In [None]:
# Refit one random forest per selected hyper-parameter set and print
# train RMSE, test RMSE, train R2 and test R2 for each.
perc_split = 0.2
hyper_keys = ('n_estimators', 'max_depth', 'min_samples_split',
              'min_samples_leaf', 'random_state', 'bootstrap', 'max_features')
for d in (min_train_rmse_hyper, min_test_rmse_hyper,
          max_train_r2_hyper, max_test_r2_hyper):
    # Each hyper-parameter is passed as a one-element list, in the positional
    # order given by hyper_keys.
    single_value_grid = [[d[key]] for key in hyper_keys]
    (train_rmse, test_rmse, r2_train, r2_test, model,
     X_train, X_test, y_train, y_test) = models.rf_model(
        perc_split, moldescriptors_featues, Y, False, *single_value_grid
    )

    # sep reproduces the original "value  ,  value" spacing exactly.
    print(train_rmse, test_rmse, r2_train, r2_test, sep="  ,  ")