In [6]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

from dataclasses import dataclass
import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy

In [7]:
@dataclass
class ModelResults:
    # model related data
    plsmodel = None
    # predicted values
    y_pred: list = None
    y_pred_bestourmethod: list = None
    y_pred_bestinsidemethod: list = None
    # data related to full set
    fulldescriptors: list = None
    labels: list = None
    top_correlation: list = None
    # data realated to inside and our methods
    inside_methods_rmse: list = None
    bestinsidemethod_rmse: float = 0.0
    bestinsidemethod: str = None
    inside_methods_r2: list = None
    our_methods_rmse: dict = None
    bestourmethod_rmse: float = 0.0
    bestourmethod: str = None
    our_methods_r2: dict = None
    our_methods_name : list = None

def read_and_init (inrootdir, supersetnames, howmanydifs, methods, \
                   DEBUG=False):
    
    allvalues_perset = {}
    fullsetnames = []
    models_results = {}

    toberemoved = {}
    for super_setname in supersetnames:
        toberemoved[super_setname] = []
        allvalues_perset[super_setname] = []
        fullsetnames.append(super_setname)
        for i, setname in enumerate(supersetnames[super_setname]):
              print("Reading dataset: ", setname)
              rootdir = inrootdir + super_setname + "/" +setname
              labelsfilename = inrootdir + setname +"_labels.txt"
        
              values =\
                    commonutils.read_dataset(rootdir, labelsfilename, \
                                             howmanydifs, methods, \
                                             debug=DEBUG)
              
              if (values is None) or (len(values) <= 2):
                    print(setname + " No data found for this dataset")
                    print("")
                    toberemoved[super_setname].append(i)
              else:
                    fullsetname = super_setname+"_"+setname
                    fullsetnames.append(fullsetname)
                    allvalues_perset[fullsetname] = values  
                    print("Number of samples: ", len(allvalues_perset[fullsetname]))
                    print("Number of basic descriptors: ", len(allvalues_perset[fullsetname]))

                    allvalues_perset[super_setname] += allvalues_perset[fullsetname]
                    print("")

    for super_setname in toberemoved:
        for i in sorted(toberemoved[super_setname], reverse=True):
          del supersetnames[super_setname][i]
 
    allvalues_perset["Full"] = []
    for super_setname in supersetnames:
          allvalues_perset["Full"] += allvalues_perset[super_setname]  
    fullsetnames.append("Full")

    for setname in fullsetnames:
        models_results[setname] = ModelResults()

    return allvalues_perset, fullsetnames, models_results


In [8]:
warnings.simplefilter("ignore")
CORRCUT = 0.98

supersetnames = {"BARRIER" : \
                       ["BH76","BHDIV10","BHPERI",\
                        "BHROT27","INV24","PX13","WCPT18"] \
                    ,"INTRA" : \
                       ["ACONF",'Amino20x4',"BUT14DIOL",\
                        "ICONF","IDISP","MCONF",\
                        "PCONF21","SCONF","UPU23"] , \
                    "SMALL" :\
                        ["AL2X6","ALK8","ALKBDE10","BH76RC",\
                         "DC13","DIPCS10","FH51","G21EA",\
                         "G21IP","G2RC","HEAVYSB11","NBPRC",\
                         "PA26","RC21","SIE4x4","TAUT15",\
                         "W4-11","YBDE18"], \
                    "INTER" :\
                       ["ADIM6","AHB21","CARBHB12",\
                        "CHB6","HAL59","HEAVY28","IL16",\
                        "PNICO23","RG18","S22","S66","WATER27"] , \
                    "LARGE" :\
                        ["BSR36","C60ISO","CDIE20","DARC",\
                         "ISO34","ISOL24","MB16-43","PArel",\
                            "RSE43"]}           

howmanydifs = 3
insidemethodslist = ['W','D3(0)','D3(BJ)']
energy_decomp_terms = ["Nuclear Repulsion  :","One Electron Energy:",\
                       "Two Electron Energy:","Potential Energy   :",\
                        "Kinetic Energy     :","E(X)               :",\
                        "E(C)               :","Dispersion correction",\
                        "FINAL SINGLE POINT ENERGY"]

energy_decomp_terms = ["FINAL SINGLE POINT ENERGY"]

methods_keys = ["PBE-MINIX","PBE-SVP","PBE-TZVP",\
                "PBE0-MINIX","PBE0-SVP","PBE0-TZVP",\
                "TPSS-MINIX","TPSS-SVP","TPSS-TZVP",\
                "TPSSh-MINIX","TPSSh-SVP","TPSSh-TZVP"]

methods_keys = ["PBE-MINIX"]     # This line allows to define the 
                                            # the functional-basis set data to use
                                            # in the calculations

methods = {key: energy_decomp_terms for key in methods_keys}

# read all the data and initialize the data structures
rootdir = "../Datasets/FullDataSet/"  
allvalues_perset, fullsetnames, models_results = \
        read_and_init (rootdir, supersetnames, howmanydifs, methods, \
                       DEBUG=True)

Reading dataset:  BH76
Number of samples:  76
Number of basic descriptors:  76

Reading dataset:  BHDIV10
Number of samples:  10
Number of basic descriptors:  10

Reading dataset:  BHPERI
Number of samples:  26
Number of basic descriptors:  26

Reading dataset:  BHROT27
Number of samples:  27
Number of basic descriptors:  27

Reading dataset:  INV24
Number of samples:  24
Number of basic descriptors:  24

Reading dataset:  PX13
Number of samples:  13
Number of basic descriptors:  13

Reading dataset:  WCPT18
Number of samples:  18
Number of basic descriptors:  18

Reading dataset:  ACONF
Number of samples:  15
Number of basic descriptors:  15

Reading dataset:  Amino20x4
Number of samples:  80
Number of basic descriptors:  80

Reading dataset:  BUT14DIOL
Number of samples:  64
Number of basic descriptors:  64

Reading dataset:  ICONF
Number of samples:  17
Number of basic descriptors:  17

Reading dataset:  IDISP
Number of samples:  6
Number of basic descriptors:  6

Reading dataset:  

In [9]:
# WTMAD_lf calc 
# Test with initial dictionary

# Creating the list to feed the WTMAD calc:

fullandsupersets = ["Full","SMALL","LARGE","BARRIER","INTER","INTRA"]
datasets = [dataset for sublist in supersetnames.values() for dataset in sublist]

id_list = []
lab_list = []
pred_list = [] # To test the function, calculations using PBE-MINIX are being used

for setname in fullsetnames:
    if setname not in fullandsupersets:
        for val in allvalues_perset[setname]:
            identifier = setname+'_'+str(val['chemicals'])
            id_list.append(identifier)
            lab_list.append(val['label'])
            pred_list.append(val['PBE-MINIX_energydiff']["PBE-MINIX_FINAL_SINGLE_POINT_ENERGY"])
            #pred_list.append(val['label']+val['difs'][2]) #This line allowed to test the WTMAD-2 calculation by operating over reference data


from commonutils import wtmad2_lf

# WTMAD2 calculation for reference values just as example:

wtmad2 = wtmad2_lf(id_list,lab_list,pred_list,fullandsupersets,datasets)

print(wtmad2)


       Set  WTMAD-2
0    SMALL    61.17
1    LARGE    43.15
2  BARRIER    24.64
3    INTER    20.81
4    INTRA     5.39
5     Full    45.90


In [10]:
for val in allvalues_perset:
    print("======= START =======")
    print(val, len(allvalues_perset[val]))
    pp.pprint(allvalues_perset[val])
    print("=======  END  =======")

BARRIER 194
[
    {
        'stechio_ceofs': [-1, -1, 1],
        'chemicals': ['h', 'n2o', 'n2ohts'],
        'label': 17.7,
        'difs': [-7.55, -7.74, -7.96],
        'PBE-MINIX_energydiff': {
            'PBE-MINIX_FINAL_SINGLE_POINT_ENERGY': -11.632145586501455
        }
    },
    {
        'stechio_ceofs': [-1, -1, 1],
        'chemicals': ['oh', 'n2', 'n2ohts'],
        'label': 82.6,
        'difs': [-30.06, -30.31, -30.75],
        'PBE-MINIX_energydiff': {
            'PBE-MINIX_FINAL_SINGLE_POINT_ENERGY': 61.015986088566265
        }
    },
    {
        'stechio_ceofs': [-1, -1, 1],
        'chemicals': ['h', 'hf', 'hfhts'],
        'label': 42.1,
        'difs': [-13.79, -13.84, -13.92],
        'PBE-MINIX_energydiff': {
            'PBE-MINIX_FINAL_SINGLE_POINT_ENERGY': 19.52816423405176
        }
    },
    {
        'stechio_ceofs': [-1, -1, 1],
        'chemicals': ['hf', 'h', 'hfhts'],
        'label': 42.1,
        'difs': [-13.79, -13.84, -13.92],
        'PBE-M

In [11]:
allfeatures = set()

metricsets = ["SMALL"]
includeFull = True

for setname in fullsetnames:
    for val in allvalues_perset[setname]:
        for k in val:
            if k.find("energydiff") != -1:
                for f in val[k]:
                    allfeatures.add(f)
for setname in fullsetnames:
    models_results[setname].inside_methods_rmse = []
    models_results[setname].inside_methods_r2 = []
    models_results[setname].our_methods_rmse = {}
    models_results[setname].our_methods_r2 = {}
    
    models_results[setname].bestinsidemethod_rmse = float("inf")
    models_results[setname].bestinsidemethod = ""
    models_results[setname].bestourmethod_rmse = float("inf")
    models_results[setname].bestourmethod = ""
    models_results[setname].our_methods_name = []

    for methodid in range(howmanydifs):
        y_pred = []
        labels = []
        for val in allvalues_perset[setname]:
            y_pred.append(val["label"] + val["difs"][methodid])
            labels.append(val["label"])
        
        r2 = r2_score(labels, y_pred)
        rmse = mean_squared_error(labels, y_pred, squared=False)
        models_results[setname].inside_methods_rmse.append(rmse)
        models_results[setname].inside_methods_r2.append(r2)

        if rmse < models_results[setname].bestinsidemethod_rmse:
            models_results[setname].bestinsidemethod_rmse = rmse
            #models_results[setname].bestinsidemethod = str(methodid)
            models_results[setname].bestinsidemethod = insidemethodslist[methodid]
            models_results[setname].y_pred_bestinsidemethod = y_pred

    for j, method in enumerate(methods):
        y_pred = []
        labels = []
        for val in allvalues_perset[setname]:
            y_pred.append(val[method + "_energydiff"][method+"_FINAL_SINGLE_POINT_ENERGY"])
            labels.append(val["label"])
        
        r2 = r2_score(labels, y_pred)
        rmse = mean_squared_error(labels, y_pred, squared=False)

        models_results[setname].our_methods_rmse[method] = rmse
        models_results[setname].our_methods_r2[method] = r2
        models_results[setname].our_methods_name.append(method)

        if rmse < models_results[setname].bestourmethod_rmse:
            models_results[setname].bestourmethod_rmse = rmse
            models_results[setname].bestourmethod = method
            models_results[setname].y_pred_bestourmethod = y_pred


######################################################################################
# WTMAD2 Calculation for reference methods (wtmad_ref):

wtmad2_ref = commonutils.wtmad_ref(fullsetnames, metricsets, howmanydifs,allvalues_perset, includeFull)

print("WTMAD-2 for inside methods", "\n")
print(wtmad2_ref, '\n')

# WTMAD2 Calculation for our methods (wtmad):

wtmad2 = commonutils.wtmad(fullsetnames, metricsets, methods,allvalues_perset, includeFull)

print("WTMAD-2 for our methods", "\n")
print(wtmad2, "\n")


######################################################################################


print("Results for inside and our methods")
print("%40s"% "Dataset", " , ", \
      "Best inside method", " , ", \
      "RMSE", " , ", \
      "Best our method", " , ", \
      "RMSE")
for setname in fullsetnames:
    print("%40s"%setname, " , ", \
        models_results[setname].bestinsidemethod , " , ",\
        "%7.3f"%models_results[setname].bestinsidemethod_rmse, " , ", \
        "%5s"%models_results[setname].bestourmethod , " , ", \
        "%7.3f"%models_results[setname].bestourmethod_rmse)


WTMAD-2 for inside methods 

  Superset  WTMAD-2_0  WTMAD-2_1  WTMAD-2_2
0    SMALL      13.26      13.01      13.31
1     Full      13.26      13.01      13.31 

WTMAD-2 for our methods 

  Superset  WTMAD-2_PBE-MINIX
0    SMALL              61.15
1     Full              61.15 

Results for inside and our methods
                                 Dataset  ,  Best inside method  ,  RMSE  ,  Best our method  ,  RMSE
                                 BARRIER  ,  W  ,    8.201  ,  PBE-MINIX  ,   28.824
                            BARRIER_BH76  ,  W  ,   10.387  ,  PBE-MINIX  ,   32.138
                         BARRIER_BHDIV10  ,  W  ,    9.191  ,  PBE-MINIX  ,   17.428
                          BARRIER_BHPERI  ,  W  ,    4.407  ,  PBE-MINIX  ,   20.851
                         BARRIER_BHROT27  ,  D3(0)  ,    0.611  ,  PBE-MINIX  ,    3.211
                           BARRIER_INV24  ,  D3(BJ)  ,    2.658  ,  PBE-MINIX  ,    5.864
                            BARRIER_PX13  ,  W  ,   11.730  ,  

In [None]:
setname = "Full"
models_results[setname].fulldescriptors = []
models_results[setname].labels = []
for idx, val in enumerate(allvalues_perset[setname]):
    models_results[setname].fulldescriptors.append({})
    for method in methods:
        models_results[setname].fulldescriptors[idx].update(val[method+"_energydiff"])

    models_results[setname].labels.append(val["label"])

X, Y, features_names = \
    commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                                 models_results[setname].labels)

df = pd.DataFrame(X, columns=features_names)

corr = df.corr().abs()
top_correlation = {}
for k in corr.columns:
    top_correlation[k] = []
    print(k, " ")
    for idx, v in enumerate(corr[k]):
        if v > CORRCUT and corr.index[idx] != k:
            top_correlation[k].append((corr.index[idx], v))
            print(" %40s %4.2f"%(corr.index[idx], v))

In [None]:
featurestorms = set()
for tc in top_correlation:
    if tc not in featurestorms:
        for correlated in top_correlation[tc]:
            featurestorms.add(correlated[0])

print ("Features that are correlated with others and to remove")  
for idx, k in enumerate(featurestorms):
    print(idx+1 , " ", k)

In [None]:
print ("Features TO USE")
idx = 1
for f in allfeatures:
    if f not in featurestorms:
        print(idx, f) 
        idx += 1 

In [None]:
featurestorm = list(featurestorms)

for setname in fullsetnames:
    commonutils.remove_features_fromset(allvalues_perset[setname], \
                                            list(featurestorm), \
                                            methods)

for setname in fullsetnames:
    models_results[setname].fulldescriptors = []
    models_results[setname].labels = []
    for idx, val in enumerate(allvalues_perset[setname]):
        models_results[setname].fulldescriptors.append({})
        for method in methods:
            if method+"_energydiff" in val:
                models_results[setname].fulldescriptors[idx].update(\
                    val[method+"_energydiff"])

        models_results[setname].labels.append(val["label"])

setname = "Full"
X, Y, features_names = \
    commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                                 models_results[setname].labels)

df = pd.DataFrame(X, columns=features_names)

In [None]:
import seaborn as sns
%matplotlib inline
print("Correlation matrix")
plt.rcParams['figure.figsize'] = 10,10
sns.set(font_scale=2)
sns.heatmap(df.corr().abs(), annot=True)
#print(df.corr().abs())
#sns.heatmap(df, annot=True)

In [None]:
perc_split = 0.2
for setname in fullsetnames:
       print("Running PLS for dataset: ", setname)

       X, Y, features_names = \
          commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                  models_results[setname].labels)
       maxcomp = X.shape[1]
       ncomps, rmses_test, rmses_train, r2s_test, r2s_train = \
              models.pls_model (perc_split, X, Y, \
              ncomp_start = 1, ncomp_max = maxcomp)
       r2max_comps = np.argmax(r2s_test)+1
       rmsemin_comps = np.argmin(rmses_test)+1
       compstouse = min(rmsemin_comps, r2max_comps)
       rmse_train, rmse_test, r2_train, \
          r2_test, rmse_full, r2_full , \
          models_results[setname].plsmodel, \
          X_train, X_test, y_train, y_test  = \
                  models.pls_model (perc_split, X, Y, False, compstouse)
       
       models_results[setname].y_pred = \
          models_results[setname].plsmodel.predict(X) 

In [None]:
print(" Dim , %40s"% "Dataset", " , ", \
      "Best inside method RMSE", " , ", \
      "Best our method RMSE", " , ", \
      "RMSE , R2 , " + \
      "RMSE (superset) , R2 (superset) ," + \
      "RMSE (Full) , R2 (Full) ")
pls_model_full = models_results["Full"].plsmodel
X, Y, features_names = \
    commonutils.build_XY_matrix (models_results["Full"].fulldescriptors, \
                                    models_results["Full"].labels)
y_pred = pls_model_full.predict(X)
rmse = mean_squared_error(Y, y_pred, squared=False)
r2 = r2_score(Y, y_pred)
print("%4d , %40s"%(len(models_results["Full"].labels), "Full"), " , ", \
    "%7.3f"%models_results["Full"].bestinsidemethod_rmse, " , ", \
    "%7.3f"%models_results["Full"].bestourmethod_rmse, " , ", \
    "%7.3f"%rmse, " , ", \
    "%7.3f"%r2, " , ", \
    "%7.3f"%rmse, " , ", \
    "%7.3f"%r2, " , ", \
    "%7.3f"%rmse, " , ", \
    "%7.3f"%r2)

for ssetname in supersetnames:
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[ssetname].fulldescriptors, \
                                    models_results[ssetname].labels)
    y_pred = pls_model_ssetname.predict(X)
    rmse = mean_squared_error(Y, y_pred, squared=False)
    r2 = r2_score(Y, y_pred)    

    y_pred_full = pls_model_full.predict(X) 
    rmse_full = mean_squared_error(Y, y_pred_full, squared=False)
    r2_full = r2_score(Y, y_pred_full)

    print("%4d , %40s"%(len(models_results[ssetname].labels), ssetname), " , ", \
        "%7.3f"%models_results[ssetname].bestinsidemethod_rmse, " , ", \
        "%7.3f"%models_results[ssetname].bestourmethod_rmse, " , ", \
        "%7.3f"%rmse, " , ", \
        "%7.3f"%r2, " , ", \
        "%7.3f"%rmse, " , ", \
        "%7.3f"%r2, " , ", \
        "%7.3f"%rmse_full, " , ", \
        "%7.3f"%r2_full)
    
    for isetname in supersetnames[ssetname]:
        setname = ssetname + "_" + isetname
        pls_model = models_results[setname].plsmodel    
        X, Y, features_names = \
            commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                                    models_results[setname].labels)
        y_pred = pls_model.predict(X)
        rmse = mean_squared_error(Y, y_pred, squared=False)
        r2 = r2_score(Y, y_pred)

        y_pred_ssetname = pls_model_ssetname.predict(X)
        rmse_ssetname = mean_squared_error(Y, y_pred_ssetname, squared=False)
        r2_ssetname = r2_score(Y, y_pred_ssetname)

        y_pred_full = pls_model_full.predict(X)
        rmse_full = mean_squared_error(Y, y_pred_full, squared=False)
        r2_full = r2_score(Y, y_pred_full)

        print("%4d , %40s"%(len(models_results[setname].labels), setname), " , ", \
            "%7.3f"%models_results[setname].bestinsidemethod_rmse, " , ", \
            "%7.3f"%models_results[setname].bestourmethod_rmse, " , ", \
            "%7.3f"%rmse, " , ", \
            "%7.3f"%r2, " , ", \
            "%7.3f"%rmse_ssetname, " , ", \
            "%7.3f"%r2_ssetname, " , ", \
            "%7.3f"%rmse_full, " , ", \
            "%7.3f"%r2_full)


In [None]:
# test scaling Y
X, Y, features_names = \
    commonutils.build_XY_matrix (models_results["Full"].fulldescriptors, \
                                    models_results["Full"].labels)
scalery = preprocessing.StandardScaler().fit(Y.reshape(-1, 1))   
Y_s = scalery.transform(Y.reshape(-1, 1))
scalerx = preprocessing.StandardScaler().fit(X)
X_s = scalerx.transform(X)
  
perc_split = 0.2
maxcomp = X_s.shape[1]

ncomps, rmses_test, rmses_train, r2s_test, r2s_train = \
              models.pls_model (perc_split, X_s, Y_s, \
              ncomp_start = 1, ncomp_max = maxcomp)
r2max_comps = np.argmax(r2s_test)+1
rmsemin_comps = np.argmin(rmses_test)+1
compstouse = min(rmsemin_comps, r2max_comps)
rmse_train, rmse_test, r2_train, \
    r2_test, rmse_full, r2_full , \
    plsmodel, \
    X_train, X_test, y_train, y_test  = \
        models.pls_model (perc_split, X_s, Y_s, False, compstouse)
       
y_pred_s = plsmodel.predict(X_s)
y_pred = scalery.inverse_transform(y_pred_s)

rmse = mean_squared_error(Y, y_pred, squared=False)
r2 = r2_score(Y, y_pred)

print("Scaling Y")
print("RMSE %7.3f"%rmse, " , ", "R2 %7.3f"%r2)  


In [None]:
setname = "Full"
pls_model_full = models_results[setname].plsmodel

for setname in fullsetnames:

    pls_model = models_results[setname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                                    models_results[setname].labels)
    y_pred_full = pls_model_full.predict(X)
    rmse_full = mean_squared_error(Y, y_pred_full, squared=False)
    r2_full = r2_score(Y, y_pred_full)

    y_pred = pls_model.predict(X)
    rmse = mean_squared_error(Y, y_pred, squared=False)
    r2 = r2_score(Y, y_pred)

    print(setname)
    print("RMSE %7.3f"%rmse, " R2 %7.3f"%r2)
    print("RMSE (Full) %7.3f"%rmse_full, " R2 (Full) %7.3f"%r2_full)
    plt.clf()
    fig, ax = plt.subplots()
    ax.scatter(Y, y_pred, c='b', s=50, label='PLS Model')
    ax.scatter(Y, y_pred_full, c='black', s=50, label='PLS Full Model')
    ax.scatter(Y, models_results[setname].y_pred_bestinsidemethod, \
                c='r', s=50, label=f"ref-PBE-{models_results[setname].bestinsidemethod}")
    ax.scatter(Y, models_results[setname].y_pred_bestourmethod, \
               c='g', s=50, label=f"{models_results[setname].bestourmethod}")
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.legend(loc="upper left")
    plt.title(f"PLS {setname} model")
    plt.show()
