In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

from dataclasses import dataclass
import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy
import pickle

In [None]:
# Silence every warning for the rest of the notebook.
warnings.simplefilter("ignore")

# Used downstream as the number of entries read from each record's "difs" list.
howmanydifs = 3


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so that the handle is closed
    as soon as the object has been read.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by a trusted run of this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


allvalues_perset = _load_pickle("allvalues_perset.p")
methods = _load_pickle("methods.p")
fullsetnames = _load_pickle("fullsetnames.p")
functionals = _load_pickle("functionals.p")
basis_sets = _load_pickle("basis_sets.p")
supersetnames = _load_pickle("supersetnames.p")

In [None]:
# #for debug purposes
# for val in allvalues_perset:
#    print("======= START =======")
#    print(val, len(allvalues_perset[val]))
#    pp.pprint(allvalues_perset[val])
#    print("=======  END  =======")

In [None]:
# # Testing of WTMAD-2 function

# fullandsupersets = list(supersetnames.keys())
# fullandsupersets.append("Full")

# id_list = []
# lab_list = []
# pred_list = [] # To test the function, calculations using PBE-MINIX are being used

# for setname in fullsetnames:
#     if setname not in fullandsupersets:
#         for val in allvalues_perset[setname]:
#             id_list.append(setname)
#             lab_list.append(val['label'])
#             #pred_list.append(val['PBE-MINIX_energydiff']["PBE-MINIX_FINAL_SINGLE_POINT_ENERGY"])
#             pred_list.append(val['label']+val['difs'][2]) #This line allowed to test the WTMAD-2 calculation by operating over reference data


# from commonutils import wtmad2

# # WTMAD2 calculation for reference values just as example:

# wtmad2 = wtmad2(id_list,lab_list,pred_list)

# print(wtmad2)

In [None]:
# Pick up local edits to commonutils without restarting the kernel.
from importlib import reload
reload(commonutils)

from commonutils import ModelResults

# Names of all quantities stored under any key containing "energydiff".
allfeatures = set()
for setname in fullsetnames:
    for val in allvalues_perset[setname]:
        for key in val:
            if "energydiff" in key:
                allfeatures.update(val[key])

# One ModelResults per dataset, filled with the labels and the set identifiers
# ("<superset>_<set>") of every record of that dataset.
models_results = {}
for setname in fullsetnames:
    set_results = ModelResults()
    for val in allvalues_perset[setname]:
        set_results.labels.append(val["label"])
        set_results.supersetnames.append(val["super_setname"])
        set_results.setnames.append(val["super_setname"] + "_" + val["setname"])
    models_results[setname] = set_results

# Display names matching, by position, the entries of each record's "difs" list.
insidemethods = ["W", "D3(0)", "D3(BJ)"]

# Superset names plus the "Full" dataset.
superset_list = [*supersetnames, "Full"]

# For every dataset, find the best reference ("inside") scheme and the best of
# our own methods, ranked separately by RMSE and by WTMAD2.
#
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while the square root
# of the MSE works on every version and is identical for a single target.
for setname in fullsetnames:
    set_results = models_results[setname]
    set_values = allvalues_perset[setname]
    # WTMAD2 is only evaluated for the names listed in superset_list.
    has_wtmad = setname in superset_list

    # --- reference schemes: label + stored difference -----------------------
    for methodid in range(howmanydifs):
        y_pred = [val["label"] + val["difs"][methodid] for val in set_values]

        if has_wtmad:
            wtmadf = commonutils.wtmad2(set_results.setnames,
                                        set_results.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < set_results.bestinsidemethod_wtmad:
                set_results.bestinsidemethod_wtmad = wtmad
                set_results.bestinsidemethod_name_wtmad = insidemethods[methodid]
                set_results.y_pred_bestinsidemethod_wtmad = y_pred
        else:
            # No WTMAD2 for this dataset: store placeholders.
            set_results.bestinsidemethod_wtmad = float("NaN")
            set_results.bestinsidemethod_name_wtmad = ""
            set_results.y_pred_bestinsidemethod_wtmad = y_pred

        rmse = np.sqrt(mean_squared_error(set_results.labels, y_pred))

        if rmse < set_results.bestinsidemethod_rmse:
            set_results.bestinsidemethod_rmse = rmse
            set_results.bestinsidemethod_name_rmse = insidemethods[methodid]
            set_results.y_pred_bestinsidemethod_rmse = y_pred

    # --- our methods: stored FINAL_SINGLE_POINT_ENERGY difference ------------
    for method in methods:
        energy_key = method + "_energydiff"
        value_key = method + "_FINAL_SINGLE_POINT_ENERGY"
        y_pred = [val[energy_key][value_key] for val in set_values]

        if has_wtmad:
            wtmadf = commonutils.wtmad2(set_results.setnames,
                                        set_results.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < set_results.bestourmethod_wtmad:
                set_results.bestourmethod_wtmad = wtmad
                set_results.bestourmethod_name_wtmad = method
                set_results.y_pred_bestourmethod_wtmad = y_pred
        else:
            # No WTMAD2 for this dataset: store placeholders.
            set_results.bestourmethod_wtmad = float("NaN")
            set_results.bestourmethod_name_wtmad = ""
            set_results.y_pred_bestourmethod_wtmad = y_pred

        rmse = np.sqrt(mean_squared_error(set_results.labels, y_pred))

        if rmse < set_results.bestourmethod_rmse:
            set_results.bestourmethod_rmse = rmse
            set_results.bestourmethod_name_rmse = method
            set_results.y_pred_bestourmethod_rmse = y_pred

# Comma-separated summary of the best methods for each superset and "Full".
# The separator reproduces print()'s default single space on both sides of
# an explicit " , " argument.
_field_sep = "  ,  "

print("Results for inside and our methods")
print("%40s" % "Dataset",
      "Best inside method RMSE",
      "RMSE",
      "Best inside method WTMAD2",
      "WTMAD2",
      "Best our method RMSE",
      "RMSE",
      "Best our method WTMAD2",
      "WTMAD2",
      sep=_field_sep)
for setname in superset_list:
    set_results = models_results[setname]
    print("%40s" % setname,
          "%10s" % set_results.bestinsidemethod_name_rmse,
          "%7.3f" % set_results.bestinsidemethod_rmse,
          "%10s" % set_results.bestinsidemethod_name_wtmad,
          "%7.3f" % set_results.bestinsidemethod_wtmad,
          "%10s" % set_results.bestourmethod_name_rmse,
          "%7.3f" % set_results.bestourmethod_rmse,
          "%10s" % set_results.bestourmethod_name_wtmad,
          "%7.3f" % set_results.bestourmethod_wtmad,
          sep=_field_sep)

In [None]:
# Descriptor configuration.
reference_basisset = "QZVP"
selected_functional = "TPSS"
selected_basissets = ["TZVP", "QZVP"]
# Overrides any previously defined `functionals` for the rest of the notebook.
functionals = ["TPSS"]

# Term lists; which one is used is decided by the two switches below.
hamiltonian_terms = [
    "Kinetic_Energy",
    "Nuclear_Repulsion",
    "One_Electron_Energy",
    "Two_Electron_Energy",
    "Dispersion_correction",
]
edecomp_terms = [
    "Kinetic_Energy",
    "Nuclear_Repulsion",
    "One_Electron_Energy",
    "Two_Electron_Energy",
    "E(X)",
    "E(C)",
    "Dispersion_correction",
]

hamiltonian = True
energy_decomposition = False

# Default selection, used only when both switches are off.
descriptors_to_use = [
    "Kinetic_Energy",
    "Potential_Energy",
    "Dispersion_correction",
]
#descriptors_to_use = ["All"]

# energy_decomposition is checked last, so it wins when both switches are on.
if hamiltonian:
    descriptors_to_use = hamiltonian_terms
if energy_decomposition:
    descriptors_to_use = edecomp_terms

# Build the descriptor table (feature name -> list with one value per record)
# of every dataset and store it in models_results[setname].features.
#
# For the reference functional/basis the raw values are used; for every other
# combination the feature is "reference value - value" and its name gets the
# "_difftoref" suffix.
#
# The loops run functional/basis first and records second, so the derived
# descriptors are computed once per functional/basis instead of being rebuilt
# over the whole list after every record. Keys are inserted in the same order
# as before (raw keys of a basis, then its derived keys, then the next basis).

# Loop-invariant switches and reference key.
derive_hamiltonian = hamiltonian == True and energy_decomposition == False
derive_edecomp = energy_decomposition == True and hamiltonian == False
refk = selected_functional + "-" + reference_basisset + "_energydiff"

for setname in fullsetnames:
    descriptors = {}
    set_values = allvalues_perset[setname]

    for func in functionals:
        for basis in selected_basissets:
            ham = func + "-" + basis
            k = ham + "_energydiff"
            is_reference = basis == reference_basisset and func == selected_functional
            suffix = "" if is_reference else "_difftoref"

            for val in set_values:
                for k2 in val[k]:
                    # Keep k2 when it ends with one of the requested terms,
                    # or keep everything when the first entry is "All".
                    selected = (any(k2.endswith(descr) for descr in descriptors_to_use)
                                or descriptors_to_use[0] == "All")
                    if not selected:
                        continue

                    if is_reference:
                        value = val[k][k2]
                    else:
                        # Same quantity, looked up under the reference
                        # functional/basis.
                        refk2 = k2.replace(basis, reference_basisset)
                        refk2 = refk2.replace(func, selected_functional)
                        value = val[refk][refk2] - val[k][k2]

                    descriptors.setdefault(k2 + suffix, []).append(value)

            if len(set_values) == 0:
                # Nothing was collected, so there is nothing to derive from.
                continue

            if derive_hamiltonian:
                # Electron repulsion is the two-electron energy (same list
                # object; the source key is deleted further down).
                descriptors[ham + "_Electron_Repulsion" + suffix] = \
                    descriptors[ham + "_Two_Electron_Energy" + suffix]

                # Nuclei-electron attraction = one-electron energy - kinetic energy.
                descriptors[ham + "_Nuclei_Electron_Attraction" + suffix] = [
                    one - kin for one, kin in zip(
                        descriptors[ham + "_One_Electron_Energy" + suffix],
                        descriptors[ham + "_Kinetic_Energy" + suffix])]

            if derive_edecomp:
                # Coulomb interaction = two-electron energy - E(X) - E(C).
                descriptors[ham + "_Coulomb_Interaction" + suffix] = [
                    two - ex - ec for two, ex, ec in zip(
                        descriptors[ham + "_Two_Electron_Energy" + suffix],
                        descriptors[ham + "_E(X)" + suffix],
                        descriptors[ham + "_E(C)" + suffix])]

                # Nuclei-electron attraction = one-electron energy - kinetic energy.
                descriptors[ham + "_Nuclei_Electron_Attraction" + suffix] = [
                    one - kin for one, kin in zip(
                        descriptors[ham + "_One_Electron_Energy" + suffix],
                        descriptors[ham + "_Kinetic_Energy" + suffix])]

    # The raw one-/two-electron terms were only needed to build the derived
    # descriptors above; drop them from the final feature table.
    if hamiltonian == True or energy_decomposition == True:
        extra_descriptors = ["Two_Electron_Energy", "One_Electron_Energy"]
        keys_to_delete = [key for key in descriptors
                          if any(exdesc in key for exdesc in extra_descriptors)]
        for key in keys_to_delete:
            del descriptors[key]

    models_results[setname].features = descriptors
    #print("Descriptors for ", setname)
    #for k in descriptors:
    #    print(k, len(descriptors[k]), descriptors[k])

# Feature selection.
# Every dataset must expose as many features as "Full"; stop otherwise.
setname = "Full"
numoffeat = len(models_results[setname].features)
print("Number of features for ", numoffeat)
for setname in fullsetnames:
    if len(models_results[setname].features) == numoffeat:
        continue
    print("Number of features for ", setname, " is different")
    sys.exit(1)

# Features that take a single value over the whole "Full" dataset.
toremove = []
setname = "Full"
for k, column in models_results[setname].features.items():
    if len(set(column)) == 1:
        toremove.append(k)
        print("Constant fatures to remove: ", k)

# Drop those constant features from every dataset.
for setname in fullsetnames:
    set_features = models_results[setname].features
    for k in toremove:
        del set_features[k]

# test print for debug
#for setname in fullsetnames:
#    print("Descriptors for ", setname)
#    for k in models_results[setname].features:
#        print(k, len(models_results[setname].features[k]), \
#           models_results[setname].features[k])

# Unconditionally drop the nuclear-repulsion difference-to-reference features.
print("Removing Nuclear Repulsion difference")
for setname in fullsetnames:
    set_features = models_results[setname].features
    toremove = [k for k in set_features if "Nuclear_Repulsion_difftoref" in k]
    for k in toremove:
        #print("Removing feature ", k)
        del set_features[k]

# Repeat the consistency check on the reduced feature tables.
setname = "Full"
numoffeat = len(models_results[setname].features)
print("Number of features for ", numoffeat)
for setname in fullsetnames:
    if len(models_results[setname].features) == numoffeat:
        continue
    print("Number of features for ", setname, " is different")
    sys.exit(1)

In [None]:
#for setname in fullsetnames:
#    print("Descriptors for ", setname)
#    for k in models_results[setname].features:
#        print(k, len(models_results[setname].features[k]), \
#           models_results[setname].features[k])

In [None]:
# Remove correlated features.
# Columns are scanned in order; a later column whose absolute correlation with
# the current one exceeds CORRCUT is discarded.
# NOTE: absolute correlations cannot exceed 1 (up to rounding), so with
# CORRCUT = 1 the filter keeps essentially every feature.
CORRCUT = 1

setname = "Full"
# Kept as an ordered list, not a set: iterating a set of strings depends on
# hash randomisation, which made the feature (column) order change from one
# kernel start to the next.
touse = []
# add by default the selected FINAL_SINGLE_POINT_ENERGY
toremove = set()
df = pd.DataFrame(models_results[setname].features)
corr = df.corr().abs()
for i, k in enumerate(corr.columns):
    print(i+1, " - ", k, " ", i)
    if k not in toremove:
        touse.append(k)
    for idx, v in enumerate(corr[k]):
        # Only look at columns that come after the current one.
        if v > CORRCUT and idx > i:
            print(" %60s %4.2f"%(corr.index[idx], v))
            if corr.index[idx] not in touse:
                toremove.add(corr.index[idx])

print("Features to use")
for i, feat in enumerate(touse):
    print(i+1 ,  " - " , feat)

# Copy the retained features, in column order, into every dataset.
for setname in fullsetnames:
    for k in touse:
        models_results[setname].uncorrelated_features[k] = \
            deepcopy(models_results[setname].features[k])

In [None]:
#for setname in fullsetnames:
#    print("Descriptors for ", setname)
#    i = 1
#    for k in models_results[setname].features:
#        print(i, " - ", k, len(models_results[setname].features[k]), \
#           models_results[setname].features[k])
#        i += 1

import seaborn as sns
%matplotlib inline

setname = "Full"
df = pd.DataFrame(models_results[setname].uncorrelated_features)
print("Correlation matrix")
plt.rcParams['figure.figsize'] = 60,60
sns.set(font_scale=2)
sns.heatmap(df.corr().abs(), annot=True)
#print(df.corr().abs())
#sns.heatmap(df, annot=True)

In [None]:
# Pick up local edits to the project modules without restarting the kernel.
import importlib
importlib.reload(models)
importlib.reload(commonutils)

# Fit one PLS model per superset and one for "Full"; the number of components
# is the one that gives the lowest WTMAD in the component scan.
perc_split = 0.2
for setname in list(supersetnames)+["Full"]:
    print("Running PLS for dataset: ", setname)
    set_results = models_results[setname]

    X, Y, features_names = \
        commonutils.build_XY_matrix (set_results.uncorrelated_features, \
                                     set_results.labels)
    # Plain copy of the per-record set identifiers (the former element-wise
    # loop did nothing more and left an unused variable behind).
    setlist = list(set_results.setnames)
    supersetlist = set_results.supersetnames
    maxcomp = X.shape[1]

    # Component scan. NOTE(review): presumably one score per component count
    # from ncomp_start to ncomp_max — confirm in models.pls_model.
    ncomps, rmses, r2s, wtmads = \
        models.pls_model (perc_split, X, Y, supersetlist, setlist, \
                          ncomp_start = 1, ncomp_max = maxcomp, split = False)
    # "+1" turns the 0-based position into a component count; this assumes the
    # scan starts at one component (ncomp_start = 1 above).
    r2max_comps = np.argmax(r2s)+1
    rmsemin_comps = np.argmin(rmses)+1
    wtmadmin_comps = np.argmin(wtmads)+1
    print("Best number of components for R2: ", r2max_comps)
    print("Best number of components for RMSE: ", rmsemin_comps)
    print("Best number of components for WTMAD: ", wtmadmin_comps)
    compstouse = wtmadmin_comps

    # Final fit with the selected number of components.
    rmse_train, rmse_test, r2_train, \
        r2_test, rmse_full, r2_full , \
        set_results.plsmodel, \
        X_train, X_test, y_train, y_test  = \
            models.pls_model (perc_split, X, Y, supersetlist, setlist, \
                              False, compstouse, split = False)

    set_results.y_pred = set_results.plsmodel.predict(X)

In [None]:
# RMSE table: for "Full", every superset and every set inside a superset,
# compare the best reference method, our best method, the superset PLS model
# and the Full PLS model.
#
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print(" Dim , %40s"% "Dataset", " , ", \
      "Best inside method RMSE", " , ", \
      "Best our method RMSE", " , ", \
      "RMSE (superset) ," + \
      "RMSE (Full)")
pls_model_full = models_results["Full"].plsmodel
X, Y, features_names = \
    commonutils.build_XY_matrix (models_results["Full"].uncorrelated_features, \
                                    models_results["Full"].labels)
y_pred = pls_model_full.predict(X)
rmse = np.sqrt(mean_squared_error(Y, y_pred))
r2 = r2_score(Y, y_pred)
# For "Full" the superset model is the Full model, hence the same value twice.
print("%4d , %40s"%(len(models_results["Full"].labels), "Full"), " , ", \
    "%7.3f"%models_results["Full"].bestinsidemethod_rmse, " , ", \
    "%7.3f"%models_results["Full"].bestourmethod_rmse, " , ", \
    "%7.3f"%rmse, " , ", \
    "%7.3f"%rmse)

for ssetname in supersetnames:
    sset_results = models_results[ssetname]
    pls_model_ssetname = sset_results.plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (sset_results.uncorrelated_features, \
                                    sset_results.labels)
    y_pred = pls_model_ssetname.predict(X)
    rmse = np.sqrt(mean_squared_error(Y, y_pred))

    y_pred_full = pls_model_full.predict(X)
    rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))

    print("%4d , %40s"%(len(sset_results.labels), ssetname), " , ", \
        "%7.3f"%sset_results.bestinsidemethod_rmse, " , ", \
        "%7.3f"%sset_results.bestourmethod_rmse, " , ", \
        "%7.3f"%rmse, " , ", \
        "%7.3f"%rmse_full)

    # Individual sets of this superset, scored with both models.
    for isetname in supersetnames[ssetname]:
        setname = ssetname + "_" + isetname
        set_results = models_results[setname]
        X, Y, features_names = \
            commonutils.build_XY_matrix (set_results.uncorrelated_features, \
                                    set_results.labels)

        y_pred_ssetname = pls_model_ssetname.predict(X)
        rmse_ssetname = np.sqrt(mean_squared_error(Y, y_pred_ssetname))

        y_pred_full = pls_model_full.predict(X)
        rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))

        print("%4d , %40s"%(len(set_results.labels), setname), " , ", \
            "%7.3f"%set_results.bestinsidemethod_rmse, " , ", \
            "%7.3f"%set_results.bestourmethod_rmse, " , ", \
            "%7.3f"%rmse_ssetname, " , ", \
            "%7.3f"%rmse_full)


In [None]:
# Per-dataset report: RMSE/WTMAD2 of the superset PLS model and of the Full PLS
# model, plus a scatter plot of predictions against reference values.
# The superset predictions are also accumulated (ypredFull / setnamesFull) so
# that a composite "superset models" score can be computed afterwards.
setname = "Full"
pls_model_full = models_results[setname].plsmodel
printonlysuperset = True
setnametouse = deepcopy(fullsetnames)
setnametouse.remove("Full")

ypredFull = []
setnamesFull = []

for setname in setnametouse:
    if setname in supersetnames:
        ssetname = setname
    else:
        # Name of the form "<superset>_<set>": the superset is everything
        # before the last underscore.
        # NOTE(review): assumes set names themselves contain no "_" — confirm.
        lastunder = setname.rfind("_")
        ssetname = setname[:lastunder]

    set_results = models_results[setname]
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (set_results.uncorrelated_features, \
                                    set_results.labels)
    # Per-record set identifiers (plain copy).
    setlist = list(set_results.setnames)

    # RMSE as sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
    y_pred_full = pls_model_full.predict(X)
    if len(y_pred_full.shape) == 2:
        y_pred_full = y_pred_full[:,0]
    rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))

    y_pred = pls_model_ssetname.predict(X)
    if len(y_pred.shape) == 2:
        y_pred = y_pred[:,0]
    rmse = np.sqrt(mean_squared_error(Y, y_pred))

    if setname in supersetnames:
        ypredFull.extend(list(y_pred))
        setnamesFull.extend(setlist)
        print("Results for ", setname, " dim: ", len(Y))
        wtmad2df = commonutils.wtmad2(setlist, Y, y_pred)
        wtmad2_fulldf = commonutils.wtmad2(setlist, Y, y_pred_full)
        wtmad2 = wtmad2df[setname]
        wtmad2_full = wtmad2_fulldf[setname] #When analysing supersets, WTMAD2 SS and Full should be equal
        print("WTMAD2","%15s,%7.3f"%("PLS Super Set",wtmad2))
        print("WTMAD2","%15s,%7.3f"%("PLS Full",wtmad2_full))
        print("WTMAD2","%15s,%7.3f"%(set_results.bestinsidemethod_name_wtmad, set_results.bestinsidemethod_wtmad))
        print("WTMAD2","%15s,%7.3f"%(set_results.bestourmethod_name_wtmad, set_results.bestourmethod_wtmad))

    if printonlysuperset and setname not in list(supersetnames.keys()) + ["Full"]:
        continue

    print("RMSE  ","%15s,%7.3f"%("PLS Super Set",rmse))
    print("RMSE  ","%15s,%7.3f"%("PLS Full", rmse_full))
    print("RMSE  ","%15s,%7.3f"%(set_results.bestinsidemethod_name_rmse,set_results.bestinsidemethod_rmse))
    print("RMSE  ","%15s,%7.3f"%(set_results.bestourmethod_name_rmse,set_results.bestourmethod_rmse))

    # No plt.clf() here: with no current figure it would create an empty one
    # that is then displayed next to the real plot.
    plt.rcParams['figure.figsize'] = 10,10
    fig, ax = plt.subplots()
    ax.scatter(Y, y_pred, c='b', s=50, label='PLS Super Set model')
    ax.scatter(Y, y_pred_full, c='r', s=50, label='PLS Full model')
    #ax.scatter(Y, models_results[setname].y_pred_bestinsidemethod, \
    #            c='r', s=50, label='Best inside method')
    ax.scatter(Y, set_results.y_pred_bestourmethod_rmse, \
               c='g', s=50, label=f"{set_results.bestourmethod_name_rmse}")
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    # Diagonal = perfect prediction.
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.legend(loc="upper left")
    plt.title(setname)
    plt.show()
    # Release the figure so figures do not pile up across iterations.
    plt.close(fig)

# Summary on the "Full" dataset: the single Full PLS model against the
# composite made of the per-superset models (ypredFull / setnamesFull, filled
# by the per-dataset loop).
print("Results for Full sim ", len(ypredFull))
X, Y, features_names = \
        commonutils.build_XY_matrix (models_results["Full"].uncorrelated_features, \
                                    models_results["Full"].labels)
# Set identifiers of the "Full" records, in the same order as Y.
setlist = list(models_results["Full"].setnames)

# Composite of the superset models.
# NOTE(review): Y is in "Full" order while ypredFull/setnamesFull follow the
# order in which supersets were processed; this is only valid if "Full" is the
# concatenation of the supersets in that same order — confirm.
wtmad2df = commonutils.wtmad2(setnamesFull, Y, ypredFull)
wtmad2 = wtmad2df["Full"]
# RMSE as sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(models_results["Full"].labels, ypredFull))

# Full model. Y and y_pred_full are both in "Full" order, so the matching set
# identifiers are the Full dataset's own (setlist), not setnamesFull.
y_pred_full = pls_model_full.predict(X)
if len(y_pred_full.shape) == 2:
    y_pred_full = y_pred_full[:,0]
rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))
wtmad2_fulldf = commonutils.wtmad2(setlist, Y, y_pred_full)
wtmad2_full = wtmad2_fulldf["Full"]

print("WTMAD2","%15s,%7.3f"%("PLS Full",wtmad2_full))
print("WTMAD2","%15s,%7.3f"%("PLS Super Set",wtmad2))
print("WTMAD2","%15s,%7.3f"%(models_results['Full'].bestinsidemethod_name_wtmad,models_results["Full"].bestinsidemethod_wtmad))
print("WTMAD2","%15s,%7.3f"%(models_results['Full'].bestourmethod_name_wtmad, models_results["Full"].bestourmethod_wtmad))

print("RMSE  ","%15s,%7.3f"%("PLS Full", rmse_full))
print("RMSE  ","%15s,%7.3f"%("PLS Super Set", rmse))
print("RMSE  ","%15s,%7.3f"%(models_results['Full'].bestinsidemethod_name_rmse, models_results["Full"].bestinsidemethod_rmse))
print("RMSE  ","%15s,%7.3f"%(models_results['Full'].bestourmethod_name_rmse, models_results["Full"].bestourmethod_rmse))

# No plt.clf() here: with no current figure it would create an empty one that
# is then displayed next to the real plot.
plt.rcParams['figure.figsize'] = 10,10
fig, ax = plt.subplots()
ax.scatter(Y, y_pred_full, c='r', s=50, label='PLS Full model')
ax.scatter(Y, ypredFull, c='b', s=50, label='PLS Super Set model')
lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]
# Diagonal = perfect prediction.
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend(loc="upper left")
plt.title("Full")
plt.show()


In [None]:
# Test and dump the PLS equations.
# For every dataset, print coefficient / mean / std of each feature for the
# Full model and for the dataset's superset model, and compare the RMSE from
# model.predict() with the RMSE obtained by applying the equation by hand.
#
# NOTE(review): `_x_mean`, `_x_std` and `_y_mean` are private attributes of the
# fitted model, and whether predict() itself divides by `_x_std` depends on the
# scikit-learn version and on the `scale` option used in models.pls_model; the
# "from eq." RMSE printed below is the check that the hand-written form agrees.
#
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
setname = "Full"
pls_model_full = models_results[setname].plsmodel

for setname in fullsetnames:
    print("Equations for dataset: ", setname)
    if setname in supersetnames or setname == "Full":
        ssetname = setname  
    else:    
        # "<superset>_<set>": the superset is everything before the last "_".
        lastunder = setname.rfind("_")
        ssetname = setname[:lastunder]
    
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
                                    models_results[setname].labels)
    
    # --- Full model ----------------------------------------------------------
    y_pred_full = pls_model_full.predict(X)
    rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))
    r2_full = r2_score(Y, y_pred_full)
    # Hand-applied equation: standardise X, apply coefficients, add Y mean.
    X_e = X.copy()
    X_e -= pls_model_full._x_mean
    X_e /= pls_model_full._x_std
    y_pred_full_e = np.dot(X_e, pls_model_full.coef_.T)
    y_pred_full_e += pls_model_full._y_mean
    rmse_full_e = np.sqrt(mean_squared_error(Y, y_pred_full_e))
    print("   Full dataset equations Y mean %7.3f"%pls_model_full._y_mean)
    for i, f in enumerate(features_names):
        print(" %50s %7.3f [%15.3f %15.3f]"%(f, \
            pls_model_full.coef_.T[i],
            pls_model_full._x_mean[i], 
            pls_model_full._x_std[i]))

    # --- superset model --------------------------------------------------------
    y_pred = pls_model_ssetname.predict(X)
    rmse = np.sqrt(mean_squared_error(Y, y_pred))
    r2 = r2_score(Y, y_pred)
    X_e = X.copy()
    X_e -= pls_model_ssetname._x_mean
    X_e /= pls_model_ssetname._x_std
    y_pred_e = np.dot(X_e, pls_model_ssetname.coef_.T)
    y_pred_e += pls_model_ssetname._y_mean
    rmse_e = np.sqrt(mean_squared_error(Y, y_pred_e))
    for i, f in enumerate(features_names):
        print(" %50s %7.3f [%15.3f %15.3f]"%(f, \
            pls_model_ssetname.coef_.T[i],
            pls_model_ssetname._x_mean[i], 
            pls_model_ssetname._x_std[i]))

    print()
    print("RMSE         (ssetname) %7.3f from eq. %7.3f diff []"%(rmse, rmse_e))
    print("RMSE             (Full) %7.3f from eq. %7.3f diff []"%(rmse_full, rmse_full_e))  
    print("RMSE (bestinsidemethod) %7.3f"%models_results[setname].bestinsidemethod_rmse)
    print("RMSE    (bestourmethod) %7.3f"%models_results[setname].bestourmethod_rmse)
    print()
