In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models

from commonutils import ModelResults

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score

import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy

In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Superset name -> names of the benchmark subsets that belong to it.
supersetnames = {
    "BARRIER_HEIGHTS": [
        "BH76", "BHDIV10", "BHPERI",
        "BHROT27", "INV24", "PX13", "WCPT18",
    ],
    "INTRAMOLECULAR_INTERACTIONS": [
        "ACONF", 'Amino20x4', "BUT14DIOL",
        "ICONF", "IDISP", "MCONF",
        "PCONF21", "SCONF", "UPU23",
    ],
    "SMALL_MOLECULES": [
        "AL2X6", "ALK8", "ALKBDE10", "BH76RC",
        "DC13", "DIPCS10", "FH51", "G21EA",
        "G21IP", "G2RC", "HEAVYSB11", "NBPRC",
        "PA26", "RC21", "SIE4x4", "TAUT15",
        "W4-11", "YBDE18",
    ],
    "INTERMOLECULAR_INTERACTIONS": [
        "ADIM6", "AHB21", "CARBHB12",
        "CHB6", "HAL59", "HEAVY28", "IL16",
        "PNICO23", "RG18", "S22", "S66", "WATER27",
    ],
    "LARGE_SYSTEMS": [
        "BSR36", "C60ISO", "CDIE20", "DARC",
        "ISO34", "ISOL24", "MB16-43", "PArel",
        "RSE43",
    ],
}

# Number of reference-method entries considered per record.
howmanydifs = 3

# Energy-term labels requested for every method.  The exact spacing inside the
# strings is significant and must not be normalised.
# NOTE(review): presumably these are matched against the text of the
# quantum-chemistry output files -- confirm in commonutils.read_and_init.
_ENERGY_TERM_LABELS = (
    "Nuclear Repulsion  :",
    "One Electron Energy:",
    "Two Electron Energy:",
    "Potential Energy   :",
    "Kinetic Energy     :",
    "E(X)               :",
    "E(C)               :",
    "Dispersion correction",
    "FINAL SINGLE POINT ENERGY",
)

# "<functional>-<basis set>" -> list of energy-term labels.  All combinations
# share the same labels, so the mapping is generated instead of being written
# out twelve times.  Each key gets its own list object (as in the hand-written
# version) so that mutating one entry cannot affect the others.
methods = {
    functional + "-" + basis: list(_ENERGY_TERM_LABELS)
    for functional in ("PBE", "PBE0", "TPSS", "TPSSh")
    for basis in ("TZVP", "SVP", "MINIX")
}

# Load the whole data set from disk and create the per-set containers used by
# every following cell (values per set, list of set names, result holders).
rootdir = "../datasets/AllData_3/"
allvalues_perset, fullsetnames, models_results = commonutils.read_and_init(
    rootdir,
    supersetnames,
    howmanydifs,
    methods,
    DEBUG=True,
)

In [None]:
# for debug purposes
#for val in allvalues_perset:
#    print("======= START =======")
#    print(val, len(allvalues_perset[val]))
#    pp.pprint(allvalues_perset[val])
#    print("=======  END  =======")

In [None]:
# Settings shared by the WTMAD-2 calls further down.
includeFull = True
metricsets = [
    "SMALL_MOLECULES",
    "LARGE_SYSTEMS",
    "BARRIER_HEIGHTS",
    "INTERMOLECULAR_INTERACTIONS",
    "INTRAMOLECULAR_INTERACTIONS",
]

# Union of all descriptor names found under any "*energydiff*" entry of any
# record in any set.
allfeatures = set()
for setname in fullsetnames:
    for val in allvalues_perset[setname]:
        for entry in val:
            if "energydiff" in entry:
                allfeatures.update(val[entry])

def _score_predictions(records, predict):
    """Score one prediction rule against the reference labels of *records*.

    Parameters
    ----------
    records : iterable of the per-entry mappings stored in allvalues_perset
    predict : callable taking one record and returning its predicted value

    Returns
    -------
    (y_pred, wtmad_full, rmse, r2) where wtmad_full is the "Full" entry of
    commonutils.wtmad_calc.
    """
    y_pred = [predict(val) for val in records]
    labels = [val["label"] for val in records]
    supersetlist = [val["super_setname"] for val in records]
    setlist = [val["super_setname"] + "_" + val["setname"] for val in records]

    wtmad = commonutils.wtmad_calc(supersetlist, setlist, y_pred,
                                   labels, includeFull=True)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(labels, y_pred))
    r2 = r2_score(labels, y_pred)
    return y_pred, wtmad["Full"], rmse, r2


# For every set, score the reference ("inside") methods and the methods listed
# in `methods`, remembering the lowest-RMSE candidate of each family and the
# lowest WTMAD seen in each family.
for setname in fullsetnames:
    result = models_results[setname]
    records = allvalues_perset[setname]

    result.inside_methods_rmse = []
    result.inside_methods_r2 = []
    result.our_methods_rmse = {}
    result.our_methods_r2 = {}

    result.bestinsidemethod_rmse = float("inf")
    result.bestinsidemethod = ""
    result.bestourmethod_rmse = float("inf")
    result.bestourmethod = ""
    result.our_methods_name = []

    # NOTE: the best WTMAD is tracked independently of the best RMSE, so it
    # may belong to a different method than bestinsidemethod/bestourmethod.
    result.bestinsidemethod_wtamd = float("inf")
    result.bestourmethod_wtamd = float("inf")

    # Reference methods: the prediction is the label plus the stored
    # difference for that method.
    for methodid in range(howmanydifs):
        y_pred, wtmad_full, rmse, r2 = _score_predictions(
            records,
            lambda val, m=methodid: val["label"] + val["difs"][m])

        if wtmad_full < result.bestinsidemethod_wtamd:
            result.bestinsidemethod_wtamd = wtmad_full

        result.inside_methods_rmse.append(rmse)
        result.inside_methods_r2.append(r2)

        if rmse < result.bestinsidemethod_rmse:
            result.bestinsidemethod_rmse = rmse
            result.bestinsidemethod = str(methodid)
            result.y_pred_bestinsidemethod = y_pred

    # Our methods: the prediction is the stored final single point energy
    # difference of that method.
    for method in methods:
        y_pred, wtmad_full, rmse, r2 = _score_predictions(
            records,
            lambda val, m=method:
                val[m + "_energydiff"][m + "_FINAL_SINGLE_POINT_ENERGY"])

        if wtmad_full < result.bestourmethod_wtamd:
            result.bestourmethod_wtamd = wtmad_full

        result.our_methods_rmse[method] = rmse
        result.our_methods_r2[method] = r2
        result.our_methods_name.append(method)

        if rmse < result.bestourmethod_rmse:
            result.bestourmethod_rmse = rmse
            result.bestourmethod = method
            result.y_pred_bestourmethod = y_pred

# ---------------------------------------------------------------------------
# WTMAD-2 of the reference ("inside") methods.
wtmad2_ref = commonutils.wtmad_ref(
    fullsetnames, metricsets, howmanydifs, allvalues_perset, includeFull
)
print("WTMAD-2 for inside methods", "\n")
print(wtmad2_ref, '\n')

# WTMAD-2 of the methods listed in `methods`.
wtmad2 = commonutils.wtmad(
    fullsetnames, metricsets, methods, allvalues_perset, includeFull
)
print("WTMAD-2 for our methods", "\n")
print(wtmad2, "\n")

# ---------------------------------------------------------------------------
# One summary line per set: best reference method and best of our methods,
# each with its RMSE.
sep = " , "
print("Results for inside and our methods")
header_fields = (
    "%40s" % "Dataset", sep,
    "Best inside method", sep,
    "RMSE", sep,
    "Best our method", sep,
    "RMSE",
)
print(*header_fields)
for setname in fullsetnames:
    result = models_results[setname]
    row_fields = (
        "%40s" % setname, sep,
        result.bestinsidemethod, sep,
        "%7.3f" % result.bestinsidemethod_rmse, sep,
        "%5s" % result.bestourmethod, sep,
        "%7.3f" % result.bestourmethod_rmse,
    )
    print(*row_fields)


In [None]:
# Disabled manual check of
# commonutils.wtmad_calc(supersetlist, setlist, predicted, labels, includeFull=True).
# The code is kept inside the string literal below so that it does not run.
"""
import importlib
importlib.reload(commonutils)

supersetlist = []
setlist = []
predicted = []
labels = []
for setname in allvalues_perset:
    if setname not in list(supersetnames.keys()) + ["Full"]:
        for val in allvalues_perset[setname]:
            predicted.append(val["PBE-MINIX_energydiff"]["PBE-MINIX_FINAL_SINGLE_POINT_ENERGY"])
            labels.append(val["label"])
            setlist.append(setname)
            pos = setname.rfind("_")
            supersetlist.append(setname[:pos])

wtmad = commonutils.wtmad_calc(supersetlist, setlist, predicted, \
                            labels, includeFull = True)
print("WTMAD for PBE-MINIX method", "\n")
pp.pprint(wtmad)

for supersetname in supersetnames:
    supersetlist = []
    setlist = []
    predicted = []
    labels = []

    for setname in allvalues_perset:
        if setname.startswith(supersetname):
            if setname not in list(supersetnames.keys()) + ["Full"]:
                for val in allvalues_perset[setname]:
                    predicted.append(val["PBE-MINIX_energydiff"]["PBE-MINIX_FINAL_SINGLE_POINT_ENERGY"])
                    labels.append(val["label"])
                    setlist.append(setname)
                    supersetlist.append(supersetname)
        
    wtmad = commonutils.wtmad_calc(supersetlist, setlist, predicted, \
                                   labels, includeFull = True)
    print("WTMAD for PBE-MINIX method in ", supersetname, "\n")
    pp.pprint(wtmad)
"""

In [None]:
# Absolute-correlation threshold above which two descriptors are reported as
# (nearly) redundant.
CORRCUT = 0.95

# Collect, for the "Full" set, one merged descriptor dict per record together
# with its label, subset name and superset name.
setname = "Full"
full_results = models_results[setname]
full_results.fulldescriptors = []
full_results.labels = []
full_results.setname = []
full_results.supersetname = []
for val in allvalues_perset[setname]:
    merged = {}
    for method in methods:
        merged.update(val[method + "_energydiff"])
    full_results.fulldescriptors.append(merged)

    full_results.labels.append(val["label"])
    full_results.setname.append(val["setname"])
    full_results.supersetname.append(val["super_setname"])

X, Y, features_names = commonutils.build_XY_matrix(
    full_results.fulldescriptors, full_results.labels
)

df = pd.DataFrame(X, columns=features_names)

# For each descriptor, list (and print) every other descriptor whose absolute
# correlation with it exceeds CORRCUT.
corr = df.corr().abs()
top_correlation = {}
for k in corr.columns:
    print(k, " ")
    partners = []
    for other, v in zip(corr.index, corr[k]):
        if other != k and v > CORRCUT:
            partners.append((other, v))
            print(" %40s %4.2f" % (other, v))
    top_correlation[k] = partners

In [None]:
# Greedy selection of descriptors to drop: walk the descriptors in order and,
# for each one that has not itself been marked yet, mark all of its highly
# correlated partners for removal.
featurestorms = set()
for tc, partners in top_correlation.items():
    if tc in featurestorms:
        continue
    featurestorms.update(name for name, _ in partners)

print ("Features that are correlated with others and to remove")
for position, k in enumerate(featurestorms, start=1):
    print(position, " ", k)

In [None]:
# Descriptors that survive the correlation filter.
# `allfeatures` is a set of strings, whose iteration order changes from one
# interpreter run to the next; sorting makes both the printed list and
# `listoffeatures_used` reproducible.
# NOTE(review): this order is alphabetical and is not guaranteed to match the
# column order of the matrices returned by commonutils.build_XY_matrix.
print ("Features TO USE")
listoffeatures_used = sorted(f for f in allfeatures if f not in featurestorms)
for idx, f in enumerate(listoffeatures_used, start=1):
    print(idx, f)

In [None]:
featurestorm = list(featurestorms)

# Strip the redundant descriptors from the records of every set.  A fresh copy
# of the list is handed over on each call.
for setname in fullsetnames:
    commonutils.remove_features_fromset(
        allvalues_perset[setname], list(featurestorm), methods
    )

# Rebuild, for every set, the merged descriptor dict of each record plus the
# parallel lists of labels, subset names and superset names.
for setname in fullsetnames:
    result = models_results[setname]
    result.fulldescriptors = []
    result.labels = []
    result.setname = []
    result.supersetname = []
    for val in allvalues_perset[setname]:
        merged = {}
        for key in (method + "_energydiff" for method in methods):
            # a record may lack the entry of some method
            if key in val:
                merged.update(val[key])
        result.fulldescriptors.append(merged)

        result.labels.append(val["label"])
        result.setname.append(val["setname"])
        result.supersetname.append(val["super_setname"])

# Refresh the design matrix / data frame of the "Full" set after the removal.
setname = "Full"
X, Y, features_names = commonutils.build_XY_matrix(
    models_results[setname].fulldescriptors, models_results[setname].labels
)

df = pd.DataFrame(X, columns=features_names)

In [None]:
# Check: WTMAD of the lowest-RMSE method of `methods`, evaluated separately
# inside each superset.
for setname in models_results:
    if setname not in supersetnames:
        continue

    result = models_results[setname]
    predicted = result.y_pred_bestourmethod
    labels = result.labels
    supersetlist = result.supersetname
    # subset identifiers are "<superset>_<subset>"
    setlist = [ss + "_" + s for ss, s in zip(supersetlist, result.setname)]

    wtmad = commonutils.wtmad_calc(supersetlist, setlist, predicted, labels, includeFull = False)
    print("WTMAD for ", setname, " and model ", result.bestourmethod, "\n")
    pp.pprint(wtmad)
    print(setname)

In [None]:
import seaborn as sns
%matplotlib inline
print("Correlation matrix")
plt.rcParams['figure.figsize'] = 10,10
sns.set(font_scale=2)
sns.heatmap(df.corr().abs(), annot=True)
#print(df.corr().abs())
#sns.heatmap(df, annot=True)

In [None]:
import importlib
importlib.reload(models)

# Fit one PLS model per superset plus one for the "Full" set.  A first call
# scans the number of components from 1 up to the number of columns of X; the
# count with the lowest WTMAD is then used for the model that is stored.
perc_split = 0.2
for setname in list(supersetnames)+["Full"]:
    print("Running PLS for dataset: ", setname)
    result = models_results[setname]

    X, Y, features_names = commonutils.build_XY_matrix(
        result.fulldescriptors, result.labels
    )
    supersetlist = result.supersetname
    # subset identifiers are "<superset>_<subset>"
    setlist = [ss + "_" + s for ss, s in zip(supersetlist, result.setname)]

    maxcomp = X.shape[1]
    ncomps, rmses, r2s, wtmads = models.pls_model(
        perc_split, X, Y, supersetlist, setlist,
        ncomp_start = 1, ncomp_max = maxcomp, split = False
    )
    # the scan starts at one component, hence the +1 on the arg-index
    r2max_comps = np.argmax(r2s)+1
    rmsemin_comps = np.argmin(rmses)+1
    wtmadmin_comps = np.argmin(wtmads)+1
    print("Best number of components for R2: ", r2max_comps)
    print("Best number of components for RMSE: ", rmsemin_comps)
    print("Best number of components for WTMAD: ", wtmadmin_comps)
    compstouse = wtmadmin_comps

    fit = models.pls_model(
        perc_split, X, Y, supersetlist, setlist,
        False, compstouse, split = False
    )
    (rmse_train, rmse_test, r2_train,
     r2_test, rmse_full, r2_full,
     fitted_model,
     X_train, X_test, y_train, y_test) = fit
    result.plsmodel = fitted_model

    result.y_pred = result.plsmodel.predict(X)

In [None]:
def _load_xy(setname):
    """Return the (X, Y) matrices built from the stored descriptors of *setname*."""
    X, Y, _ = commonutils.build_XY_matrix(
        models_results[setname].fulldescriptors,
        models_results[setname].labels)
    return X, Y


def _pls_rmse(model, X, Y):
    """RMSE of *model* on (X, Y).

    Uses sqrt(MSE) rather than mean_squared_error(..., squared=False): the
    `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_error(Y, model.predict(X)))


def _print_rmse_row(setname, rmse_superset, rmse_fullmodel):
    """Print one table line: size, name, the two baseline RMSEs, the two PLS RMSEs."""
    result = models_results[setname]
    print("%4d , %40s"%(len(result.labels), setname), " , ", \
        "%7.3f"%result.bestinsidemethod_rmse, " , ", \
        "%7.3f"%result.bestourmethod_rmse, " , ", \
        "%7.3f"%rmse_superset, " , ", \
        "%7.3f"%rmse_fullmodel)


# Table comparing, for every set, the best reference method, the best of our
# methods, the PLS model fitted on the owning superset and the PLS model
# fitted on the "Full" set.
print(" Dim , %40s"% "Dataset", " , ", \
      "Best inside method RMSE", " , ", \
      "Best our method RMSE", " , ", \
      "RMSE (superset) ," + \
      "RMSE (Full)")
pls_model_full = models_results["Full"].plsmodel

# "Full" row: both PLS columns refer to the same model.
X, Y = _load_xy("Full")
rmse = _pls_rmse(pls_model_full, X, Y)
_print_rmse_row("Full", rmse, rmse)

for ssetname in supersetnames:
    pls_model_ssetname = models_results[ssetname].plsmodel

    # superset row
    X, Y = _load_xy(ssetname)
    _print_rmse_row(ssetname,
                    _pls_rmse(pls_model_ssetname, X, Y),
                    _pls_rmse(pls_model_full, X, Y))

    # one row per subset, scored with the models of its superset and of "Full"
    for isetname in supersetnames[ssetname]:
        setname = ssetname + "_" + isetname
        X, Y = _load_xy(setname)
        _print_rmse_row(setname,
                        _pls_rmse(pls_model_ssetname, X, Y),
                        _pls_rmse(pls_model_full, X, Y))


In [None]:
# Debug only: PLS fit on standardized X and Y.
# The code is kept inside the string literal below so that it does not run.
"""
X, Y, features_names = \
    commonutils.build_XY_matrix (models_results["Full"].fulldescriptors, \
                                    models_results["Full"].labels)
scalery = preprocessing.StandardScaler().fit(Y.reshape(-1, 1))   
Y_s = scalery.transform(Y.reshape(-1, 1))
scalerx = preprocessing.StandardScaler().fit(X)
X_s = scalerx.transform(X)
  
perc_split = 0.2
maxcomp = X_s.shape[1]

ncomps, rmses_test, rmses_train, r2s_test, r2s_train = \
              models.pls_model (perc_split, X_s, Y_s, \
              ncomp_start = 1, ncomp_max = maxcomp)
r2max_comps = np.argmax(r2s_test)+1
rmsemin_comps = np.argmin(rmses_test)+1
compstouse = min(rmsemin_comps, r2max_comps)
rmse_train, rmse_test, r2_train, \
    r2_test, rmse_full, r2_full , \
    plsmodel, \
    X_train, X_test, y_train, y_test  = \
        models.pls_model (perc_split, X_s, Y_s, False, compstouse)
       
y_pred_s = plsmodel.predict(X_s)
y_pred = scalery.inverse_transform(y_pred_s)

rmse = mean_squared_error(Y, y_pred, squared=False)
r2 = r2_score(Y, y_pred)

print("Scaling Y")
print("RMSE %7.3f"%rmse, " , ", "R2 %7.3f"%r2)  
"""

In [None]:
# Per data set: print WTMAD-2 / RMSE figures and draw a parity plot (true
# value vs. prediction) for the PLS model and for the best of our methods.
setname = "Full"
pls_model_full = models_results[setname].plsmodel
printonlysuperset = True
for setname in fullsetnames:
    is_aggregate = setname in supersetnames or setname == "Full"
    if printonlysuperset and not is_aggregate:
        # Nothing is printed or plotted for individual subsets in this mode,
        # so skip them before doing any prediction work.
        continue

    # Aggregates use the model fitted on themselves; a subset named
    # "<superset>_<subset>" uses the model of its superset.
    ssetname = setname if is_aggregate else setname[:setname.rfind("_")]

    result = models_results[setname]
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (result.fulldescriptors, result.labels)
    supersetlist = result.supersetname
    setlist = [ss + "_" + s for ss, s in zip(supersetlist, result.setname)]

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    y_pred_full = pls_model_full.predict(X)
    rmse_full = np.sqrt(mean_squared_error(Y, y_pred_full))

    y_pred = pls_model_ssetname.predict(X)
    rmse = np.sqrt(mean_squared_error(Y, y_pred))

    if is_aggregate:
        wtmad2 = commonutils.wtmad_calc(supersetlist, setlist, y_pred, \
                                      Y, includeFull = True)
        print("WTAMD2                    %7.3f"%wtmad2["Full"])
        print("WTAMD2 (bestinsidemethod) %7.3f"%result.bestinsidemethod_wtamd)
        print("WTAMD2    (bestourmethod) %7.3f"%result.bestourmethod_wtamd)

    print("RMSE           (ssetname) %7.3f"%rmse)
    print("RMSE               (Full) %7.3f"%rmse_full)
    print("RMSE   (bestinsidemethod) %7.3f"%result.bestinsidemethod_rmse)
    print("RMSE      (bestourmethod) %7.3f"%result.bestourmethod_rmse)

    # No plt.clf() here: called before plt.subplots() it only creates (or
    # clears) an unrelated, empty figure.
    fig, ax = plt.subplots()
    ax.scatter(Y, y_pred, c='b', s=50, label='PLS ssetname model')
    ax.scatter(Y, result.y_pred_bestourmethod, \
               c='g', s=50, label='Best our method')
    # common range of both axes, used for the y = x guide line
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),
        np.max([ax.get_xlim(), ax.get_ylim()]),
    ]
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predictions')
    ax.legend(loc="upper left")
    ax.set_title(setname)
    plt.show()
    # release the figure so that they do not pile up across iterations
    plt.close(fig)


In [None]:
def _dump_pls_equation(model, X, Y, names, header):
    """Print the explicit linear form of a fitted PLS model and score it.

    The prediction is recomputed by hand as
    ((X - x_mean) / x_std) . coef + y_mean and its RMSE against Y is returned,
    so that it can be compared with the RMSE of model.predict.

    Parameters
    ----------
    model  : fitted PLS model
    X, Y   : matrices as returned by commonutils.build_XY_matrix
    names  : column names of X, in column order
    header : format string with one float placeholder, filled with y_mean

    NOTE(review): _x_mean, _x_std and _y_mean are private scikit-learn
    attributes, and the layout/scaling of coef_ changed in scikit-learn 1.3;
    confirm that the hand-written formula matches the installed version.
    """
    X_e = (X - model._x_mean) / model._x_std
    y_pred_e = np.dot(X_e, model.coef_) + model._y_mean

    print(header % model._y_mean)
    for i, f in enumerate(names):
        print(" %30s %7.3f [%15.3f %15.3f]"%(f, \
            model.coef_[i],
            model._x_mean[i],
            model._x_std[i]))

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    return np.sqrt(mean_squared_error(Y, y_pred_e))


# Test and dump the PLS equations of the "Full" model and of the model of the
# owning superset, for every data set.
setname = "Full"
pls_model_full = models_results[setname].plsmodel

for setname in fullsetnames:
    print("Equations for dataset: ", setname)
    if setname in supersetnames or setname == "Full":
        ssetname = setname
    else:
        # subsets are named "<superset>_<subset>"
        ssetname = setname[:setname.rfind("_")]

    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].fulldescriptors, \
                                    models_results[setname].labels)

    # Coefficients are labelled with `features_names`, i.e. the column order
    # of X, rather than with a list whose order is unrelated to the columns.
    rmse_full = np.sqrt(mean_squared_error(Y, pls_model_full.predict(X)))
    rmse_full_e = _dump_pls_equation(
        pls_model_full, X, Y, features_names,
        "   Full dataset equations Y mean %7.3f")

    rmse = np.sqrt(mean_squared_error(Y, pls_model_ssetname.predict(X)))
    rmse_e = _dump_pls_equation(
        pls_model_ssetname, X, Y, features_names,
        "   Dataset equations Y mean %7.3f")

    print()
    # the bracket shows |RMSE(predict) - RMSE(equation)|; it should be ~0
    print("RMSE         (ssetname) %7.3f from eq. %7.3f diff [%9.2e]"%(
        rmse, rmse_e, abs(rmse - rmse_e)))
    print("RMSE             (Full) %7.3f from eq. %7.3f diff [%9.2e]"%(
        rmse_full, rmse_full_e, abs(rmse_full - rmse_full_e)))
    print("RMSE (bestinsidemethod) %7.3f"%models_results[setname].bestinsidemethod_rmse)
    print("RMSE    (bestourmethod) %7.3f"%models_results[setname].bestourmethod_rmse)
    print()
