In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

from dataclasses import dataclass
import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy
import pickle

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
warnings.simplefilter("ignore")

howmanydifs = 3
allvalues_perset = pickle.load(open("./data/allvalues_perset.p", "rb"))
methods = pickle.load(open("./data/methods.p", "rb"))
fullsetnames = pickle.load(open("./data/fullsetnames.p", "rb"))
functionals = pickle.load(open("./data/functionals.p", "rb"))
basis_sets = pickle.load(open("./data/basis_sets.p", "rb"))
supersetnames = pickle.load(open("./data/supersetnames.p", "rb"))

In [3]:
from importlib import reload
reload(commonutils)

from commonutils import ModelResults

allfeatures = set()
for setname in fullsetnames:
    for val in allvalues_perset[setname]:
        for k in val:
            if k.find("energydiff") != -1:
                for f in val[k]:
                    allfeatures.add(f)

# set labels and sets iists
models_results = {}
for setname in fullsetnames:
    models_results[setname] = ModelResults()
    for val in allvalues_perset[setname]:
        models_results[setname].labels.append(val["label"]) 
        models_results[setname].supersetnames.append(val["super_setname"])
        models_results[setname].setnames.append(val["super_setname"]+"_"+val["setname"])

for setname in fullsetnames:
    for methodid in range(howmanydifs):
        y_pred = []
        for val in allvalues_perset[setname]:
            y_pred.append(val["label"] + val["difs"][methodid])

        wtmad = None
        fulllist = list(supersetnames.keys()) + ["Full"]
        if setname in fulllist:
            wtmadf = commonutils.wtmad2(models_results[setname].setnames, \
                                    models_results[setname].labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < models_results[setname].bestinsidemethod_wtmad:
                models_results[setname].bestinsidemethod_wtmad = wtmad
                models_results[setname].bestinsidemethod_name_wtmad = str(methodid)
                models_results[setname].y_pred_bestinsidemethod_wtmad = y_pred

        rmse = mean_squared_error(models_results[setname].labels, \
                                y_pred, squared=False)

        if rmse < models_results[setname].bestinsidemethod_rmse:
            models_results[setname].bestinsidemethod_rmse = rmse
            models_results[setname].bestinsidemethod_name_rmse = str(methodid)
            models_results[setname].y_pred_bestinsidemethod_rmse = y_pred

    for j, method in enumerate(methods):
        y_pred = []
        for val in allvalues_perset[setname]:
            y_pred.append(val[method + "_energydiff"][method+"_FINAL_SINGLE_POINT_ENERGY"])

        wtmad = None            
        fulllist = list(supersetnames.keys()) + ["Full"]
        if setname in fulllist:
            wtmadf = commonutils.wtmad2(models_results[setname].setnames, \
                                models_results[setname].labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < models_results[setname].bestourmethod_wtmad:
                models_results[setname].bestourmethod_wtmad = wtmad
                models_results[setname].bestourmethod_name_wtmad = method
                models_results[setname].y_pred_bestourmethod_wtmad = y_pred
        
        rmse = mean_squared_error(models_results[setname].labels,\
                                y_pred, squared=False)

        if rmse < models_results[setname].bestourmethod_rmse:
            models_results[setname].bestourmethod_rmse = rmse
            models_results[setname].bestourmethod_name_rmse = method
            models_results[setname].y_pred_bestourmethod_rmse = y_pred

bestmnethodscount = {}
setofbestourmethodswtamd = {}
for setname in fullsetnames:
    if models_results[setname].bestourmethod_name_rmse in bestmnethodscount:
        bestmnethodscount[models_results[setname].bestourmethod_name_rmse] += 1
    else:
        bestmnethodscount[models_results[setname].bestourmethod_name_rmse] = 1

    if models_results[setname].bestourmethod_name_wtmad != "":
        if models_results[setname].bestourmethod_name_wtmad in setofbestourmethodswtamd:
            setofbestourmethodswtamd[models_results[setname].bestourmethod_name_wtmad] += 1
        else:
            setofbestourmethodswtamd[models_results[setname].bestourmethod_name_wtmad] = 1

In [4]:
#build descriptors 
selected_functional = "PBE0"
selected_basisset = "TZVP"
functionals = ["PBE", "PBE0", "TPSS", "TPSSh"]
basis_sets = ['MINIX', 'SVP', 'TZVP']
for setname in fullsetnames:
    desciptors = {}
    for val in allvalues_perset[setname]:
        for func in functionals:
            for basis in basis_sets:
                if basis == selected_basisset and func == selected_functional:
                    k = func + "-" + basis + "_energydiff"
                    for k2 in val[k]:
                        if k2 not in desciptors:
                            desciptors[k2] = [val[k][k2]]
                        else:
                            desciptors[k2].append(val[k][k2])
                else:
                    refk  = selected_functional + "-" + selected_basisset + "_energydiff"
                    k = func + "-" + basis + "_energydiff"
                    for k2 in val[k]:
                        refk2 = k2.replace(basis, selected_basisset)
                        refk2 = refk2.replace(func, selected_functional)
                        newk2 = k2 + "_difftoref"
                        if newk2 not in desciptors:
                            desciptors[newk2] = [val[refk][refk2] - val[k][k2]]
                        else:
                            desciptors[newk2].append(val[refk][refk2] - val[k][k2])
    models_results[setname].features = desciptors
    #print("Descriptors for ", setname)
    #for k in desciptors:
    #    print(k, len(desciptors[k]), desciptors[k])

# feastures selection
setname = "Full"
numoffeat = len(models_results[setname].features)
#print("Number of features for ", numoffeat)
for setname in fullsetnames:
    if len(models_results[setname].features) != numoffeat:
        print("Number of features for ", setname, " is different")
        sys.exit(1)

toremove = []
setname = "Full"
for k in models_results[setname].features:
    if len(set(models_results[setname].features[k])) == 1:
        toremove.append(k)
        #print("Constant fatures to remove: ", k)

# remove constant values
for setname in fullsetnames:
    #print("Removing constant features for ", setname)
    for k in toremove:
        #print("Constant fatures to remove: ", k)
        del models_results[setname].features[k]


# force removing features Nuclear Repulsion difference
#print("Removing Nuclear Repulsion difference")
for setname in fullsetnames: 
    toremove = []
    for k in models_results[setname].features:
        if k.find("Nuclear_Repulsion_difftoref") != -1:
            toremove.append(k)
    for k in toremove:
        #print("Removing feature ", k)
        del models_results[setname].features[k]

setname = "Full"
numoffeat = len(models_results[setname].features)
#print("Number of features for ", numoffeat)
for setname in fullsetnames:
    if len(models_results[setname].features) != numoffeat:
        print("Number of features for ", setname, " is different")
        sys.exit(1)

In [5]:
setname = "Full"
print("Running PLS for dataset: ", setname)

X, Y, features_names = \
    commonutils.build_XY_matrix (models_results[setname].features, \
              models_results[setname].labels)
setlist = models_results[setname].setnames  
supersetlist = models_results[setname].supersetnames
maxcomp = X.shape[1]
ncomps, rmses, r2s, wtmads, loormses = \
          models.pls_model (X, Y, supersetlist, setlist, \
          ncomp_start = 1, ncomp_max = maxcomp-8, split = False,\
          plot = False, loo=False)
r2max_comps = np.argmax(r2s)+1
rmsemin_comps = np.argmin(rmses)+1
wtmadmin_comps = np.argmin(wtmads)+1
compstouse = min(r2max_comps, rmsemin_comps, wtmadmin_comps)
print("   Selected ", compstouse, " components")

Running PLS for dataset:  Full
   Selected  28  components


In [6]:
# perform features importance analysis
setname = "Full"   
print("Running PLS for dataset: ", setname)
print("  Using ", compstouse, " components")
X, Y, features_names = \
      commonutils.build_XY_matrix (models_results[setname].features, \
              models_results[setname].labels)
setlist = []
for i, s in enumerate(models_results[setname].setnames):
    ss = models_results[setname].supersetnames[i]
    setlist.append(ss + "_" + s)

plsmodel = PLSRegression(n_components=compstouse)
plsmodel.fit(X, Y)
y_pred = plsmodel.predict(X) 
   
cv = LeaveOneOut()
model = PLSRegression(n_components=compstouse)
scores = cross_val_score(model, X, Y, \
            scoring='neg_mean_squared_error', \
            cv=cv, n_jobs=-1)
loormse = np.sqrt(np.mean(np.absolute(scores)))
rmse = mean_squared_error(Y, y_pred, squared=False)
r2 = r2_score(Y, y_pred)
if len(y_pred.shape) == 2:
    y_pred = y_pred[:,0]
wtmadf = commonutils.wtmad2(setlist, Y, y_pred)
wtmad = wtmadf["Full"]

Running PLS for dataset:  Full
  Using  28  components


In [7]:
most_importante_features = []
result = permutation_importance(plsmodel, X, Y, n_repeats=10, \
                                random_state=42, n_jobs=2)
pfi_sorted_idx = result.importances_mean.argsort()
#compute absolute values of the PLS coefficients
coef = np.abs(plsmodel.coef_).flatten()
#sort the coefficients
sorted_idx = np.argsort(coef)

# print the most important features
for i in reversed(pfi_sorted_idx):
    most_importante_features.append(features_names[i])

In [8]:
# remove corralted features 
CORRCUT = 0.95

setname = "Full"
touse = set()
# add by default the selected FINAL_SINGLE_POINT_ENERGY
touse.add(selected_functional + "-" + \
            selected_basisset + "_" + \
            "FINAL_SINGLE_POINT_ENERGY")
toremove = set()
df = pd.DataFrame(models_results[setname].features)
corr = df.corr().abs()
for feat1 in most_importante_features:
    if feat1 not in toremove:
        touse.add(feat1)
        for idx, v in enumerate(corr[feat1]):
            if v > CORRCUT:
                feat2 = corr.columns[idx]
                if feat2 != feat1:
                    toremove.add(feat2)

z = touse.intersection(toremove) 
if len(z) != 0:
    print("Error in removing correlated features")
    sys.exit(1) 

for setname in fullsetnames:
    for k in touse:
        models_results[setname].uncorrelated_features[k] = \
            deepcopy(models_results[setname].features[k])

In [9]:
#compute VIF
df = pd.DataFrame(models_results["Full"].uncorrelated_features)
vif = pd.DataFrame()
#scale data before computing VIF
df = df.apply(lambda x: (x - np.mean(x)) / np.std(x))
vif["features"] = df.columns
vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
# histogram of VIF
for v in vif.values:
    if v[1] > 160:
        print(v[0], v[1])
        for setname in fullsetnames:
            if v[0] in models_results[setname].uncorrelated_features:
                del models_results[setname].uncorrelated_features[v[0]]

TPSSh-SVP_E(X)_difftoref 1265.043643169283
TPSSh-SVP_FINAL_SINGLE_POINT_ENERGY_difftoref 1494.41602055861
PBE0-SVP_One_Electron_Energy_difftoref 176.08424813155955
PBE-SVP_FINAL_SINGLE_POINT_ENERGY_difftoref 2384.387962817485
TPSS-TZVP_FINAL_SINGLE_POINT_ENERGY_difftoref 25736.097281358096
PBE0-SVP_E(C)_difftoref 185.48033202671405
TPSSh-TZVP_E(C)_difftoref 298.47443475002046
PBE0-SVP_FINAL_SINGLE_POINT_ENERGY_difftoref 1372.3170682927
TPSSh-TZVP_FINAL_SINGLE_POINT_ENERGY_difftoref 65551.23739288974
TPSSh-TZVP_Kinetic_Energy_difftoref 215503.82837718667
TPSSh-TZVP_Potential_Energy_difftoref 194861.48846310092
TPSS-SVP_E(C)_difftoref 725.3564038197098
TPSSh-MINIX_E(C)_difftoref 281.85517204540946
PBE-SVP_E(X)_difftoref 1312.8939802365248
TPSSh-TZVP_E(X)_difftoref 451.0551846041091
PBE0-SVP_E(X)_difftoref 850.6146014679825
PBE0-MINIX_E(C)_difftoref 169.25599562818087
PBE0-TZVP_Dispersion_correction 1168.7725567357484
PBE-TZVP_FINAL_SINGLE_POINT_ENERGY_difftoref 4203.45420519307
PBE-TZVP_

In [10]:

comptuseperset = {}
for setname in list(supersetnames)+["Full"]:
    comptuseperset[setname] = 0

perc_split = 0.2
for setname in list(supersetnames)+["Full"]:
   print("Running PLS search for dataset: ", setname)

   X, Y, features_names = \
      commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
              models_results[setname].labels)
   setlist = models_results[setname].setnames
   supersetlist = models_results[setname].supersetnames
   maxcomp = X.shape[1]
   ncomps, rmses, r2s, wtmads, loormses = \
          models.pls_model (X, Y, supersetlist, setlist, \
          ncomp_start = 1, ncomp_max = maxcomp, split = False,\
          plot = False)
   r2max_comps = np.argmax(r2s)+1
   rmsemin_comps = np.argmin(rmses)+1
   wtmadmin_comps = np.argmin(wtmads)+1
   loormsemin_comps = np.argmin(loormses)+1

   compstouse = wtmadmin_comps
   comptuseperset[setname] = compstouse 

Running PLS search for dataset:  BARRIER_HEIGHTS
Running PLS search for dataset:  INTRAMOLECULAR_INTERACTIONS
Running PLS search for dataset:  SMALL_MOLECULES
Running PLS search for dataset:  INTERMOLECULAR_INTERACTIONS
Running PLS search for dataset:  LARGE_SYSTEMS
Running PLS search for dataset:  Full


In [11]:
for setname in list(supersetnames)+["Full"]:   
   print("Running PLS for dataset: ", setname)
   print("  Using ", comptuseperset[setname], " components")
   compstouse = comptuseperset[setname]
   X, Y, features_names = \
      commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
              models_results[setname].labels)
   setlist = models_results[setname].setnames
   models_results[setname].plsmodel = PLSRegression(n_components=compstouse)
   models_results[setname].plsmodel.fit(X, Y)
   models_results[setname].y_pred = \
      models_results[setname].plsmodel.predict(X) 
   
   cv = LeaveOneOut()
   model = PLSRegression(n_components=compstouse)
   scores = cross_val_score(model, X, Y, \
            scoring='neg_mean_squared_error', \
            cv=cv, n_jobs=-1)
   loormse = np.sqrt(np.mean(np.absolute(scores)))
   rmse = mean_squared_error(Y, models_results[setname].y_pred, squared=False)
   r2 = r2_score(Y, models_results[setname].y_pred)
   y_pred = models_results[setname].y_pred
   if len(y_pred.shape) == 2:
            y_pred = y_pred[:,0]
   wtmadf = commonutils.wtmad2(setlist, Y, y_pred)
   wtmad = wtmadf[setname]


Running PLS for dataset:  BARRIER_HEIGHTS
  Using  15  components
Running PLS for dataset:  INTRAMOLECULAR_INTERACTIONS
  Using  16  components
Running PLS for dataset:  SMALL_MOLECULES
  Using  17  components
Running PLS for dataset:  INTERMOLECULAR_INTERACTIONS
  Using  15  components
Running PLS for dataset:  LARGE_SYSTEMS
  Using  16  components
Running PLS for dataset:  Full
  Using  12  components


In [13]:


pls_model_full = models_results["Full"].plsmodel
ypredFull = []
setnamesFull = []
for ssetname in supersetnames:
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
                                    models_results[setname].labels)
    setlist = models_results[setname].setnames
    
    y_pred_full = pls_model_full.predict(X)
    if len(y_pred_full.shape) == 2:
        y_pred_full = y_pred_full[:,0]

    y_pred = pls_model_ssetname.predict(X)
    if len(y_pred.shape) == 2:
        y_pred = y_pred[:,0]
    
    ypredFull.extend(list(y_pred))
    setnamesFull.extend(setlist)

    rmse = mean_squared_error(Y, y_pred, squared=False)
    rmse_full = mean_squared_error(Y, y_pred_full, squared=False)

    wtmad2df = commonutils.wtmad2(setlist, Y, y_pred)
    wtmad2 = wtmad2df[setname]

    wtmad2_fulldf = commonutils.wtmad2(setlist, Y, y_pred_full)
    wtmad2_full = wtmad2_fulldf[setname]

    print("%30s , %5d , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f"%\
          (ssetname, len(Y), \
           wtmad2, wtmad2_full, \
           models_results[ssetname].bestinsidemethod_wtmad, \
           models_results[ssetname].bestourmethod_wtmad, \
           rmse, rmse_full, \
           models_results[ssetname].bestinsidemethod_rmse,
           models_results[ssetname].bestourmethod_rmse))
    
    print("Results for ", setname, " dim: ", len(Y))

    print("WTAMD2     (PLS ssetname) %7.3f"%wtmad2)
    print("WTAMD2         (PLS Full) %7.3f"%wtmad2_full)
    print("WTAMD2 (bestinsidemethod) %7.3f"%models_results[setname].bestinsidemethod_wtmad) 
    print("WTAMD2    (bestourmethod) %7.3f"%models_results[setname].bestourmethod_wtmad) 

    print("RMSE       (PLS ssetname) %7.3f"%rmse)
    print("RMSE           (PLS Full) %7.3f"%rmse_full,)
    print("RMSE   (bestinsidemethod) %7.3f"%models_results[setname].bestinsidemethod_rmse)
    print("RMSE      (bestourmethod) %7.3f"%models_results[setname].bestourmethod_rmse)
    

print("Results for Full sim ", len(ypredFull))
X, Y, features_names = \
        commonutils.build_XY_matrix (models_results["Full"].uncorrelated_features, \
                                    models_results["Full"].labels)
setlist = models_results["Full"].setnames
wtmad2df = commonutils.wtmad2(setnamesFull, Y, ypredFull)
wtmad2 = wtmad2df["Full"]
rmse = mean_squared_error(Y, ypredFull, squared=False)

y_pred_full = pls_model_full.predict(X)
if len(y_pred_full.shape) == 2:
    y_pred_full = y_pred_full[:,0]
rmse_full = mean_squared_error(Y, y_pred_full, squared=False)
wtmad2_fulldf = commonutils.wtmad2(setlist, Y, y_pred_full)
wtmad2_full = wtmad2_fulldf["Full"]

print("WTAMD2     (PLS ssetname) %7.3f"%wtmad2)
print("WTAMD2         (PLS Full) %7.3f"%wtmad2_full)
print("WTAMD2 (bestinsidemethod) %7.3f"%models_results["Full"].bestinsidemethod_wtmad)
print("WTAMD2    (bestourmethod) %7.3f"%models_results["Full"].bestourmethod_wtmad)
rmse = mean_squared_error(models_results["Full"].labels, ypredFull, squared=False)
print("RMSE       (PLS ssetname) %7.3f"%rmse)
print("RMSE           (PLS Full) %7.3f"%rmse_full)
print("RMSE   (bestinsidemethod) %7.3f"%models_results["Full"].bestinsidemethod_rmse)
print("RMSE      (bestourmethod) %7.3f"%models_results["Full"].bestourmethod_rmse)

Results for  Full  dim:  1505
WTAMD2     (PLS ssetname)  54.460
WTAMD2         (PLS Full)   8.990
WTAMD2 (bestinsidemethod)  10.160
WTAMD2    (bestourmethod)   8.160
RMSE       (PLS ssetname)  16.085
RMSE           (PLS Full)   4.531
RMSE   (bestinsidemethod)   9.227
RMSE      (bestourmethod)   6.219
Results for  Full  dim:  1505
WTAMD2     (PLS ssetname)  53.470
WTAMD2         (PLS Full)   8.990
WTAMD2 (bestinsidemethod)  10.160
WTAMD2    (bestourmethod)   8.160
RMSE       (PLS ssetname)  14.900
RMSE           (PLS Full)   4.531
RMSE   (bestinsidemethod)   9.227
RMSE      (bestourmethod)   6.219
Results for  Full  dim:  1505
WTAMD2     (PLS ssetname)  23.640
WTAMD2         (PLS Full)   8.990
WTAMD2 (bestinsidemethod)  10.160
WTAMD2    (bestourmethod)   8.160
RMSE       (PLS ssetname)   5.961
RMSE           (PLS Full)   4.531
RMSE   (bestinsidemethod)   9.227
RMSE      (bestourmethod)   6.219
Results for  Full  dim:  1505
WTAMD2     (PLS ssetname)   9.240
WTAMD2         (PLS Full)   8.

AttributeError: 'NoneType' object has no attribute 'values'

In [None]:
#test and dump PLS equations
setname = "Full"
pls_model_full = models_results[setname].plsmodel

for setname in fullsetnames:
    print("Equations for dataset: ", setname)
    ssetname = "Full"
    if setname in supersetnames or setname == "Full":
        ssetname = setname  
    else:    
        lastunder = setname.rfind("_")
        ssetname = setname[:lastunder]
    
    pls_model_ssetname = models_results[ssetname].plsmodel
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
                                    models_results[setname].labels)
    
    y_pred_full = pls_model_full.predict(X)
    rmse_full = mean_squared_error(Y, y_pred_full, squared=False)
    r2_full = r2_score(Y, y_pred_full)
    X_e = X.copy()
    X_e -= pls_model_full._x_mean
    X_e /= pls_model_full._x_std
    y_pred_full_e = np.dot(X_e, pls_model_full.coef_.T)
    y_pred_full_e += pls_model_full._y_mean
    rmse_full_e = mean_squared_error(Y, y_pred_full_e, squared=False)
    print("   Full dataset equations Y mean %7.3f"%pls_model_full._y_mean)
    for i, f in enumerate(features_names):
        print(" %50s %10.3f [%15.3f %15.3f]"%(f, \
            pls_model_full.coef_.T[i],
            pls_model_full._x_mean[i], 
            pls_model_full._x_std[i]))

    y_pred = pls_model_ssetname.predict(X)
    rmse = mean_squared_error(Y, y_pred, squared=False)
    r2 = r2_score(Y, y_pred)
    X_e = X.copy()
    X_e -= pls_model_ssetname._x_mean
    X_e /= pls_model_ssetname._x_std
    y_pred_e = np.dot(X_e, pls_model_ssetname.coef_.T)
    y_pred_e += pls_model_ssetname._y_mean
    rmse_e = mean_squared_error(Y, y_pred_e, squared=False)
    for i, f in enumerate(features_names):
        print(" %50s %10.3f [%15.3f %15.3f]"%(f, \
            pls_model_ssetname.coef_.T[i],
            pls_model_ssetname._x_mean[i], 
            pls_model_ssetname._x_std[i]))

    print()
    print("RMSE         (ssetname) %7.3f from eq. %7.3f diff []"%(rmse, rmse_e))
    print("RMSE             (Full) %7.3f from eq. %7.3f diff []"%(rmse_full, rmse_full_e))  
    print("RMSE (bestinsidemethod) %7.3f"%models_results[setname].bestinsidemethod_rmse)
    print("RMSE    (bestourmethod) %7.3f"%models_results[setname].bestourmethod_rmse)
    print()


In [None]:
# perform features importance analysis
for setname in list(supersetnames)+["Full"]:   

    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].uncorrelated_features, \
                                    models_results[setname].labels)
    model = models_results[setname].plsmodel
    result = permutation_importance(model, X, Y, n_repeats=10, \
                                random_state=42, n_jobs=2)
    pfi_sorted_idx = result.importances_mean.argsort()
    plt.clf()
    plt.rcParams['figure.figsize'] = 15,15
    fig, ax = plt.subplots()
    ax.boxplot(result.importances[pfi_sorted_idx].T, vert=False, \
               labels=np.array(features_names)[pfi_sorted_idx])
    ax.set_title("Permutation Importances " + setname)
    fig.tight_layout()
    plt.show()
    
    #compute absolute values of the PLS coefficients
    coef = np.abs(model.coef_).flatten()
    #sort the coefficients
    sorted_idx = np.argsort(coef)
    plt.clf()
    plt.rcParams['figure.figsize'] = 15,15
    fig, ax = plt.subplots()
    ax.barh(np.array(features_names)[sorted_idx], \
            coef[sorted_idx])
    ax.set_title("PLS coefficients " + setname)
    fig.tight_layout()
    plt.show()

    # scatter plot of the most important features
    plt.clf()
    plt.rcParams['figure.figsize'] = 15,15
    fis = [np.mean(result.importances[i].T) for i in pfi_sorted_idx]
    cfs = [coef[i] for i in sorted_idx]
    plt.plot(cfs, fis, '-o', color='black')
    plt.xlabel("PLS coefficients")
    plt.ylabel("Permutation importances")
    plt.title("Most important features " + setname)
    plt.show()
