In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

from dataclasses import dataclass
import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy
import pickle

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

from importlib import reload
from commonutils import ModelResults

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Silence every warning for the rest of the notebook session.
warnings.simplefilter("ignore")

# Number of entries of each record's "difs" list that are compared against
# the labels in the baseline-comparison cell.
howmanydifs = 3


def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is always closed."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file. Only load files written by this project's own preprocessing.
    with open(path, "rb") as handle:
        return pickle.load(handle)


allvalues_perset = _load_pickle("./data/allvalues_perset.p")
methods = _load_pickle("./data/methods.p")
fullsetnames = _load_pickle("./data/fullsetnames.p")
functionals = _load_pickle("./data/functionals.p")
basis_sets = _load_pickle("./data/basis_sets.p")
supersetnames = _load_pickle("./data/supersetnames.p")

In [None]:
reload(commonutils)

# Collect the name of every descriptor stored under a key containing
# "energydiff" in any record of any set.
allfeatures = set()
for setname in fullsetnames:
    for val in allvalues_perset[setname]:
        for k in val:
            if "energydiff" in k:
                allfeatures.update(val[k])

# set labels and sets lists
models_results = {}
for setname in fullsetnames:
    models_results[setname] = ModelResults()
    for val in allvalues_perset[setname]:
        models_results[setname].labels.append(val["label"])
        models_results[setname].supersetnames.append(val["super_setname"])
        models_results[setname].setnames.append(val["super_setname"]+"_"+val["setname"])

# wtmad2 is only evaluated for the supersets and for the "Full" set; the list
# does not change inside the loops, so it is built once.
fulllist = list(supersetnames.keys()) + ["Full"]

for setname in fullsetnames:
    setresults = models_results[setname]

    # Reference ("inside") methods: prediction = label + stored difference.
    for methodid in range(howmanydifs):
        y_pred = [val["label"] + val["difs"][methodid]
                  for val in allvalues_perset[setname]]

        if setname in fulllist:
            wtmadf = commonutils.wtmad2(setresults.setnames,
                                        setresults.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < setresults.bestinsidemethod_wtmad:
                setresults.bestinsidemethod_wtmad = wtmad
                setresults.bestinsidemethod_name_wtmad = str(methodid)
                setresults.y_pred_bestinsidemethod_wtmad = y_pred

        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` argument was deprecated in scikit-learn 1.4 and removed in
        # 1.6. For these 1-D targets both forms give the same value.
        rmse = np.sqrt(mean_squared_error(setresults.labels, y_pred))

        if rmse < setresults.bestinsidemethod_rmse:
            setresults.bestinsidemethod_rmse = rmse
            setresults.bestinsidemethod_name_rmse = str(methodid)
            setresults.y_pred_bestinsidemethod_rmse = y_pred

    # Our own methods: prediction = stored FINAL_SINGLE_POINT_ENERGY difference.
    for method in methods:
        y_pred = [val[method + "_energydiff"][method+"_FINAL_SINGLE_POINT_ENERGY"]
                  for val in allvalues_perset[setname]]

        if setname in fulllist:
            wtmadf = commonutils.wtmad2(setresults.setnames,
                                        setresults.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < setresults.bestourmethod_wtmad:
                setresults.bestourmethod_wtmad = wtmad
                setresults.bestourmethod_name_wtmad = method
                setresults.y_pred_bestourmethod_wtmad = y_pred

        rmse = np.sqrt(mean_squared_error(setresults.labels, y_pred))

        if rmse < setresults.bestourmethod_rmse:
            setresults.bestourmethod_rmse = rmse
            setresults.bestourmethod_name_rmse = method
            setresults.y_pred_bestourmethod_rmse = y_pred

# Count how many sets each of our methods wins, by RMSE and by wtmad2.
bestmnethodscount = {}
setofbestourmethodswtamd = {}
for setname in fullsetnames:
    name_rmse = models_results[setname].bestourmethod_name_rmse
    bestmnethodscount[name_rmse] = bestmnethodscount.get(name_rmse, 0) + 1

    # An empty name means no wtmad2 winner was recorded for this set.
    name_wtmad = models_results[setname].bestourmethod_name_wtmad
    if name_wtmad != "":
        setofbestourmethodswtamd[name_wtmad] = \
            setofbestourmethodswtamd.get(name_wtmad, 0) + 1

In [None]:

def mainfunc(mainmodelsID, allfeatures, CORRCUT,
             selected_functional, selected_basisset,
             functionals, basis_sets, fullsetnames,
             allvalues_perset, models_results, supersetnames):
    """Select features and fit PLS models for one reference functional/basis set.

    Steps, in order:

    1. For every set, build the descriptor table: the reference descriptors
       (``<selected_functional>-<selected_basisset>_energydiff``) plus, for
       every other functional/basis pair, the difference between the
       reference descriptor and the matching one (``*_difftoref``).
    2. Drop descriptors that are constant on the "Full" set and every
       ``Nuclear_Repulsion_difftoref`` descriptor.
    3. Rank the descriptors by permutation importance of a PLS model fitted
       on the "Full" set, then keep the most important one of each group
       whose absolute correlation exceeds ``CORRCUT``.
    4. Drop the kept descriptors whose variance inflation factor is above 160.
    5. For every superset and for "Full", choose the number of PLS components
       that minimises the wtmad2 values returned by ``models.pls_model`` and
       fit the final model.
    6. Print one comma-separated row per superset and one for "Full".

    Parameters
    ----------
    mainmodelsID : int
        Identifier printed in the first column of every row.
    allfeatures : object
        Not used; kept so existing callers do not break.
    CORRCUT : float
        Absolute-correlation threshold above which two descriptors are
        treated as redundant.
    selected_functional, selected_basisset : str
        Reference functional and basis set.
    functionals, basis_sets : iterable of str
        Functional/basis pairs used to build the ``*_difftoref`` descriptors.
    fullsetnames : iterable of str
        Names of every set to process; must include "Full".
    allvalues_perset : mapping
        ``setname -> list of records``; each record maps
        ``"<func>-<basis>_energydiff"`` to a ``{descriptor name: value}`` mapping.
    models_results : mapping
        ``setname -> result object``. Its ``features``,
        ``uncorrelated_features``, ``plsmodel`` and ``y_pred`` attributes are
        overwritten by this call.
    supersetnames : iterable of str
        Names of the supersets that get their own model.

    Returns
    -------
    str
        Quoted, comma-separated, alphabetically sorted names of the
        descriptors used by the superset models.

    Notes
    -----
    Calls ``sys.exit(1)`` when the sets disagree on the number or the names of
    the descriptors, or when a kept descriptor is also marked as redundant.
    """
    # Cut-off on the variance inflation factor of a kept descriptor.
    VIFCUT = 160
    rowfmt = "%4d , %30s , %5d , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f , %7.3f"

    def _rmse(y_true, y_hat):
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` argument was removed in scikit-learn 1.6. Identical for
        # the single-target data used here.
        return np.sqrt(mean_squared_error(y_true, y_hat))

    def _as_1d(pred):
        # PLSRegression.predict may return an (n, 1) array; keep column 0.
        if len(pred.shape) == 2:
            return pred[:, 0]
        return pred

    def _check_same_number_of_features():
        # Every set must expose as many descriptors as the "Full" set.
        numoffeat = len(models_results["Full"].features)
        for sname in fullsetnames:
            if len(models_results[sname].features) != numoffeat:
                print("Number of features for ", sname, " is different")
                sys.exit(1)

    # ------------------------------------------------------------------
    # 1. descriptor tables
    # ------------------------------------------------------------------
    refk = selected_functional + "-" + selected_basisset + "_energydiff"

    for setname in fullsetnames:
        desciptors = {}

        # reference descriptors, one value per record
        for val in allvalues_perset[setname]:
            for k2 in val[refk]:
                desciptors.setdefault(k2, []).append(val[refk][k2])

        # differences between the reference and every other functional/basis
        for val in allvalues_perset[setname]:
            for func in functionals:
                for basis in basis_sets:
                    if basis == selected_basisset and func == selected_functional:
                        continue
                    k = func + "-" + basis + "_energydiff"
                    for k2 in val[k]:
                        # name of the same quantity in the reference calculation
                        refk2 = k2.replace(basis, selected_basisset)
                        refk2 = refk2.replace(func, selected_functional)
                        desciptors.setdefault(k2 + "_difftoref", []).append(
                            val[refk][refk2] - val[k][k2])

        models_results[setname].features = desciptors

    # ------------------------------------------------------------------
    # 2. features selection: constant and forced removals
    # ------------------------------------------------------------------
    _check_same_number_of_features()

    # descriptors that never change on the "Full" set carry no information
    toremove = [k for k, values in models_results["Full"].features.items()
                if len(set(values)) == 1]
    for setname in fullsetnames:
        for k in toremove:
            del models_results[setname].features[k]

    # force removing features Nuclear Repulsion difference
    for setname in fullsetnames:
        toremove = [k for k in models_results[setname].features
                    if "Nuclear_Repulsion_difftoref" in k]
        for k in toremove:
            del models_results[setname].features[k]

    _check_same_number_of_features()

    # ------------------------------------------------------------------
    # 3. importance ranking on the "Full" set
    # ------------------------------------------------------------------
    X, Y, features_names = \
        commonutils.build_XY_matrix(models_results["Full"].features,
                                    models_results["Full"].labels)
    setlist = models_results["Full"].setnames
    supersetlist = models_results["Full"].supersetnames
    maxcomp = X.shape[1]
    # NOTE(review): the upper bound of maxcomp-8 is an unexplained constant
    # and presumably requires more than 8 descriptors -- confirm.
    ncomps, rmses, r2s, wtmads, loormses = \
        models.pls_model(X, Y, supersetlist, setlist,
                         ncomp_start=1, ncomp_max=maxcomp-8, split=False,
                         plot=False, loo=False)
    # index 0 corresponds to ncomp_start = 1, hence the +1
    r2max_comps = np.argmax(r2s)+1
    rmsemin_comps = np.argmin(rmses)+1
    wtmadmin_comps = np.argmin(wtmads)+1
    compstouse = min(r2max_comps, rmsemin_comps, wtmadmin_comps)

    # perform features importance analysis
    X, Y, features_names = \
        commonutils.build_XY_matrix(models_results["Full"].features,
                                    models_results["Full"].labels)
    plsmodel = PLSRegression(n_components=compstouse)
    plsmodel.fit(X, Y)
    result = permutation_importance(plsmodel, X, Y, n_repeats=10,
                                    random_state=42, n_jobs=2)
    pfi_sorted_idx = result.importances_mean.argsort()
    # most important first
    most_importante_features = [features_names[i]
                                for i in reversed(pfi_sorted_idx)]

    # ------------------------------------------------------------------
    # 4. correlation filter
    # ------------------------------------------------------------------
    touse = set()
    # add by default the selected FINAL_SINGLE_POINT_ENERGY
    touse.add(selected_functional + "-" +
              selected_basisset + "_" +
              "FINAL_SINGLE_POINT_ENERGY")
    toremove = set()
    df = pd.DataFrame(models_results["Full"].features)
    corr = df.corr().abs()
    for feat1 in most_importante_features:
        if feat1 in toremove:
            continue
        touse.add(feat1)
        # everything too correlated with a kept descriptor is redundant
        correlated = corr.index[corr[feat1] > CORRCUT]
        toremove.update(feat2 for feat2 in correlated if feat2 != feat1)

    z = touse.intersection(toremove)
    if len(z) != 0:
        print("Error in removing correlated features")
        print(z)
        sys.exit(1)

    # Rebuild the selection from scratch on every call: this function is
    # called repeatedly with the same models_results, and descriptors kept by
    # an earlier call must not leak into the models of this one.
    # Sorted so the column order does not depend on string hashing.
    for setname in fullsetnames:
        models_results[setname].uncorrelated_features = {
            k: deepcopy(models_results[setname].features[k])
            for k in sorted(touse)}

    # ------------------------------------------------------------------
    # 5. VIF filter
    # ------------------------------------------------------------------
    df = pd.DataFrame(models_results["Full"].uncorrelated_features)
    # scale data before computing VIF
    df = df.apply(lambda x: (x - np.mean(x)) / np.std(x))
    vifvalues = [variance_inflation_factor(df.values, i)
                 for i in range(df.shape[1])]
    for featname, vifvalue in zip(df.columns, vifvalues):
        if vifvalue > VIFCUT:
            for setname in fullsetnames:
                if featname in models_results[setname].uncorrelated_features:
                    del models_results[setname].uncorrelated_features[featname]

    # ------------------------------------------------------------------
    # 6. number of components and final model per set
    # ------------------------------------------------------------------
    modelsets = list(supersetnames)+["Full"]

    comptuseperset = {}
    for setname in modelsets:
        X, Y, features_names = \
            commonutils.build_XY_matrix(models_results[setname].uncorrelated_features,
                                        models_results[setname].labels)
        setlist = models_results[setname].setnames
        supersetlist = models_results[setname].supersetnames
        maxcomp = X.shape[1]
        ncomps, rmses, r2s, wtmads, loormses = \
            models.pls_model(X, Y, supersetlist, setlist,
                             ncomp_start=1, ncomp_max=maxcomp, split=False,
                             plot=False)
        # the component count with the lowest wtmad2 wins
        comptuseperset[setname] = np.argmin(wtmads)+1

    for setname in modelsets:
        X, Y, features_names = \
            commonutils.build_XY_matrix(models_results[setname].uncorrelated_features,
                                        models_results[setname].labels)
        models_results[setname].plsmodel = \
            PLSRegression(n_components=comptuseperset[setname])
        models_results[setname].plsmodel.fit(X, Y)
        models_results[setname].y_pred = \
            models_results[setname].plsmodel.predict(X)

    # ------------------------------------------------------------------
    # 7. report
    # ------------------------------------------------------------------
    pls_model_full = models_results["Full"].plsmodel
    ypredFull = []
    YFull = []
    setnamesFull = []

    setoffeatures = set()

    for ssetname in supersetnames:
        # every superset must use exactly the same descriptors
        sublistset = set()
        for f in models_results[ssetname].uncorrelated_features:
            setoffeatures.add(f)
            sublistset.add(f)
        if sublistset != setoffeatures:
            print("Error in set of features")
            sys.exit(1)

        pls_model_ssetname = models_results[ssetname].plsmodel
        X, Y, features_names = \
            commonutils.build_XY_matrix(models_results[ssetname].uncorrelated_features,
                                        models_results[ssetname].labels)
        setlist = models_results[ssetname].setnames
        setnamesFull.extend(setlist)
        YFull.extend(list(Y))

        # model trained on this superset only
        y_pred = _as_1d(pls_model_ssetname.predict(X))
        ypredFull.extend(list(y_pred))
        rmse = _rmse(Y, y_pred)
        wtmad2df = commonutils.wtmad2(setlist, Y, y_pred)
        wtmad2 = wtmad2df[ssetname]

        # model trained on the "Full" set, evaluated on this superset
        y_pred_full = _as_1d(pls_model_full.predict(X))
        rmse_full = _rmse(Y, y_pred_full)
        wtmad2_fulldf = commonutils.wtmad2(setlist, Y, y_pred_full)
        wtmad2_full = wtmad2_fulldf[ssetname]

        print(rowfmt %
              (mainmodelsID, ssetname, len(Y),
               wtmad2, wtmad2_full,
               models_results[ssetname].bestinsidemethod_wtmad,
               models_results[ssetname].bestourmethod_wtmad,
               rmse, rmse_full,
               models_results[ssetname].bestinsidemethod_rmse,
               models_results[ssetname].bestourmethod_rmse))

    ssetname = "Full"
    X, Y, features_names = \
        commonutils.build_XY_matrix(models_results["Full"].uncorrelated_features,
                                    models_results["Full"].labels)
    setlist = models_results["Full"].setnames
    # flattened like every other prediction before it is handed to wtmad2
    y_pred_full = _as_1d(pls_model_full.predict(X))

    wtmad2df_full = commonutils.wtmad2(setlist, Y, y_pred_full)
    wtmad2_full = wtmad2df_full["Full"]
    rmse_full = _rmse(Y, y_pred_full)

    # the superset models together, scored on the concatenated supersets
    wtmad2df = commonutils.wtmad2(setnamesFull, YFull, ypredFull)
    wtmad2 = wtmad2df["Full"]
    rmse = _rmse(YFull, ypredFull)
    print(rowfmt %
          (mainmodelsID, ssetname, len(Y),
           wtmad2, wtmad2_full,
           models_results[ssetname].bestinsidemethod_wtmad,
           models_results[ssetname].bestourmethod_wtmad,
           rmse, rmse_full,
           models_results[ssetname].bestinsidemethod_rmse,
           models_results[ssetname].bestourmethod_rmse))

    # Same "'a', 'b'" layout as str(set)[1:-1], but sorted and empty-safe.
    return ", ".join(repr(f) for f in sorted(setoffeatures))

In [None]:
#functionals = ["PBE", "PBE0", "TPSS", "TPSSh"]
#basis_sets = ['MINIX', 'SVP', 'TZVP']

# Absolute-correlation threshold handed to mainfunc.
CORRCUT = 0.95

# One record per model: [id, reference functional, reference basis set,
# functionals used, basis sets used, features used].
modelsdata = []

basis_sets_l = [['MINIX'],
                ['SVP'],
                ['TZVP'],
                ['MINIX', 'SVP'],
                ['MINIX', 'TZVP'],
                ['SVP', 'TZVP'],
                ['MINIX', 'SVP', 'TZVP']]
functionals_l = [["PBE0"],
                 ["PBE"],
                 ["TPSS"],
                 ["TPSSh"],
                 ["PBE", "TPSS", "TPSSh"],
                 ["PBE0", "TPSS", "TPSSh"]]
selected_basisset_l = ["TZVP", 'SVP', 'MINIX']
selected_functional_l = ["PBE0", "PBE", "TPSS", "TPSSh"]

mainmodelsID = 1
print("ModelID, Setname , Dim , wtmad2 , wtmad2_full , "+ \
          "bestinsidemethod_wtmad2 , bestourmethod_wtmad2 ,"+ \
            "rmse , rmse_full , bestinsidemethod_rmse , bestourmethod_rmse")

# Full scan: every reference functional/basis set combined with every
# functional subset and basis-set subset. The loop variables get their own
# names so the notebook-level `functionals` and `basis_sets` loaded from
# pickle are not overwritten.
for selected_functional in selected_functional_l:
    for selected_basisset in selected_basisset_l:
        for basis_sets_subset in basis_sets_l:
            for functionals_subset in functionals_l:

                usedfeats = mainfunc(mainmodelsID, allfeatures, CORRCUT,
                                     selected_functional, selected_basisset,
                                     functionals_subset, basis_sets_subset,
                                     fullsetnames, allvalues_perset,
                                     models_results, supersetnames)

                modelsdata.append([mainmodelsID,
                                   selected_functional,
                                   selected_basisset,
                                   functionals_subset,
                                   basis_sets_subset,
                                   usedfeats])

                mainmodelsID += 1

In [None]:
# Dump every model record collected by the scan, one record per line.
for model_record in modelsdata:
    print(model_record)