In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import commonutils
import models

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance

from dataclasses import dataclass
import prettyprinter as pp

from sklearn.cross_decomposition import PLSRegression
import warnings
import sys

from sklearn import preprocessing

from copy import deepcopy
import pickle

In [2]:
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

# Number of entries of each record's "difs" list that are evaluated in the
# method-comparison loop.
howmanydifs = 3


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened through a context manager so that its handle is closed
    as soon as the object has been read.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


allvalues_perset = _load_pickle("./data/allvalues_perset.p")
methods = _load_pickle("./data/methods.p")
fullsetnames = _load_pickle("./data/fullsetnames.p")
functionals = _load_pickle("./data/functionals.p")
basis_sets = _load_pickle("./data/basis_sets.p")
supersetnames = _load_pickle("./data/supersetnames.p")

In [3]:
from importlib import reload
reload(commonutils)

from commonutils import ModelResults

# Collect the name of every feature found under a key containing "energydiff",
# over all records of all sets.
allfeatures = set()
for setname in fullsetnames:
    for record in allvalues_perset[setname]:
        for key, featdict in record.items():
            if "energydiff" in key:
                allfeatures.update(featdict)

# One ModelResults per set, filled with the labels, the superset names and the
# "<superset>_<set>" names of its records.
models_results = {}
for setname in fullsetnames:
    setresults = ModelResults()
    models_results[setname] = setresults
    for record in allvalues_perset[setname]:
        setresults.labels.append(record["label"])
        superset = record["super_setname"]
        setresults.supersetnames.append(superset)
        setresults.setnames.append(superset + "_" + record["setname"])

# Labels of the reference ("inside") methods; position i is used together with
# val["difs"][i].
insidemethods = ["W","D3(0)","D3(BJ)"]

# WTMAD2 is evaluated only for the supersets and for the "Full" set. The list
# does not depend on any loop variable, so it is built once.
fulllist = list(supersetnames.keys()) + ["Full"]

for setname in fullsetnames:
    setresults = models_results[setname]
    records = allvalues_perset[setname]
    use_wtmad = setname in fulllist

    # Reference methods: the prediction is the label plus the stored difference.
    for methodid in range(howmanydifs):
        y_pred = [val["label"] + val["difs"][methodid] for val in records]

        if use_wtmad:
            wtmadf = commonutils.wtmad2(setresults.setnames, \
                                    setresults.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < setresults.bestinsidemethod_wtmad:
                setresults.bestinsidemethod_wtmad = wtmad
                setresults.bestinsidemethod_wtmad_name = insidemethods[methodid]
                setresults.y_pred_bestinsidemethod_wtmad = y_pred

        # RMSE is computed as sqrt(MSE): the ``squared=False`` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6.
        rmse = np.sqrt(mean_squared_error(setresults.labels, y_pred))

        if rmse < setresults.bestinsidemethod_rmse:
            setresults.bestinsidemethod_rmse = rmse
            setresults.bestinsidemethod_rmse_name = insidemethods[methodid]
            setresults.y_pred_bestinsidemethod_rmse = y_pred

    # Our methods: the prediction is the stored FINAL_SINGLE_POINT_ENERGY
    # difference of each method.
    for method in methods:
        y_pred = [val[method + "_energydiff"][method+"_FINAL_SINGLE_POINT_ENERGY"]
                  for val in records]

        if use_wtmad:
            wtmadf = commonutils.wtmad2(setresults.setnames, \
                                setresults.labels, y_pred)
            wtmad = wtmadf[setname]

            if wtmad < setresults.bestourmethod_wtmad:
                setresults.bestourmethod_wtmad = wtmad
                setresults.bestourmethod_wtmad_name = method
                setresults.y_pred_bestourmethod_wtmad = y_pred

        rmse = np.sqrt(mean_squared_error(setresults.labels, y_pred))

        if rmse < setresults.bestourmethod_rmse:
            setresults.bestourmethod_rmse = rmse
            setresults.bestourmethod_rmse_name = method
            setresults.y_pred_bestourmethod_rmse = y_pred

In [None]:
# Filter the raw features and generate the equation-based ones.

# Basic quantities to keep, written as they appear in the raw data once the
# method prefix has been stripped.
basicfeattouse = [
    "Potential_Energy",
    "Kinetic_Energy",
    "FINAL_SINGLE_POINT_ENERGY",
    "Dispersion_correction",
    "E(C)",
    "E(X)",
    "Two_Electron_Energy",
    "Nuclear_Repulsion",
    "One_Electron_Energy",
]

# For every record keep only the basic quantities found under the
# "*energydiff*" keys, stored under a cleaned name ("-" -> "_", parentheses
# removed, so e.g. "...E(C)" becomes "...EC").
featuresvalues_perset = {}
for setname in fullsetnames:
    persetfeatures = []
    for record in allvalues_perset[setname]:
        selected = {}
        for key, rawfeatures in record.items():
            if "energydiff" not in key:
                continue
            # What is left of the key once "energydiff" is removed is the
            # prefix shared by all feature names of that key.
            prefix = key.replace("energydiff", "")
            for featname, featvalue in rawfeatures.items():
                if featname.replace(prefix, "") not in basicfeattouse:
                    continue
                cleanname = featname.replace("-", "_")
                cleanname = cleanname.replace("(", "").replace(")", "")
                selected[cleanname] = featvalue
        persetfeatures.append(selected)
    featuresvalues_perset[setname] = persetfeatures


# Short name -> expression written in terms of the cleaned basic quantities.
equations = {
    "EC": "EC",
    "EX": "EX",
    "FSPE": "FINAL_SINGLE_POINT_ENERGY",
    "DC": "Dispersion_correction",
    "PE": "Potential_Energy",
    "KE": "Kinetic_Energy",
    "OEE": "One_Electron_Energy",
    "TEE": "Two_Electron_Energy",
    "NR": "Nuclear_Repulsion",
}

# NOTE(review): presumably evaluates every equation for each functional/basis
# pair of every record -- confirm in commonutils.
eq_featuresvalues_perset = commonutils.equation_parser_compiler(
    equations, functionals, basis_sets, basicfeattouse, featuresvalues_perset)

# From here on the per-set features are the equation-based ones.
featuresvalues_perset = deepcopy(eq_featuresvalues_perset)


In [None]:
# Functional and basis set used as the reference level when the descriptors
# are built.
selected_basisset = "SVP"
selected_functional = "PBE0"
# NOTE: these two assignments rebind the lists loaded from the pickle files;
# every later use of `functionals` / `basis_sets` sees only these values.
functionals = ["PBE0"]
basis_sets = ["MINIX"]

# compute rmse for QZVP and selected functional
selectednames = set()
for setname in fullsetnames:
    models_results[setname].y_pred_slectedfunc_qzbasis = []
    for features in featuresvalues_perset[setname]:
        for entry in features:
            # Feature names are expected to be made of three "_"-separated
            # fields; anything else is only reported. The name is split once
            # instead of once per field.
            parts = entry.split("_")
            if len(parts) != 3:
                print("WARNING: ", entry)
                continue
            func, basis, value = parts
            if func == selected_functional and basis == "QZVP" and \
                value == "FSPE":
                selectednames.add(setname)
                models_results[\
                    setname].y_pred_slectedfunc_qzbasis.append(\
                        features[entry])

    # Exactly one name (this set's) must have been collected; pop() below
    # empties the set again so it can be reused by the next iteration.
    if len(selectednames) != 1:
        print("ERROR: ", selectednames)
        sys.exit(1)

    models_results[setname].slectedfunc_qzbasis_name = selectednames.pop()
    # RMSE is computed as sqrt(MSE): the ``squared=False`` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(models_results[setname].labels, \
            models_results[setname].y_pred_slectedfunc_qzbasis))
    models_results[setname].slectedfunc_qzbasis_rmse = rmse

    # WTMAD2 is evaluated only for the supersets and for the "Full" set.
    if setname in list(supersetnames.keys()) +["Full"]:

        wtmadf = commonutils.wtmad2(models_results[setname].setnames, \
                                models_results[setname].labels, \
                                models_results[setname].y_pred_slectedfunc_qzbasis)
        wtmad = wtmadf[setname]
        models_results[setname].slectedfunc_qzbasis_wtmad = wtmad
    

sep = "_"
# Tag identifying the reference level of theory inside a feature name.
reftag = selected_functional + sep + selected_basisset
# Tags of every other functional/basis combination, in functional-major order.
othertags = [func + sep + basis
             for func in functionals
             for basis in basis_sets
             if (func, basis) != (selected_functional, selected_basisset)]

for setname in fullsetnames:
    desciptors = {}

    # First pass: features computed at the reference level, kept as they are.
    for features in featuresvalues_perset[setname]:
        for name in features:
            if reftag in name:
                desciptors.setdefault(name, []).append(features[name])

    # Second pass: for every other level, store the difference between the
    # feature and the feature of the same name at the reference level.
    for features in featuresvalues_perset[setname]:
        for name in features:
            for tag in othertags:
                if tag not in name:
                    continue
                refname = reftag + name.replace(tag, "")
                delta = features[name] - features[refname]
                desciptors.setdefault(name + "_difftoref", []).append(delta)

    models_results[setname].features = desciptors

def _check_feature_counts(refset="Full"):
    """Check that every set has as many features as ``refset``.

    Prints the feature count of ``refset`` (naming the set) and returns it.
    If any set in ``fullsetnames`` has a different count, a message naming that
    set is printed and the run is stopped with ``sys.exit(1)``.
    """
    numoffeat = len(models_results[refset].features)
    print("Number of features for ", refset, ": ", numoffeat)
    for name in fullsetnames:
        if len(models_results[name].features) != numoffeat:
            print("Number of features for ", name, " is different")
            sys.exit(1)
    return numoffeat


# feature selection
numoffeat = _check_feature_counts()

# Features that take a single value over the whole "Full" set are constant.
setname = "Full"
toremove = []
for k, values in models_results[setname].features.items():
    if len(set(values)) == 1:
        toremove.append(k)
        print("Constant fatures to remove: ", k)

# remove constant values (as detected on "Full") from every set
for setname in fullsetnames:
    for k in toremove:
        del models_results[setname].features[k]

# force removing every feature whose name contains "NR"
print("Removing Nuclear Repulsion differences")
for setname in fullsetnames:
    # Collect the names first: the dict cannot be modified while iterating it.
    toremove = [k for k in models_results[setname].features if k.find("NR") != -1]
    for k in toremove:
        del models_results[setname].features[k]

# The sets must still agree on the number of features after the removals.
numoffeat = _check_feature_counts()

In [None]:
import importlib
importlib.reload(models)
importlib.reload(commonutils)

import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
import sys
sys.path.append("./CLossLr")
import customlosslr as clr

from commonutils import ModelsStore

def _rmse(y_true, y_hat):
    """Root-mean-squared error between ``y_true`` and ``y_hat``.

    Computed as sqrt(MSE): the ``squared=False`` keyword of
    mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_error(y_true, y_hat))


# For every superset and for "Full": fit a PLS model, a linear regression and
# a custom-loss linear regression, store the models fitted on all the data in
# models_store and print their metrics.
models_store = {}
for setname in list(supersetnames)+["Full"]:
    models_store[setname] = ModelsStore()

    print("Running PLS for dataset: ", setname)

    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].features, \
              models_results[setname].labels)
    X_train, X_test, y_train, y_test = train_test_split(X, Y,
                      test_size=0.20, random_state=42)
    setlist = models_results[setname].setnames
    supersetlist = models_results[setname].supersetnames
    maxcomp = X.shape[1]

    # Scan the number of PLS components from 1 to the number of features.
    ncomps, rmses, r2s, wtmads, loormses = \
          models.pls_model (X, Y, supersetlist, setlist, \
          ncomp_start = 1, ncomp_max = maxcomp, split = False,\
          plot = False, loo=False)
    # NOTE(review): "+1" assumes position i of the returned lists corresponds
    # to i+1 components (scan started at ncomp_start = 1) -- confirm in models.
    r2max_comps = np.argmax(r2s)+1
    rmsemin_comps = np.argmin(rmses)+1
    wtmadmin_comps = np.argmin(wtmads)+1
    # Use the smallest of the three optimal component counts.
    compstouse = min(r2max_comps, rmsemin_comps, wtmadmin_comps)
    print("  Using ", compstouse, " components")

    # Fit the PLS model on the whole set before storing it, and keep its
    # predictions: the metrics below read models_results[setname].y_pred, which
    # nothing else in this notebook assigns.
    plsmodel = PLSRegression(n_components=compstouse)
    plsmodel.fit(X, Y)
    models_store[setname].plsmodel = plsmodel
    models_results[setname].y_pred = plsmodel.predict(X)

    # Leave-one-out RMSE of a PLS model with the chosen number of components.
    cv = LeaveOneOut()
    model = PLSRegression(n_components=compstouse)
    scores = cross_val_score(model, X, Y, \
            scoring='neg_mean_squared_error', \
            cv=cv, n_jobs=-1)
    plsloormse = np.sqrt(np.mean(np.absolute(scores)))
    plsrmse = _rmse(Y, models_results[setname].y_pred)
    plsr2 = r2_score(Y, models_results[setname].y_pred)
    y_pred = models_results[setname].y_pred
    # Use the first column when the prediction comes back as a 2-D array.
    if len(y_pred.shape) == 2:
        y_pred = y_pred[:,0]
    wtmadf = commonutils.wtmad2(setlist, Y, y_pred)
    plswtmad = wtmadf[setname]

    print("           PLS WTMAD: %10.2f"%plswtmad)
    print("            PLS RMSE: %10.2f"%plsrmse)
    print("        PLS LOO RMSE: %10.2f"%plsloormse)

    # Train/test evaluation: keep the component count (up to compstouse) with
    # the lowest RMSE on the held-out 20%; ties keep the smaller count.
    # NOTE(review): the count is selected on the same test split whose RMSE is
    # reported below.
    best_rmse = 0.0
    best_ncomp = 0
    for ncomp in range(1, compstouse+1):
        model = PLSRegression(n_components=ncomp)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = _rmse(y_test, y_pred)
        if ncomp == 1 or rmse < best_rmse:
            best_rmse = rmse
            best_ncomp = ncomp
    model = PLSRegression(n_components=best_ncomp)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plsrmsetest = _rmse(y_test, y_pred)
    y_pred = model.predict(X_train)
    plsrmsetrain = _rmse(y_train, y_pred)

    print("      PLS Train RMSE: %10.2f"%plsrmsetrain)
    print("      PLS  Test RMSE: %10.2f"%plsrmsetest)
    print()

    # Linear regression fitted on all the data (this is the stored model).
    lm = LinearRegression()
    lm.fit(X, Y)
    models_store[setname].lr_model = lm
    y_pred_lr = lm.predict(X)
    wtamd2 = commonutils.wtmad2(setlist, Y, y_pred_lr)
    wtmad_lr = wtamd2[setname]
    lrrmse = _rmse(Y, y_pred_lr)
    # use LOO to get the RMSE
    cv = LeaveOneOut()
    model = LinearRegression()
    scores = cross_val_score(model, X, Y, \
            scoring='neg_mean_squared_error', \
            cv=cv, n_jobs=-1)
    loolrrmse = np.sqrt(np.mean(np.absolute(scores)))

    # A second, separate model is used for the train/test figures.
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_pred_lr = lm.predict(X_test)
    lrrmsetest = _rmse(y_test, y_pred_lr)
    y_pred_lr = lm.predict(X_train)
    lrrmsetrain = _rmse(y_train, y_pred_lr)

    print("            LR WTMAD: %10.2f"%wtmad_lr)
    print("             LR RMSE: %10.2f"%lrrmse)
    print("         LR LOO RMSE: %10.2f"%loolrrmse)
    print("       LR Train RMSE: %10.2f"%lrrmsetrain)
    print("        LR Test RMSE: %10.2f"%lrrmsetest)
    print()

    # Custom-loss linear regression fitted on all the data (stored model).
    clm = clr.custom_loss_lr (loss=clr.mean_absolute_percentage_error)
    clm.fit(X, Y)
    models_store[setname].lr_custom_model = clm
    y_pred_custom_lr = clm.predict(X)
    wtamd2 = commonutils.wtmad2(setlist, Y, y_pred_custom_lr)
    wtmad_custom_lr = wtamd2[setname]
    custom_lrrmse = _rmse(Y, y_pred_custom_lr)
    # No LOO RMSE for the custom model: cross_val_score cannot be used until
    # the model implements the full estimator API, see
    # https://scikit-learn.org/1.5/developers/develop.html
    clm = clr.custom_loss_lr (loss=clr.mean_absolute_percentage_error)
    clm.fit(X_train, y_train)
    y_pred_custom_lr = clm.predict(X_test)
    custom_lrrmsetest = _rmse(y_test, y_pred_custom_lr)
    y_pred_custom_lr = clm.predict(X_train)
    custom_lrrmsetrain = _rmse(y_train, y_pred_custom_lr)

    print("     Custom LR WTMAD: %10.2f"%wtmad_custom_lr)
    print("      Custom LR RMSE: %10.2f"%custom_lrrmse)
    print("Custom LR Train RMSE: %10.2f"%custom_lrrmsetrain)
    print(" Custom LR Test RMSE: %10.2f"%custom_lrrmsetest)
          

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

basissets_touse = set(basis_sets + [selected_basisset])
functional_to_use = set(functionals + [selected_functional])


def _uses_selected_level(featurename):
    """Tell whether ``featurename`` belongs to a level of theory in use.

    True when the name contains at least one functional of
    ``functional_to_use`` and at least one basis set of ``basissets_touse``.
    """
    return any(functional in featurename for functional in functional_to_use) \
        and any(basisset in featurename for basisset in basissets_touse)


# Build the classification data: one row per record of every superset, the
# class being the position of the superset in supersetnameslist.
classes = []
features = {}
supersetnameslist = list(supersetnames.keys())
for setname in featuresvalues_perset:
    if setname not in supersetnameslist:
        continue
    print("Setname: ", setname)
    classid = supersetnameslist.index(setname)
    for entry in featuresvalues_perset[setname]:
        classes.append(classid)
        for featurename in entry:
            # Append each value exactly once per record, even when the name
            # matches several functional/basis combinations (e.g. a functional
            # name contained in another); otherwise the columns end up with
            # different lengths.
            if _uses_selected_level(featurename):
                features.setdefault(featurename, []).append(entry[featurename])

X = pd.DataFrame(features)
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.20, random_state=42)

# Scan the number of trees and record the accuracy on the held-out 20%.
# NOTE(review): the number of trees is selected on the same test split that is
# used to report the test accuracy below.
accuracys = []
numoftrees = []
for ntrees in range(10, 200, 10):
    rf = RandomForestClassifier(n_estimators=ntrees, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracys.append(accuracy)
    numoftrees.append(ntrees)

# index() returns the first maximum, so ties keep the smallest forest.
bestaccuracy = max(accuracys)
bestntrees = numoftrees[accuracys.index(bestaccuracy)]
print("Best accuracy: ", bestaccuracy, " with ", bestntrees, " trees")

# Refit with the selected number of trees and report the accuracies.
rf = RandomForestClassifier(n_estimators=bestntrees, random_state=42)
rf.fit(X_train, y_train)
testaccuracy = rf.score(X_test, y_test)
trainaccuracy = rf.score(X_train, y_train)
overallaccuracy = rf.score(X, classes)
print("  Train accuracy: %5.2f"%(trainaccuracy))
print("   Test accuracy: %5.2f"%(testaccuracy))
print("Overall accuracy: %5.2f"%(overallaccuracy))


In [None]:
setname = "Full"
# The models are stored in models_store by the training loop
# (models_store[...].plsmodel / .lr_model / .lr_custom_model); nothing in the
# notebook assigns them on models_results, so they are read from models_store.
pls_model_full = models_store[setname].plsmodel
lr_model_full = models_store[setname].lr_model
lr_custom_model_full = models_store[setname].lr_custom_model

# Accumulators for the predictions and set names gathered over the supersets,
# one pair per model type.
ypredFull = []
setnamesFull = []
ypredFull_lr = []
setnamesFull_lr = []
ypredFull_lr_custom = []
setnamesFull_lr_custom = []

# For every superset: fetch its models, predict with the PLS model and print
# the resulting WTMAD2 next to a stored best-method figure.
for ssetname in supersetnames:
    print("Supersetnam ", ssetname)    
    # NOTE(review): the training cell stores the models in models_store[...];
    # confirm that models_results[...] really exposes plsmodel / lr_model /
    # lr_custom_model.
    pls_model_ssetname = models_results[ssetname].plsmodel
    lr_model_ssetname = models_results[ssetname].lr_model
    lr_custom_model_ssetname = models_results[ssetname].lr_custom_model

    # NOTE(review): `setname` is still "Full" here, so X, Y and setlist are
    # rebuilt from the Full set on every iteration; presumably `ssetname` was
    # intended -- confirm.
    X, Y, features_names = \
        commonutils.build_XY_matrix (models_results[setname].features, \
                                    models_results[setname].labels)
    setlist = models_results[setname].setnames
    setnamesFull.extend(setlist)

    # PLS 
    y_pred = pls_model_ssetname.predict(X)
    # NOTE(review): y_pred is appended to ypredFull here and again after the
    # flattening below, so every prediction is stored twice -- confirm that
    # this is intended.
    ypredFull.extend(y_pred)
    # Use the first column when the prediction comes back as a 2-D array.
    if len(y_pred.shape) == 2:
        y_pred = y_pred[:,0]
    wtmad2df = commonutils.wtmad2(setlist, Y, y_pred)
    wtamd2 = wtmad2df[ssetname]
    ypredFull.extend(y_pred)
    print(" SS PLS WTMAD2","%7.3f"%(wtamd2))

    # NOTE(review): the attributes assigned earlier in the notebook are named
    # bestourmethod_rmse_name / bestourmethod_wtmad_name, not
    # bestourmethod_name_rmse, and the value printed next to it is the best
    # *inside* method WTMAD -- confirm which name/value pair is meant.
    print("%5s WTMAD2 %7.3f"%(models_results[ssetname].bestourmethod_name_rmse, \
                            models_results[ssetname].bestinsidemethod_wtmad))
    