In [None]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit
import os 

import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (LeaveOneOut, cross_val_predict,
                                     cross_val_score, train_test_split)

import numpy as np 

from rdkit.Chem import Descriptors
#print(len(Descriptors._descList))
#print(Descriptors._descList[:5])
#print(rdkit.__version__)

from contextlib import contextmanager,redirect_stderr,redirect_stdout
from os import devnull

@contextmanager
def suppress_stdout_stderr():
    """Temporarily send ``sys.stdout`` and ``sys.stderr`` to the null device.

    Yields:
        A ``(err, out)`` tuple holding the replacement stderr and stdout
        targets; both are the same writable handle opened on ``devnull``.
        The handle is closed when the ``with`` body finishes.
    """
    sink = open(devnull, 'w')
    try:
        # stderr is redirected first and therefore restored last
        with redirect_stderr(sink) as err:
            with redirect_stdout(sink) as out:
                yield (err, out)
    finally:
        sink.close()

Read molecule labels and generate standard descriptors

In [None]:
# those are the output data
# Four parallel lists: position k in each one refers to the same molecule.
moldescriptors = []
molnames = []
labels = []
molecules = []

# Each non-empty line of labels.txt is whitespace separated; the molecule
# name is the second field and the numeric label is the last field.
with open('./data/labels.txt', 'r') as fp:
    for line in fp:
        # split() without arguments splits on any run of whitespace
        # (tabs, spaces, trailing newline), so no pre-cleaning is needed
        sline = line.split()
        if not sline:
            # blank line (e.g. a trailing newline at the end of the file)
            continue

        molname = sline[1]

        molnames.append(molname)
        labels.append(float(sline[-1]))
        # placeholders, filled in by the later steps
        molecules.append(None)
        moldescriptors.append(None)

# Load one RDKit molecule for every sub-directory of ./data whose name appears
# in molnames. idxtorm collects the positions (into the parallel lists) of the
# molecules that have to be dropped later on.
idxtorm = []
for moldir in os.listdir('./data'):
    molpath = os.path.join('./data', moldir)
    if not os.path.isdir(molpath) or moldir not in molnames:
        continue

    index = molnames.index(moldir)
    pdbfiles = [fname for fname in os.listdir(molpath) if fname.endswith('.pdb')]

    if not pdbfiles:
        # Without a structure there is no fingerprint; flag the entry so it
        # is removed instead of silently staying as None.
        print("No PDB file found for:", moldir)
        if index not in idxtorm:
            idxtorm.append(index)
        continue

    for pdbfile in pdbfiles:
        m = None
        with suppress_stdout_stderr():
            m = rdkit.Chem.rdmolfiles.MolFromPDBFile(os.path.join(molpath, pdbfile))

        if m is None:
            print("Error reading file:", moldir)
            # queue each index once only: the removal step deletes by
            # position, so a repeated index would delete a second entry
            if index not in idxtorm:
                idxtorm.append(index)
        else:
            molecules[index] = m
    
# generate the fingerprints and descriptors
fingersize = 64
rdgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=fingersize)

# value stored for a descriptor whose function raised
missingVal = None

for idx, m in enumerate(molecules):
    res = {}

    # count fingerprint as a NumPy array; stays None when no molecule was loaded
    np_counts = None
    if m is not None:
        np_counts = rdgen.GetCountFingerprintAsNumPy(m)

    res["Fingerprint"] = np_counts

    for nm, fn in Descriptors._descList:
        val = None
        # some of the descriptor functions can throw errors if they fail,
        # catch those here. Only Exception is caught so that Ctrl-C
        # (KeyboardInterrupt) still stops a long-running cell.
        try:
            # redirect stdout/stderr to suppress the warning messages emitted
            # when calling a function that is deprecated
            with suppress_stdout_stderr():
                val = fn(m)
        except Exception:
            # set the descriptor value to whatever missingVal is
            val = missingVal

        res[nm] = val

    moldescriptors[idx] = res

# need to add charge and spin to the descriptors
# For every labelled molecule directory, the first line of the ".CHRG" file is
# stored as "Charge" and the first line of the ".UHF" file as "Spin".
for moldir in os.listdir('./data'):
    molpath = os.path.join('./data', moldir)
    if not os.path.isdir(molpath) or moldir not in molnames:
        continue

    index = molnames.index(moldir)
    chrgfile = os.path.join(molpath, ".CHRG")
    uhffile = os.path.join(molpath, ".UHF")

    if os.path.isfile(chrgfile) and os.path.isfile(uhffile):
        # "with" closes the files even if float() raises on bad content
        with open(chrgfile, "r") as fp:
            charge = float(fp.readline().strip())
        with open(uhffile, "r") as fp:
            spin = float(fp.readline().strip())
        moldescriptors[index]["Charge"] = charge
        moldescriptors[index]["Spin"] = spin
    else:
        print("Charge, UHF, molname not found:", moldir)
        # queue each index once only: the removal step deletes by position,
        # so a repeated index would delete a second, unrelated entry
        if index not in idxtorm:
            idxtorm.append(index)

# Replace missing descriptor values (None or NaN) with 0.0.
for desc in moldescriptors:
    for k, v in desc.items():
        # NaN has to be detected by value: "v is np.nan" is an identity test
        # and is False for a NaN produced by a computation. The isinstance
        # guard keeps array values (the fingerprint) away from the scalar test.
        if v is None or (isinstance(v, float) and np.isnan(v)):
            # assigning to an existing key does not resize the dict, so it is
            # safe while iterating over items()
            desc[k] = 0.0

# remove all molecules without a label
for i, val in enumerate(labels):
    if val is None:
        print("Label not presente for:", molnames[i])
        # flag the position of THIS label (not a leftover loop variable)
        idxtorm.append(i)

# De-duplicate before deleting: entries are removed by position, so an index
# listed twice would also delete whichever entry slid into that slot.
# Deleting from the highest index down keeps the remaining indices valid.
for i in sorted(set(idxtorm), reverse=True):
    del labels[i]
    del moldescriptors[i]
    del molnames[i]
    del molecules[i]

# the four parallel lists must report the same length
print("")
print("Number of molecules:", len(molecules))
print("Number of labels:", len(labels))
print("Number of descriptors:", len(moldescriptors))
print("Number of names:", len(molnames))

Read PBE data

In [None]:
import re

# Text fragments that identify the lines of interest in each PBE output file.
# They are matched as plain substrings, so the spacing before ':' matters.
pbelist = ["Nuclear Repulsion  :",
           "One Electron Energy:",
           "Two Electron Energy:",
           "Potential Energy   :",
           "Kinetic Energy     :",
           "E(X)               :",
           "E(C)               :",
           "Dispersion correction",
           "Total Charge",
           "Multiplicity",
           "Number of Electrons"]

# molecule name -> {key -> first numeric token of the last matching line}
pbedescriptor = {}

for file in os.listdir('./qdata/PBE/'):
    if not file.endswith('.out'):
        continue

    molname = file.split('.out')[0]
    # drop a ".mpi<digits>" suffix (and anything after it) from the name
    molname = re.split(r"\.mpi\d+", molname)[0]
    moldesc = {}
    with open('./qdata/PBE/'+file, 'r') as fp:
        for line in fp:
            for val in pbelist:
                if val not in line:
                    continue
                # "Nuclear Repulsion  :" -> "Nuclear_Repulsion"
                keyval = val.replace(":", "").strip().replace(" ", "_")
                # reset for every match so a line without any number cannot
                # reuse the value parsed from a previous line
                firstnumvalue = None
                for sval in line.split():
                    try:
                        firstnumvalue = float(sval)
                        break
                    except ValueError:
                        continue

                if firstnumvalue is not None:
                    moldesc[keyval] = firstnumvalue
    pbedescriptor[molname] = moldesc

# Keys copied into the per-molecule descriptors, each prefixed with "PBE_".
# The order is the order in which the features are inserted.
pbekeys = ["Total_Charge",
           "Multiplicity",
           "Number_of_Electrons",
           "Nuclear_Repulsion",
           "One_Electron_Energy",
           "Two_Electron_Energy",
           "Potential_Energy",
           "Kinetic_Energy",
           "E(X)",
           "E(C)",
           "Dispersion_correction"]

for i, molname in enumerate(molnames):
    if molname in pbedescriptor:
        found = pbedescriptor[molname]
        for key in pbekeys:
            if key in found:
                moldescriptors[i]["PBE_" + key] = found[key]
        # report absent values instead of failing with a KeyError; the
        # molecule then simply ends up with fewer descriptors than the others
        missing = [key for key in pbekeys if key not in found]
        if missing:
            print(molname + " is missing PBE descriptors:", missing)
    else:
        print(molname + " not found in PBE descriptors")


print("")
print("Number of molecules:", len(molecules))
print("Number of labels:", len(labels))
print("Number of descriptors:", len(moldescriptors))
print("Number of names:", len(molnames))

Read HF data

In [None]:

# Text fragments that identify the lines of interest in each HF output file.
# They are matched as plain substrings, so the spacing before ':' matters.
hflist = ["Nuclear Repulsion  :",
          "One Electron Energy:",
          "Two Electron Energy:",
          "Potential Energy   :",
          "Kinetic Energy     :",
          "Dispersion correction",
          "Total Charge",
          "Multiplicity",
          "Number of Electrons"]

# molecule name -> {key -> first numeric token of the last matching line}
hfdescriptor = {}

# List the HF directory itself: the files opened below live in ./qdata/HF/,
# so listing another directory can name files that do not exist here.
for file in os.listdir('./qdata/HF/'):
    if not file.endswith('.out'):
        continue

    molname = file.split('.out')[0]
    # drop a ".mpi<digits>" suffix (and anything after it) from the name
    molname = re.split(r"\.mpi\d+", molname)[0]
    moldesc = {}
    with open('./qdata/HF/'+file, 'r') as fp:
        for line in fp:
            for val in hflist:
                if val not in line:
                    continue
                # "Nuclear Repulsion  :" -> "Nuclear_Repulsion"
                keyval = val.replace(":", "").strip().replace(" ", "_")
                # reset for every match so a line without any number cannot
                # reuse the value parsed from a previous line
                firstnumvalue = None
                for sval in line.split():
                    try:
                        firstnumvalue = float(sval)
                        break
                    except ValueError:
                        continue

                if firstnumvalue is not None:
                    moldesc[keyval] = firstnumvalue
    hfdescriptor[molname] = moldesc

# Keys copied into the per-molecule descriptors, each prefixed with "HF_".
# The order is the order in which the features are inserted.
hfkeys = ["Total_Charge",
          "Multiplicity",
          "Number_of_Electrons",
          "Nuclear_Repulsion",
          "One_Electron_Energy",
          "Two_Electron_Energy",
          "Potential_Energy",
          "Kinetic_Energy",
          "Dispersion_correction"]

for i, molname in enumerate(molnames):
    # membership must be tested on the HF results, which are indexed below
    if molname in hfdescriptor:
        found = hfdescriptor[molname]
        for key in hfkeys:
            if key in found:
                moldescriptors[i]["HF_" + key] = found[key]
        # report absent values instead of failing with a KeyError; the
        # molecule then simply ends up with fewer descriptors than the others
        missing = [key for key in hfkeys if key not in found]
        if missing:
            print(molname + " is missing HF descriptors:", missing)
    else:
        print(molname + " not found in HF descriptors")


print("")
print("Number of molecules:", len(molecules))
print("Number of labels:", len(labels))
print("Number of descriptors:", len(moldescriptors))
print("Number of names:", len(molnames))

Remove molecules with missing descriptors

In [None]:
# A molecule is complete when it has as many descriptor entries as the
# largest descriptor dict; every shorter one is dropped.
alldims = {len(desc) for desc in moldescriptors}
fulldim = max(alldims, default=0)
idxtoremovs = [i for i, val in enumerate(moldescriptors) if len(val) != fulldim]

# idxtoremovs is built in ascending order, so walking it backwards deletes
# from the end and keeps the remaining positions valid
for i in reversed(idxtoremovs):
    print("Molname to remove:", molnames[i], "index:", i)
    del moldescriptors[i]
    del labels[i]
    del molnames[i]
    del molecules[i]

print("")
for caption, seq in (("Number of molecules:", molecules),
                     ("Number of labels:", labels),
                     ("Number of descriptors:", moldescriptors),
                     ("Number of names:", molnames)):
    print(caption, len(seq))

Remove molecules with a None label

In [None]:
# Collect the positions first and delete afterwards: deleting from a list
# while enumerating it shifts the remaining items and skips the element that
# follows every deletion.
noneidx = [i for i, v in enumerate(labels) if v is None]

# delete from the highest position down so earlier positions stay valid
for i in reversed(noneidx):
    print("None value found in labels:", i, molnames[i])
    del moldescriptors[i]
    del labels[i]
    del molnames[i]
    del molecules[i]

print("")
print("Number of molecules:", len(molecules))
print("Number of labels:", len(labels))
print("Number of descriptors:", len(moldescriptors))
print("Number of names:", len(molnames))

In [None]:
# Flatten every descriptor dict into one numeric row (features) and collect
# the matching label (Y).
moldescriptors_featues = []
Y = []
names = []  # kept for compatibility; nothing in this cell fills it

for idx, molname in enumerate(molnames):
    row = []
    for key, value in moldescriptors[idx].items():
        if key == "Fingerprint":
            # the fingerprint is a sequence: one feature column per element
            row.extend(value)
        else:
            row.append(value)
    moldescriptors_featues.append(row)
    Y.append(labels[idx])

Y = np.array(Y)
moldescriptors_featues = np.array(moldescriptors_featues)
# The second positional parameter of nan_to_num is "copy", not the fill
# value, so the replacement for NaN has to be passed by keyword.
moldescriptors_featues = np.nan_to_num(moldescriptors_featues, nan=0.0)

In [None]:
import models
%reload_ext models

# Run the PLS model; the first argument is the value held in perc_split
# (presumably the test-split fraction - confirm against models.pls_model).
perc_split = 0.2
models.pls_model(perc_split, moldescriptors_featues, Y)

In [None]:
# Same call with two extra positional arguments (False, 7); their meaning is
# defined by models.pls_model - presumably a search switch and a component
# count, TODO confirm.
perc_split = 0.2
models.pls_model(perc_split, moldescriptors_featues, Y, False, 7)

In [None]:
# Disabled cell: only the extension reload below is live. The commented-out
# statement would call models.rf_model and unpack its four results into
# min_train_rmse_hyper, min_test_rmse_hyper, max_train_r2_hyper and
# max_test_r2_hyper.
%reload_ext models
#perc_split = 0.2
#min_train_rmse_hyper, min_test_rmse_hyper, max_train_r2_hyper, max_test_r2_hyper = \
#    models.rf_model (0.2, moldescriptors_featues, Y)

In [None]:
# Disabled cell: only the extension reload below is live. The commented-out
# loop would call models.rf_model once per dict d, passing each entry of d as
# a single-element list. The four *_hyper names it iterates over are assigned
# nowhere in live code, so they must be defined before this is re-enabled.
%reload_ext models
#for d in [min_train_rmse_hyper, min_test_rmse_hyper, max_train_r2_hyper, max_test_r2_hyper]:
#    perc_split = 0.2
#    models.rf_model (0.2, moldescriptors_featues, Y, False, [d['n_estimators']], \
#                     [d['max_depth']], [d['min_samples_split']], \
#                         [d['min_samples_leaf']], [d['random_state']], \
#                             [d['bootstrap']], [d['max_features']])
