In [1]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit
import os 

import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (LeaveOneOut, cross_val_predict,
                                     cross_val_score, train_test_split)

import numpy as np 

from rdkit.Chem import Descriptors
#print(len(Descriptors._descList))
#print(Descriptors._descList[:5])
#print(rdkit.__version__)

from contextlib import contextmanager,redirect_stderr,redirect_stdout
from os import devnull

@contextmanager
def suppress_stdout_stderr():
    """A context manager that redirects stdout and stderr to devnull"""
    with open(devnull, 'w') as fnull:
        with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
            yield (err, out)

DEfine basic data structures and config

In [2]:
# those are the output data
molnames = []
labels = []
diffs_toothermethods = []
chemical_reacts = []
stechio_ceofs = []
moldescriptors = []

howmanydifs = 3
rootdirqdata = './qdata/'
rootdirdata = './data/'

Read molecules labels and more

In [3]:

fp = open(rootdirdata + '/labels.txt', 'r')

for line in fp:
    sline = line.replace("\t", " ").replace("\n", "").rstrip().lstrip().split()
    molname = sline[1]
    
    difvals = []   
    for i in range(howmanydifs):
        difvals.append(float(sline[-1*i]))

    schechio = []
    reacts = []
    for i in range(2,len(sline)-howmanydifs-1):
        nospace = sline[i].replace(" ", "")
        if nospace.isdigit():
            schechio.append(int(nospace))
        elif nospace.startswith("-") and nospace[1:].isdigit():
            schechio.append(int(nospace))
        else:
            reacts.append(nospace)

    stechio_ceofs.append(schechio)
    chemical_reacts.append(reacts)
    diffs_toothermethods.append(difvals)
    labels.append(float(sline[-1*howmanydifs-1]))
    molnames.append(molname)
    moldescriptors.append({})

fp.close()

print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


Read PBE data

In [4]:
import re

pbelist = ["Nuclear Repulsion  :", \
            "One Electron Energy:", \
            "Two Electron Energy:", \
            "Potential Energy   :", \
            "Kinetic Energy     :", \
            "E(X)               :"  , \
            "E(C)               :"  , \
            "Dispersion correction", \
            "Total Charge"   , \
            "Multiplicity"   , \
            "Number of Electrons"]

pbedescriptor = {}

for file in os.listdir(rootdirqdata+'/PBE/'):
    if file.endswith('.out'):
        molname = file.split('.out')[0]
        molname = re.split("\.mpi\d+", molname)[0]
        #print(molname)
        moldesc = {}
        fp = open(rootdirqdata+'/PBE/'+file, 'r')
        for line in fp:
            for val in pbelist:
                if line.find(val) != -1:
                    keyval = val.replace(":", "").rstrip().lstrip().replace(" ", "_")
                    sline = line.rstrip().lstrip().split()
                    for sval in sline:
                        try:
                            firstnumvalue = float(sval)
                            break
                        except:
                            continue
                    
                    moldesc[keyval] = firstnumvalue
                    #print(molname, keyval, sval)
        fp.close()
        pbedescriptor[molname] = moldesc

for i, molname in enumerate(molnames):
    if molname in pbedescriptor:
        moldescriptors[i]["PBE_Total_Charge"] = pbedescriptor[molname]["Total_Charge"]
        moldescriptors[i]["PBE_Multiplicity"] = pbedescriptor[molname]["Multiplicity"]
        moldescriptors[i]["PBE_Number_of_Electrons"] = pbedescriptor[molname]["Number_of_Electrons"]
        moldescriptors[i]["PBE_Nuclear_Repulsion"] = pbedescriptor[molname]["Nuclear_Repulsion"]
        moldescriptors[i]["PBE_One_Electron_Energy"] = pbedescriptor[molname]["One_Electron_Energy"]
        moldescriptors[i]["PBE_Two_Electron_Energy"] = pbedescriptor[molname]["Two_Electron_Energy"]
        moldescriptors[i]["PBE_Potential_Energy"] = pbedescriptor[molname]["Potential_Energy"]
        moldescriptors[i]["PBE_Kinetic_Energy"] = pbedescriptor[molname]["Kinetic_Energy"]
        moldescriptors[i]["PBE_E(X)"] = pbedescriptor[molname]["E(X)"]
        moldescriptors[i]["PBE_E(C)"] = pbedescriptor[molname]["E(C)"]
        moldescriptors[i]["PBE_Dispersion_correction"] = \
            pbedescriptor[molname]["Dispersion_correction"]
    else:
        print(molname + " not found in PBE descriptors")

print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


Read HF data

In [5]:

hflist = ["Nuclear Repulsion  :", \
          "One Electron Energy:", \
          "Two Electron Energy:", \
          "Potential Energy   :", \
          "Kinetic Energy     :", \
          "Dispersion correction", \
          "Total Charge", \
          "Multiplicity", \
          "Number of Electrons"]

hfdescriptor = {}

for file in os.listdir(rootdirqdata+'/HF/'):
    if file.endswith('.out'):
        molname = file.split('.out')[0]
        molname = re.split("\.mpi\d+", molname)[0]
        #print(molname)
        moldesc = {}
        fp = open(rootdirqdata+'/HF/'+file, 'r')
        for line in fp:
            for val in hflist:
                if line.find(val) != -1:
                    keyval = val.replace(":", "").rstrip().lstrip().replace(" ", "_")
                    sline = line.rstrip().lstrip().split()
                    for sval in sline:
                        try:
                            firstnumvalue = float(sval)
                            break
                        except:
                            continue
                    
                    moldesc[keyval] = firstnumvalue
                    #print(molname, keyval, sval)
        fp.close()
        hfdescriptor[molname] = moldesc

for i, molname in enumerate(molnames):
    if molname in pbedescriptor:
        moldescriptors[i]["HF_Total_Charge"] = hfdescriptor[molname]["Total_Charge"]
        moldescriptors[i]["HF_Multiplicity"] = hfdescriptor[molname]["Multiplicity"]
        moldescriptors[i]["HF_Number_of_Electrons"] = hfdescriptor[molname]["Number_of_Electrons"]
        moldescriptors[i]["HF_Nuclear_Repulsion"] = hfdescriptor[molname]["Nuclear_Repulsion"]
        moldescriptors[i]["HF_One_Electron_Energy"] = hfdescriptor[molname]["One_Electron_Energy"]
        moldescriptors[i]["HF_Two_Electron_Energy"] = hfdescriptor[molname]["Two_Electron_Energy"]
        moldescriptors[i]["HF_Potential_Energy"] = hfdescriptor[molname]["Potential_Energy"]
        moldescriptors[i]["HF_Kinetic_Energy"] = hfdescriptor[molname]["Kinetic_Energy"]
        moldescriptors[i]["HF_Dispersion_correction"] = \
            hfdescriptor[molname]["Dispersion_correction"]
    else:
        print(molname + " not found in HF descriptors")


print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


Remove molecules with some missing descriptor

In [6]:
alldims = set([len(val) for val in moldescriptors])
idxtoremovs = []
for i, val in enumerate(moldescriptors):
    if len(val) != max(alldims):
        idxtoremovs.append(i)

for i in sorted(idxtoremovs, reverse=True):
    print("Molname to remove:", molnames[i], "index:", i)
    del moldescriptors[i]
    del labels[i]
    del molnames[i]
    del diffs_toothermethods[i]
    del chemical_reacts[i]
    del stechio_ceofs[i]

print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


Remove molecules with None Label 

In [7]:
for i, v in enumerate(labels):
    if v is None:
        print("None value found in labels:", i, molnames[i])
        del moldescriptors[i]
        del labels[i]
        del molnames[i]
        del diffs_toothermethods[i]
        del chemical_reacts[i]
        del stechio_ceofs[i]

print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


If a descriptor is nan at least for a molecule remove from all

In [8]:
import math 

nandescriptors = set()
for index, molname in enumerate(molnames):
    if any(math.isnan(val) for val in moldescriptors[index].values()):
        print("Nan value found in descriptors:", molname)
        for k,v in moldescriptors[index].items():
            if math.isnan(v):
                nandescriptors.add(k)

print("Removing the following Descriptors ", nandescriptors)
print("Removing ", len(nandescriptors), " descriptors")
for i, v in enumerate(moldescriptors):
    for k in nandescriptors:
        del moldescriptors[i][k]


print("")
print("Number of descriptors:", len(moldescriptors[0]))
print("")
print("")
print("Number of labels         :", len(labels))
print("Number of names          :", len(molnames))
print("Number of differences    :", len(diffs_toothermethods))
print("Number of chemicals      :", len(chemical_reacts))
print("Number of stechio ceofs  :", len(stechio_ceofs))
print("Number of moldescriptors :", len(moldescriptors))

Removing the following Descriptors  set()
Removing  0  descriptors

Number of descriptors: 20


Number of labels         : 140
Number of names          : 140
Number of differences    : 140
Number of chemicals      : 140
Number of stechio ceofs  : 140
Number of moldescriptors : 140


Read chemicals

In [9]:
pbelist = ["Nuclear Repulsion  :", \
            "One Electron Energy:", \
            "Two Electron Energy:", \
            "Potential Energy   :", \
            "Kinetic Energy     :", \
            "E(X)               :"  , \
            "E(C)               :"  , \
            "Dispersion correction", \
            "Total Charge"   , \
            "Multiplicity"   , \
            "Number of Electrons"]

pbedescriptor = {}
for v in chemical_reacts:
    for chem in v:

        fp = open(rootdirqdata + '/PBE/'+chem+'.out', 'r')
        for line in fp:
            for val in pbelist:
                if line.find(val) != -1:
                    keyval = val.replace(":", "").rstrip().lstrip().replace(" ", "_")
                    sline = line.rstrip().lstrip().split()
                    for sval in sline:
                        try:
                            firstnumvalue = float(sval)
                            break
                        except:
                            continue
                    
                    moldesc[keyval] = firstnumvalue
        fp.close()

        pbedescriptor[chem] = moldesc

print("")
print(pbedescriptor)


{'h': {'Total_Charge': 0.0, 'Multiplicity': 1.0, 'Number_of_Electrons': 4.0, 'Nuclear_Repulsion': 0.0, 'One_Electron_Energy': -19.16643248, 'Two_Electron_Energy': 4.64776258, 'Potential_Energy': -29.28434581, 'Kinetic_Energy': 14.76567591, 'Dispersion_correction': 0.0, 'E(X)': -2.683079126669, 'E(C)': -0.087061447107}, 'al': {'Total_Charge': 0.0, 'Multiplicity': 1.0, 'Number_of_Electrons': 4.0, 'Nuclear_Repulsion': 0.0, 'One_Electron_Energy': -19.16643248, 'Two_Electron_Energy': 4.64776258, 'Potential_Energy': -29.28434581, 'Kinetic_Energy': 14.76567591, 'Dispersion_correction': 0.0, 'E(X)': -2.683079126669, 'E(C)': -0.087061447107}, 'si': {'Total_Charge': 0.0, 'Multiplicity': 1.0, 'Number_of_Electrons': 4.0, 'Nuclear_Repulsion': 0.0, 'One_Electron_Energy': -19.16643248, 'Two_Electron_Energy': 4.64776258, 'Potential_Energy': -29.28434581, 'Kinetic_Energy': 14.76567591, 'Dispersion_correction': 0.0, 'E(X)': -2.683079126669, 'E(C)': -0.087061447107}, 'b': {'Total_Charge': 0.0, 'Multipli