In [None]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
import rdkit
import os 

import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (LeaveOneOut, cross_val_predict,
                                     cross_val_score, train_test_split)

import numpy as np 

from rdkit.Chem import Descriptors
# Environment sanity check, printed one item per line:
#   1. how many descriptors RDKit has registered,
#   2. the first five (name, function) entries of that registry,
#   3. the RDKit version in use.
print(
    len(Descriptors._descList),
    Descriptors._descList[:5],
    rdkit.__version__,
    sep="\n",
)

In [None]:
# Containers filled by this cell.
molecules = []   # RDKit molecules parsed from the .pdb files
fnames = []      # for each parsed molecule, the (stripped) name of its directory
lnames = []      # molecule names read from ./data/labels.txt (filled below)
labels = []      # numeric label for each entry of lnames (filled below)

# Walk ./data/<subdir>/*.pdb.
# - Entries are sorted because os.listdir() returns names in arbitrary order and
#   a later cell uses the FIRST molecule recorded for a given directory name;
#   sorting makes that choice reproducible.
# - The loop variables are not called `dir` / `file`: top-level loop variables
#   stay in the kernel namespace, and `dir` would shadow the builtin dir().
data_root = './data'
for subdir in sorted(os.listdir(data_root)):
    subdir_path = os.path.join(data_root, subdir)
    if not os.path.isdir(subdir_path):
        continue
    for pdb_name in sorted(os.listdir(subdir_path)):
        if not pdb_name.endswith('.pdb'):
            continue
        m = rdkit.Chem.rdmolfiles.MolFromPDBFile(os.path.join(subdir_path, pdb_name))
        if m is None:
            # A None result is treated as a read failure; the file is skipped.
            print("Error reading file:", pdb_name)
        else:
            molecules.append(m)
            fnames.append(subdir.strip())

# Parse ./data/labels.txt. Each non-empty line is split on whitespace; the
# second token is stored as the molecule name and the last token as its
# numeric label.
# The context manager closes the file even if a line fails to parse, and
# blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of raising IndexError on sline[1].
with open('./data/labels.txt', 'r') as fp:
    for line in fp:
        sline = line.split()  # split() already ignores leading/trailing whitespace
        if not sline:
            continue
        lnames.append(sline[1])
        labels.append(float(sline[-1]))

import traceback

# RDKit fingerprint generator shared by every molecule.
fingersize = 64
rdgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=fingersize)

# Per-molecule outputs, index-aligned with `molecules`.
np_bits_l = []       # result of GetFingerprintAsNumPy for each molecule
np_counts_l = []     # result of GetCountFingerprintAsNumPy for each molecule
moldescriptors = []  # one {descriptor name: value} dict per molecule

# Placeholder stored when a descriptor function raises.
missingVal = None

for m in molecules:
    np_bits = rdgen.GetFingerprintAsNumPy(m)
    np_bits_l.append(np_bits)
    np_counts = rdgen.GetCountFingerprintAsNumPy(m)
    np_counts_l.append(np_counts)

    res = {}
    for nm, fn in Descriptors._descList:
        # Some of the descriptor functions can throw errors if they fail.
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the cell instead of being recorded as a missing value.
        try:
            val = fn(m)
        except Exception:
            # Show the error, then fall back to the missing-value placeholder.
            traceback.print_exc()
            val = missingVal

        res[nm] = val

    moldescriptors.append(res)

# Replace missing descriptor values (None or NaN) with 0.0, in place.
# NaN must be detected by value: `v is np.nan` is an identity test and is False
# for a NaN float object other than the np.nan singleton, so such values would
# slip through.
for desc in moldescriptors:
    for k, v in desc.items():
        # Only existing keys are reassigned, so mutating during iteration is safe.
        if v is None or (isinstance(v, (float, np.floating)) and np.isnan(v)):
            desc[k] = 0.0

In [None]:
# Row-aligned feature sets and target, one row per molecule that has both a
# parsed structure (fnames) and a label (lnames).
np_bits_features = []
np_counts_features = []
moldescriptors_featues = []
Y = []
names = []

# First position of every name in each list. This matches what list.index()
# returns, but costs one pass instead of a linear scan per molecule.
first_fidx = {}
for pos, nm in enumerate(fnames):
    first_fidx.setdefault(nm, pos)
first_lidx = {}
for pos, nm in enumerate(lnames):
    first_lidx.setdefault(nm, pos)

s_fnames = set(fnames)
s_lnames = set(lnames)
s_int = s_fnames.intersection(s_lnames)

# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter sessions (string hash randomisation), which would
# reorder the rows and make downstream train/test splits non-reproducible.
for molname in sorted(s_int):
    # index of molname in fnames
    idx = first_fidx[molname]
    np_bits_features.append(np_bits_l[idx])
    np_counts_features.append(np_counts_l[idx])
    val = list(moldescriptors[idx].values())
    moldescriptors_featues.append(val)
    # index of molname in lnames
    idx2 = first_lidx[molname]
    Y.append(labels[idx2])
    names.append(molname)

# Convert the accumulated Python lists to NumPy arrays for the models.
Y = np.array(Y)
np_bits_features = np.array(np_bits_features)
np_counts_features = np.array(np_counts_features)
moldescriptors_featues = np.array(moldescriptors_featues)

# Replace any remaining NaN with 0.0. The replacement value must be passed by
# keyword: the second positional parameter of np.nan_to_num is `copy`, so the
# former call `np.nan_to_num(x, 0.0)` was really `copy=False` and only produced
# zeros because `nan` defaults to 0.0. (+/-inf are still mapped to very large
# finite numbers, which is NumPy's default.)
moldescriptors_featues = np.nan_to_num(moldescriptors_featues, nan=0.0)

In [None]:
import importlib

import models

# Pick up edits made to models.py since it was first imported.
# `%reload_ext` is meant for IPython extensions (modules exposing
# load_ipython_extension); for an ordinary module it does not perform a reload,
# so importlib.reload is used instead.
importlib.reload(models)

# PLS model on the count fingerprints with the default settings of
# models.pls_model. perc_split is passed by name so that editing it actually
# changes the call (presumably the held-out fraction -- confirm against
# models.pls_model).
perc_split = 0.2
models.pls_model(perc_split, np_counts_features, Y)

In [None]:
# PLS model on the count fingerprints with two extra positional arguments
# (False, 3); their meaning is defined in models.pls_model, not visible here.
# perc_split is passed by name instead of repeating the literal, so editing it
# actually changes the call.
perc_split = 0.2
models.pls_model(perc_split, np_counts_features, Y, False, 3)

In [None]:
# PLS model on the bit fingerprints with the default settings of
# models.pls_model. perc_split is passed by name instead of repeating the
# literal, so editing it actually changes the call.
perc_split = 0.2
models.pls_model(perc_split, np_bits_features, Y)

In [None]:
# PLS model on the bit fingerprints with two extra positional arguments
# (False, 3); their meaning is defined in models.pls_model, not visible here.
# perc_split is passed by name instead of repeating the literal, so editing it
# actually changes the call.
perc_split = 0.2
models.pls_model(perc_split, np_bits_features, Y, False, 3)

In [None]:
# PLS model on the RDKit molecular descriptors with the default settings of
# models.pls_model. perc_split is passed by name instead of repeating the
# literal, so editing it actually changes the call.
perc_split = 0.2
models.pls_model(perc_split, moldescriptors_featues, Y)

In [None]:
# PLS model on the RDKit molecular descriptors with two extra positional
# arguments (False, 3); their meaning is defined in models.pls_model, not
# visible here. perc_split is passed by name instead of repeating the literal,
# so editing it actually changes the call.
perc_split = 0.2
models.pls_model(perc_split, moldescriptors_featues, Y, False, 3)

In [None]:
# models.rf_model on the RDKit molecular descriptors (presumably a random
# forest -- confirm in models). perc_split is passed by name instead of
# repeating the literal, so editing it actually changes the call.
perc_split = 0.2
models.rf_model(perc_split, moldescriptors_featues, Y)

In [None]:
%reload_ext autoreload
%autoreload 2
import models 

modelshapes = [[64, 32, 16, 8, 4, 2], 
                [64, 32, 16, 8, 4], 
                [64, 32, 16, 8], 
                [64, 32, 16], 
                [64, 32], 
                [64]]


perc_split = 0.2
nepochs=10


models.nndmodel (0.2, np_counts_features, Y, nepochs, modelshapes)