In [1]:
import numpy as np
import pandas as pd
import rdkit
import rdkit.Chem as Chem
from sklearn import metrics
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.base import BaseEstimator, TransformerMixin
from regression_model.processing.errors import InvalidModelInputError

class DropChollinearityVif(BaseEstimator, TransformerMixin):
    """Drop collinear columns using the variance inflation factor (VIF).

    Columns are removed one at a time: on every pass the VIF of each
    remaining column is computed and, if the largest VIF exceeds
    ``threshold``, that column is dropped. The loop stops as soon as no
    remaining column exceeds the threshold (or no column is left).

    statsmodels defines the VIF of column ``i`` as ``1 / (1 - R_i**2)``,
    where ``R_i**2`` is the R-squared of regressing column ``i`` on all
    other columns.

    Parameters
    ----------
    threshold : float, default=10
        Largest VIF a column may have and still be kept. Values between
        5 and 10 are the usual recommendation.
    """

    def __init__(self, threshold=10) -> None:
        self.threshold = threshold

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "DropChollinearityVif":
        """No-op fit, present so the transformer fits into a sklearn pipeline.

        Returns
        -------
        DropChollinearityVif
            ``self``, unchanged.
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` restricted to the columns that survive VIF pruning.

        Progress is reported on stdout: one line per dropped column, then
        the index of the remaining columns.

        Parameters
        ----------
        X : pd.DataFrame
            Numeric feature table; every column is a candidate for removal.

        Returns
        -------
        pd.DataFrame
            ``X.iloc[:, kept]`` where ``kept`` are the positions of the
            columns that were not dropped, in their original order.
        """
        # Imported here so the class does not depend on another notebook
        # cell having imported the name first.
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        variables = list(range(X.shape[1]))

        # `while variables` guards the empty case: max() of an empty VIF
        # list would otherwise raise ValueError.
        while variables:
            # Build the design matrix once per pass, not once per column.
            values = X.iloc[:, variables].values
            vif = [variance_inflation_factor(values, ix)
                   for ix in range(len(variables))]

            max_vif = max(vif)
            # Written as `not (a > b)` so that a NaN maximum ends the loop,
            # exactly like a maximum at or below the threshold.
            if not max_vif > self.threshold:
                break

            maxloc = vif.index(max_vif)
            # str() keeps the message working for non-string column labels.
            print('dropping \'' + str(X.columns[variables[maxloc]]) +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc]

        print('Remaining variables:')
        print(X.columns[variables])
        return X.iloc[:, variables]

In [3]:
import preprocess as pp
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Example molecules, written as SMILES strings, used as input for the demo.
smiles = [
    "C1CCCCC1",
    "C[C@H](N)C(=O)O",
    "O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5",
    "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2",
    "O=Cc1ccc(O)c(OC)c1",
]

# Turn the SMILES strings into a feature table with the project's converter
# (presumably RDKit molecular descriptors, given mode="rdkit" - confirm in
# preprocess.SmilestoDescriptors).
conv = pp.SmilestoDescriptors(mode="rdkit")
X = conv.fit_transform(smiles)

# Bare last expression so the notebook renders the resulting table.
X

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,1.5,1.5,1.5,1.5,0.422316,...,0,0,0,0,0
1,9.574074,-0.962963,9.574074,0.731481,0.451352,...,0,0,0,0,0
2,12.308708,-0.586298,12.308708,0.128046,0.752046,...,0,0,0,0,0
3,12.064817,-1.511854,12.064817,0.036759,0.422367,...,0,0,0,0,0
4,10.248745,0.039907,10.248745,0.039907,0.647744,...,0,0,0,0,0


In [6]:
# Build the VIF-based column filter; columns whose VIF exceeds 10 are dropped.
conveter = DropChollinearityVif(threshold=10)
# fit() is a no-op, so this effectively runs transform(X); the returned
# frame is the cell's displayed output and is not stored in a variable.
# NOTE(review): X has only 5 rows but far more columns, so the VIF values
# are likely degenerate (inf/NaN) - confirm the pruning result is meaningful.
conveter.fit_transform(X)

dropping 'MaxEStateIndex' at index: 0
dropping 'MinEStateIndex' at index: 0
dropping 'MaxAbsEStateIndex' at index: 0
dropping 'MinAbsEStateIndex' at index: 0
dropping 'qed' at index: 0
dropping 'MolWt' at index: 0
dropping 'HeavyAtomMolWt' at index: 0
dropping 'ExactMolWt' at index: 0
dropping 'NumValenceElectrons' at index: 0
Remaining variables:
Index(['NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge',
       'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
       'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=191)


Unnamed: 0,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,...,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,-0.053306,-0.053306,0.053306,0.053306,...,0,0,0,0,0
1,0,0.319678,-0.480094,0.480094,0.319678,...,0,0,0,0,0
2,0,0.347013,-0.495835,0.495835,0.347013,...,0,0,0,0,0
3,0,0.338772,-0.504138,0.504138,0.338772,...,0,0,0,0,0
4,0,0.160689,-0.504254,0.504254,0.160689,...,0,0,0,0,0
