In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors

In [6]:
# Load the aggregated dataset. Forward slashes are used because the previous
# backslash-separated literal contained "\p" and "\s", which are invalid string
# escapes and only resolve as a path on Windows; forward slashes work on
# Windows and POSIX alike.
base_data = pd.read_csv("data/processed/smiles_aggregated.csv")
base_data.head()

Unnamed: 0,mol,CID,Class,Model,pIC50,MW,...,PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),canvasUID
0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,BACE_1,1,Train,9.154901,431.56979,...,37.133846,0.0,7.98017,0.0,0.0,1
1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,BACE_2,1,Train,8.853872,657.81073,...,34.923889,7.98017,24.148668,0.0,24.663788,2
2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_3,1,Train,8.69897,591.74091,...,23.654478,0.230159,15.87979,0.0,24.663788,3
3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,BACE_4,1,Train,8.69897,591.67828,...,36.498634,0.980913,8.188327,0.0,26.385181,4
4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_5,1,Train,8.69897,629.71283,...,23.654478,0.230159,15.87979,0.0,26.100143,5


In [7]:
# Narrow the table to the two columns used downstream: the SMILES string
# ("mol") and the label ("Class").
input_df = base_data.loc[:, ["mol", "Class"]]
input_df.head()

Unnamed: 0,mol,Class
0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,1
1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,1
2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,1
3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,1
4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,1


In [8]:
# Remove incomplete rows first, then keep one row per SMILES string.
# The order matters: de-duplicating first could keep an incomplete row, discard
# its complete duplicate, and then lose the molecule entirely to dropna().
# reset_index(drop=True) leaves a gap-free 0..n-1 index, so frames built later
# from plain arrays line up with this one by label as well as by position.
input_df = (
    input_df
    .dropna()
    .drop_duplicates(subset=['mol'])
    .reset_index(drop=True)
)

In [9]:
# Parse every SMILES string into an RDKit molecule.
parsed_mols = input_df['mol'].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse; fail here with
# a readable message instead of an obscure error inside the descriptor step.
unparsed = parsed_mols.isna()
if unparsed.any():
    raise ValueError(
        f"RDKit could not parse {int(unparsed.sum())} SMILES string(s), e.g. "
        f"{input_df.loc[unparsed, 'mol'].head().tolist()}"
    )

# assign() builds a new frame rather than writing into a possibly sliced one.
input_df = input_df.assign(mol_from_smiles=parsed_mols)

y = input_df["Class"]

# NOTE(review): ignore_3D=False requests 3D descriptors, but no conformer
# generation is visible in this notebook -- confirm they yield usable values.
calc = Calculator(descriptors, ignore_3D=False)
# Slow step: the recorded run took about 9 minutes for 1510 molecules.
X_mordred = calc.pandas(input_df['mol_from_smiles'], nproc=1)
# Keep only numeric descriptor columns.
X_mordred = X_mordred.select_dtypes(['number'])
# Min-max normalise each column to [0, 1]. A constant column becomes 0/0 = NaN
# here; its std is then NaN, which fails the `> 0.01` test below, so it is dropped.
X_mordred = (X_mordred-X_mordred.min())/(X_mordred.max()-X_mordred.min())
# Drop columns with a low standard deviation (near-constant descriptors).
X_mordred = X_mordred.loc[:,X_mordred.std()>0.01]


  0%|          | 0/1510 [00:00<?, ?it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
100%|██████████| 1510/1510 [08:54<00:00,  2.83it/s]


In [10]:
# Morgan fingerprints: radius 2, folded into a 2048-wide vector per molecule.
mfpgen = AllChem.GetMorganGenerator(radius=2,fpSize=2048)
X_morgan = np.array([mfpgen.GetFingerprintAsNumPy(x) for x in input_df["mol_from_smiles"]])
# Reuse input_df's index so X_morgan stays label-aligned with `y` (taken from
# input_df); a fresh 0..n-1 index would only match by position once rows have
# been dropped from input_df.
X_morgan = pd.DataFrame(
    X_morgan,
    index=input_df.index,
    columns=['mfp'+str(i) for i in range(X_morgan.shape[1])],
)
X_morgan.head()

Unnamed: 0,mfp0,mfp1,mfp2,mfp3,mfp4,mfp5,...,mfp2042,mfp2043,mfp2044,mfp2045,mfp2046,mfp2047
0,0,1,0,0,0,0,...,0,0,0,0,0,0
1,0,1,0,0,0,0,...,0,0,0,0,0,0
2,1,1,0,0,0,0,...,0,0,0,0,0,0
3,0,1,0,0,0,1,...,0,0,0,0,0,0
4,1,1,0,0,0,0,...,0,0,0,0,0,0


In [11]:
def train_GBT(X, y, n_components=100, random_state=42):
    """Fit a scaler -> PCA -> grid-searched gradient-boosting pipeline.

    The data is split 60/20/20 into train / validation / test partitions.
    The pipeline is fitted on the train partition only; the grid search runs
    its own 4-fold cross-validation inside that partition. The accuracy on the
    test partition is printed. The validation partition is not used.

    Parameters
    ----------
    X : table of features (e.g. a DataFrame), one row per sample.
    y : class labels, positionally aligned with the rows of ``X``.
    n_components : int, default=100
        Number of principal components kept by the PCA step. PCA rejects
        values larger than min(n_train_samples, n_features).
    random_state : int or None, default=42
        Seed used for both splits, the PCA step and the gradient-boosting
        estimator, so repeated runs give the same result.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline; its last step is the fitted ``GridSearchCV``,
        refit on the whole train partition with the best parameters.
    """
    # 60 % train, 40 % held out ...
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )
    # ... and the held-out part is halved. Only the second half (test) is
    # scored; the first half (validation) is discarded, but the split is kept
    # so the test partition is the same one as before.
    _, X_test, _, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state
    )

    # NOTE(review): the grid search is the last pipeline step, so the scaler
    # and PCA are fitted once on the whole train partition before
    # cross-validation; the CV folds therefore share preprocessing statistics.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components, random_state=random_state)),
        ('grid', GridSearchCV(GradientBoostingClassifier(random_state=random_state),
                                 param_grid={'n_estimators': [10, 100, 1000], 'learning_rate': [0.1,0.5,1.0,2.0]},
                                 cv=4,
                                 refit=True))
        ])
    model.fit(X_train, y_train)
    # Mean accuracy on the untouched test partition.
    print(model.score(X_test, y_test))
    return model

In [12]:
# Train and score the pipeline on the Mordred descriptor table.
model = train_GBT(X=X_mordred, y=y)

0.8079470198675497


In [13]:
# Train and score the pipeline on the Morgan fingerprint table.
# This rebinds `model`, replacing the pipeline trained on Mordred descriptors.
model = train_GBT(X=X_morgan, y=y)

0.8311258278145696


In [14]:
import pickle

# Persist the most recently trained pipeline (`model`) to disk.
# The `with` block closes the file handle even if pickling fails.
# Only unpickle this file from a trusted source: pickle.load can run
# arbitrary code.
filename = 'model_GBT_pipeline.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)