<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# General:
import pandas as pd
import numpy as np
import os
import csv
import subprocess
import time
import shutil
import glob
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
import seaborn as sns
import statistics
import pickle

# SciKit-Optimise:
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.utils import use_named_args

# SVM:
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# RDKit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolfiles, rdMolDescriptors
from rdkit.Chem import SDMolSupplier, Descriptors, Crippen, Lipinski, Fragments
from rdkit import DataStructs

# Misc.:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from scipy import stats
import statistics
import pickle
from mordred import Calculator, descriptors



In [2]:
# Root directory holding the train/test CSV files and the SDF sub-directories
# that the cells below read from.
absolute_dGoffset_path = './absolute_dGoffset/'

# dataset_path = '~/Dropbox/FreeSolv/dGlearn-FreeSolv-master/datasets/train_compiled/dGhydr_train.csv'
# Name of the label column; split_dataset() separates it from the features.
offset_col_name = 'dGoffset (kcal/mol)'

# Data-processing configuration.
# NOTE(review): none of the values in this group are read by the cells visible
# here; the descriptions below restate the original author's notes.
# Fraction of variance that the retained PCA dimensions should explain.
PCA_threshold = 0.95
# Number of replicates per subject model.
replicates = 30
# Number of Bayesian-optimisation iterations for hyperparameter optimisation.
# Author's note: 40 is best for convergence; above 60 becomes very expensive.
n_calls = 40
# Score (MAE/MAD) from which a model is considered top-performing;
# 1.0 would mean no improvement on test-set variance.
startpoint_BO = np.inf
# Number of top-scoring models to retain per fold-dataset combination.
ensemble_size = 10

# KFold parameters (passed to split_dataset()):
n_splits = 5  # Number of K-fold splits
random_state = 2  # Random number seed

# Labels for this run.
# NOTE(review): not referenced in the visible cells; presumably used for
# naming outputs later in the notebook - confirm.
split = 'dG(hydr)'
translated_subject = 'absolute'

In [3]:
# Training set: CSV table plus the directory that holds one SDF per molecule.
train_df_save_loc = f'{absolute_dGoffset_path}train_data.csv'
train_dr = f'{absolute_dGoffset_path}train_dr/'
train_df = pd.read_csv(train_df_save_loc)

# Test set, laid out the same way.
test_df_save_loc = f'{absolute_dGoffset_path}test_data.csv'
test_dr = f'{absolute_dGoffset_path}test_dr/'
test_df = pd.read_csv(test_df_save_loc)

# NOTE(review): read_csv is called without index_col, so the index is the
# default row numbering and the molecule IDs end up in an 'Unnamed: 0' column
# (visible in the K-fold output). Confirm whether test_ID is meant to hold the
# molecule IDs instead.
test_ID = test_df.index

In [4]:
def split_dataset(dataset, n_splits, random_state, label_col=None):
    """Split a pandas DataFrame into shuffled K-fold train/validation sets.

    (https://stackoverflow.com/questions/45115964/separate-pandas-dataframe-using-sklearns-kfold)

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns and the label column.
    n_splits : int
        Number of folds, passed to ``KFold``.
    random_state : int
        Seed passed to ``KFold``; shuffling is always enabled.
    label_col : str, optional
        Name of the label column. When omitted, the notebook-level
        ``offset_col_name`` is used, so existing three-argument calls
        behave exactly as before.

    Returns
    -------
    list
        One entry per fold, laid out as
        ``[[train_set, validate_set], [train_labels, validate_labels]]``.

    Raises
    ------
    KeyError
        If the label column is not present in ``dataset``.
    """
    # Reading a module-level name needs no ``global`` declaration.
    if label_col is None:
        label_col = offset_col_name

    # Fail early with a readable message instead of inside the fold loop.
    if label_col not in dataset.columns:
        raise KeyError(
            "label column {!r} not found in dataset".format(label_col))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Separate features and labels once; each fold then only selects rows.
    labels = dataset[label_col]
    features = dataset.drop(label_col, axis=1)

    kfolds = []
    for train_idx, validate_idx in kf.split(dataset):
        kfolds.append(
            [[features.iloc[train_idx], features.iloc[validate_idx]],
             [labels.iloc[train_idx], labels.iloc[validate_idx]]]
        )

    return kfolds


# Build the K-fold partitions of the training frame; the bare name on the
# last line makes the notebook display the resulting nested list.
kfolds = split_dataset(
    dataset=train_df,
    n_splits=n_splits,
    random_state=random_state,
)
kfolds

[[[         Unnamed: 0        PC1        PC2        PC3        PC4        PC5  \
   0    mobley_6733657  43.444918  18.981579 -10.888037  26.784724  -6.214981   
   2     mobley_628086  -5.297366  11.051943  19.858065  -2.107224  -3.799300   
   4     mobley_172879  19.921569   6.382570 -10.115909  -6.204500  -0.572902   
   5    mobley_2518989  71.839370 -14.092542   5.976715  22.345511 -22.564202   
   6    mobley_2213823 -11.550942  -9.687470  -4.788693   3.788572   0.695097   
   ..              ...        ...        ...        ...        ...        ...   
   594  mobley_4715906  -3.133712 -10.715325  -7.809966   2.780470   0.166830   
   595  mobley_5072416   0.747176   3.171931  -1.982931  -9.086671  -1.714663   
   597  mobley_2068538 -25.070087  -5.713134  -2.307109   6.475840   2.499863   
   598  mobley_4506634  -5.605277  22.814873  17.154333   8.506500  -3.024043   
   600   mobley_210639 -18.787038   3.984362  -4.947199  15.135230   1.940778   
   
              PC6       

In [5]:
def _molecule_ids(frame):
    """Return the molecule IDs (SDF file stems) for the rows of ``frame``."""
    # A CSV read without ``index_col`` keeps the saved IDs in an
    # 'Unnamed: 0' column while the index is only row numbers.
    if 'Unnamed: 0' in frame.columns:
        return frame['Unnamed: 0'].astype(str).tolist()
    return frame.index.astype(str).tolist()


def _sdf_fingerprints(directory, mol_ids):
    """Return one RDKit fingerprint per ID, read from ``directory + ID + '.sdf'``."""
    fingerprints = []
    for mol_id in mol_ids:
        sdf_path = directory + mol_id + '.sdf'
        # One supplier at a time: rebinding the name on the next iteration
        # drops the previous supplier instead of keeping every file open.
        supplier = Chem.SDMolSupplier(sdf_path)
        mol = supplier[0] if len(supplier) else None
        if mol is None:
            raise ValueError('No readable molecule in ' + sdf_path)
        # RDKFingerprint needs the molecule itself, not the supplier.
        fingerprints.append(Chem.RDKFingerprint(mol))
    return fingerprints


def fingerprint_similarity(fold):
    """Compare test-set fingerprints with one fold's train and validation sets.

    Parameters
    ----------
    fold : list
        One entry of the list returned by ``split_dataset``; ``fold[0][0]``
        is the training frame and ``fold[0][1]`` the validation frame.

    Returns
    -------
    tuple of (list, list)
        ``(test_train_similarity, test_valdtn_similarity)``. Each is a flat
        list of ``DataStructs.FingerprintSimilarity`` values, ordered by test
        molecule first and by train (or validation) molecule second.

    Raises
    ------
    ValueError
        If an SDF file contains no readable molecule.
    OSError
        Raised by RDKit when an SDF file cannot be opened.

    Notes
    -----
    Training fingerprints are built from this fold's training rows only.
    Reading every ``*.sdf`` in ``train_dr`` would also pull the fold's
    validation molecules into the training comparison.
    Relies on the notebook-level ``train_dr``, ``test_dr``, ``test_df`` and
    ``test_ID``.
    """
    # retrieve IDs
    train_IDs = _molecule_ids(fold[0][0])
    validate_IDs = _molecule_ids(fold[0][1])
    # test_ID selects the test rows; the IDs are then resolved the same way
    # as for the fold frames.
    test_IDs = _molecule_ids(test_df.loc[test_ID])

    # load molecules and generate fingerprints
    train_fp = _sdf_fingerprints(train_dr, train_IDs)
    valdtn_fp = _sdf_fingerprints(train_dr, validate_IDs)
    test_fp = _sdf_fingerprints(test_dr, test_IDs)

    # compare fingerprints
    test_train_similarity = [DataStructs.FingerprintSimilarity(test_mol, train_mol)
                             for test_mol in test_fp
                             for train_mol in train_fp]

    test_valdtn_similarity = [DataStructs.FingerprintSimilarity(test_mol, valdtn_mol)
                              for test_mol in test_fp
                              for valdtn_mol in valdtn_fp]

    return test_train_similarity, test_valdtn_similarity


# Similarities of the test set against the first fold's training and
# validation molecules.
(test_train, test_valdnt) = fingerprint_similarity(fold=kfolds[0])

OSError: File error: Bad input file ./absolute_dGoffset/train_dr/mobley_7047032.sdf

In [None]:
# Debugging aid: print the SDF file named in the OSError raised by the
# fingerprint cell, to check whether it exists and is readable.
!cat ./absolute_dGoffset/train_dr/mobley_7047032.sdf