# Experiments with custom kernels

In [1]:
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from pandas import Series, DataFrame
from sklearn.decomposition import PCA
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from IPython.core.pylabtools import figsize

# Print floating-point array entries with two decimals.
np.set_printoptions(formatter={'float': '{: 0.2f}'.format})

# Render matplotlib figures inside the notebook and set the default figure size.
%matplotlib inline
figsize(16, 8)

# Load the Data and get fingerprints for the molecules

In [2]:
# Read both tables; keep_default_na=False keeps empty cells as '' instead of NaN,
# which the Annotation filter below relies on.
df_train = pd.read_csv('../data/TableS2_training_pubchem.csv', keep_default_na=False)
df_TS3 = pd.read_csv('../data/TableS3_training_pubchem.csv', keep_default_na=False)

# Keep only training rows without an annotation. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write to
# a slice of the original (which triggers SettingWithCopyWarning).
df_train = df_train[df_train['Annotation'] == ''].copy()

# Rescale both RRF columns by 1e-2 (presumably percent -> fraction; confirm
# against the data source).
df_train['RRF_Measured'] = df_train['RRF_Measured'] * 1e-2
df_train['RRF_Predicted'] = df_train['RRF_Predicted'] * 1e-2

df_TS3['RRF_Measured'] = df_TS3['RRF_Measured'] * 1e-2
df_TS3['RRF_Predicted'] = df_TS3['RRF_Predicted'] * 1e-2

In [3]:
# Columns used by the rest of the notebook; shared so both tables stay in sync.
KEEP_COLUMNS = ['Name', 'Short Name', 'InChI', 'RRF_Measured', 'RRF_Predicted']

# .copy() gives independent frames: later cells add columns to these tables,
# and assigning into a column-subset selection raises SettingWithCopyWarning.
df_train = df_train[KEEP_COLUMNS].copy()
df_TS3 = df_TS3[KEEP_COLUMNS].copy()

In [4]:
# Add two columns to each table in place: 'mol' (molecule parsed from the
# InChI string) and 'fp' (fingerprint computed from that molecule).
# NOTE(review): FingerprintMol is called with its defaults; if those produce
# fingerprints of differing lengths per molecule, the pairwise similarities
# used later may not form a valid kernel -- confirm against the RDKit docs.
for df in (df_train, df_TS3):
    df['mol'] = df['InChI'].apply(Chem.MolFromInchi)
    df['fp'] = df['mol'].apply(FingerprintMols.FingerprintMol)

# Preview the Data

In [5]:
# Sanity check: a fingerprint compared with itself (row labelled 3).
fp = df_train.loc[3, 'fp']
DataStructs.FingerprintSimilarity(fp, fp)

1.0

In [6]:
# Single-column frames: fingerprints as inputs, measured RRF as the target.
X_train = df_train.loc[:, ['fp']]
y_train = df_train.loc[:, ['RRF_Measured']]

# Test inputs come from the TS3 table.
X_test = df_TS3.loc[:, ['fp']]

# Kernel

In [7]:
# Experimental kernel
def kernel(a, b, similarity=None):
    """Build the pairwise similarity (Gram) matrix between two fingerprint tables.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Tables with an 'fp' column holding one fingerprint per row. Other
        columns are ignored.
    similarity : callable, optional
        Function ``similarity(fp_a, fp_b) -> float``. Defaults to
        ``DataStructs.FingerprintSimilarity``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(a), len(b))`` where entry ``[r, c]`` is the
        similarity between row ``r`` of ``a`` and row ``c`` of ``b``.
    """
    if similarity is None:
        # Resolved at call time so the default stays the RDKit similarity.
        similarity = DataStructs.FingerprintSimilarity

    # Pull the column out once instead of doing iloc[r]['fp'] for every pair.
    fps_a = list(a['fp'])
    fps_b = list(b['fp'])

    # Size by row count: DataFrame.size is rows * columns, which overruns
    # iloc as soon as a frame carries more than the 'fp' column.
    K = np.zeros(shape=(len(fps_a), len(fps_b)))

    for r, fp_a in enumerate(fps_a):
        for c, fp_b in enumerate(fps_b):
            K[r, c] = similarity(fp_a, fp_b)

    return K

In [8]:
# Gram matrix of the training fingerprints, plus a DataFrame view for display.
K = kernel(X_train, X_train)
k_df = pd.DataFrame(K)

In [9]:
# Display the training Gram matrix (pairwise fingerprint similarities).
k_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1.0,0.336538,0.310345,0.333333,0.204082,0.173913,0.327273,0.330189,0.252874,0.303226,...,0.370192,0.304569,0.363636,0.347032,0.37551,0.241379,0.383333,0.333333,0.372549,0.375
1,0.336538,1.0,0.384615,0.426087,0.6375,0.782051,0.451613,0.746479,0.438095,0.382353,...,0.479339,0.470085,0.476562,0.475806,0.476562,0.584158,0.46875,0.467213,0.476562,0.476562
2,0.310345,0.384615,1.0,0.548544,0.238372,0.229508,0.263636,0.389831,0.411111,0.344633,...,0.243077,0.176667,0.310078,0.25,0.31068,0.333333,0.27204,0.249231,0.306911,0.304
3,0.333333,0.426087,0.548544,1.0,0.234637,0.252688,0.29052,0.431034,0.463687,0.389831,...,0.274143,0.193333,0.333333,0.295181,0.335784,0.368421,0.311054,0.268519,0.311741,0.308765
4,0.204082,0.6375,0.238372,0.234637,1.0,0.68932,0.276498,0.60241,0.237805,0.230263,...,0.336634,0.367232,0.317992,0.346154,0.314286,0.446043,0.320833,0.30622,0.313725,0.316406
5,0.173913,0.782051,0.229508,0.252688,0.68932,1.0,0.313636,0.639535,0.279762,0.24375,...,0.362319,0.380435,0.3625,0.390476,0.352227,0.458904,0.370833,0.357143,0.360784,0.363281
6,0.327273,0.451613,0.263636,0.29052,0.276498,0.313636,1.0,0.467742,0.400881,0.398148,...,0.32376,0.314121,0.408776,0.346939,0.438479,0.39823,0.420561,0.315245,0.504032,0.503984
7,0.330189,0.746479,0.389831,0.431034,0.60241,0.639535,0.467742,1.0,0.471154,0.444444,...,0.508333,0.513043,0.492188,0.491935,0.492188,0.636364,0.496063,0.508333,0.492188,0.492188
8,0.252874,0.438095,0.411111,0.463687,0.237805,0.279762,0.400881,0.471154,1.0,0.407407,...,0.466981,0.40099,0.495798,0.452489,0.487705,0.359551,0.497908,0.46729,0.47451,0.476562
9,0.303226,0.382353,0.344633,0.389831,0.230263,0.24375,0.398148,0.444444,0.407407,1.0,...,0.560847,0.341709,0.422594,0.532663,0.439834,0.321637,0.449153,0.51269,0.417323,0.414062


In [10]:
# Eigenvalues of the training Gram matrix K. Any negative value means K is
# not positive semi-definite, so it cannot be Cholesky-factorised as is.
np.linalg.eigvals(K)

array([ 15.47,  2.94,  1.99,  1.61,  1.21,  1.07, -0.22,  0.99,  0.94,
        0.86,  0.72,  0.69,  0.66,  0.62,  0.04,  0.06,  0.08,  0.10,
        0.54,  0.15,  0.16,  0.51,  0.47,  0.47,  0.44,  0.42,  0.40,
        0.38,  0.37,  0.22,  0.22,  0.25,  0.26,  0.27,  0.32,  0.31])

**K is not positive definite: one of its eigenvalues is negative (-0.22).**

In [11]:
# Check that K equals its transpose exactly (element-wise, no tolerance).
np.array_equal(K.T, K)

True

# Predictions

In [12]:
# Cholesky factor of the training Gram matrix.
# NOTE(review): this raises LinAlgError for the current K (it has a negative
# eigenvalue), so L is never defined and the cells that use it cannot run.
L = np.linalg.cholesky(K)

LinAlgError: Matrix is not positive definite

In [None]:
# Compute the mean at our test points.
# The Cholesky factor L was built from X_train and the targets live in
# y_train; the bare names X and y are never defined in this notebook.
# Requires L from the Cholesky cell.
Lk = np.linalg.solve(L, kernel(X_train, X_test))
# .values hands NumPy a plain (n_train, 1) array, so mu is (n_test, 1).
mu = np.dot(Lk.T, np.linalg.solve(L, y_train.values))

In [None]:
# Variance at the test points: the diagonal of the test Gram matrix minus the
# column-wise sum of squares of Lk; s is its element-wise square root.
K_ = kernel(X_test, X_test)
s2 = np.diag(K_) - (Lk ** 2).sum(axis=0)
s = np.sqrt(s2)