In [33]:
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pubchempy
import seaborn as sns
from IPython.core.pylabtools import figsize
from pandas import Series, DataFrame
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set the default figure size for subsequent plots to 16 x 8 (width, height).
figsize(16, 8)

# Load data

In [34]:
# Load the training table. keep_default_na=False stops pandas from turning its
# default NA markers (e.g. 'NA' or empty fields) into NaN, so such cells are
# kept as plain strings.
df = pd.read_csv('../data/TableS2_training.csv', keep_default_na=False)

# Derive the InChIKey of every row from its InChI string. The RDKit function
# is handed to apply() directly; wrapping it in a lambda adds nothing.
df['InChI_Key'] = df['InChI'].apply(Chem.inchi.InchiToInchiKey)

In [35]:
# Pick the two metabolites to compare: the rows at positions 5 and 7.
metabolite1, metabolite2 = df.iloc[5], df.iloc[7]

In [36]:
# Display the name and the measured / predicted RRF columns for rows 5 and 7.
df.iloc[[5, 7]][['Name', 'RRF_Measured', 'RRF_Predicted']]

Unnamed: 0,Name,RRF_Measured,RRF_Predicted
5,L-Valine,0.188,0.263
7,Nicotinamide,0.322,0.755


In [37]:
# Parse each selected metabolite's InChI string into an RDKit molecule.
mol1 = Chem.MolFromInchi(metabolite1['InChI'])
mol2 = Chem.MolFromInchi(metabolite2['InChI'])

# MolFromInchi reports a parse failure by returning None rather than raising.
# Stop here with a clear message instead of failing later, inside the
# fingerprint calls, with a much less obvious error.
assert mol1 is not None, 'RDKit could not parse the InChI of metabolite1'
assert mol2 is not None, 'RDKit could not parse the InChI of metabolite2'

In [38]:
# Morgan fingerprints for both molecules, using radius 2 and useFeatures=True.
fp1, fp2 = (
    AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
    for mol in (mol1, mol2)
)

# Morgan Fingerprints (Circular Fingerprints)

## Dice Similarity

In [39]:
# Dice similarity between the two Morgan fingerprints fp1 and fp2.
DataStructs.DiceSimilarity(fp1,fp2)

0.09090909090909091

## Tanimoto Similarity

In [40]:
# Tanimoto similarity between the two Morgan fingerprints fp1 and fp2.
DataStructs.TanimotoSimilarity(fp1, fp2)

0.047619047619047616

# Topological fingerprints

In [41]:
from rdkit.Chem.Fingerprints import FingerprintMols

# Fingerprint both molecules with FingerprintMols.FingerprintMol, using its
# default settings.
f1, f2 = (FingerprintMols.FingerprintMol(mol) for mol in (mol1, mol2))

## Tanimoto Similarity on Topological Fingerprints

In [42]:
# Similarity of the two FingerprintMols fingerprints f1 and f2, scored with the
# Tanimoto metric (passed explicitly via `metric`).
DataStructs.FingerprintSimilarity(f1,f2, metric=DataStructs.TanimotoSimilarity)

0.38461538461538464