## Smiles

In [4]:
import sys
import numpy as np
import pandas as pd
from cdk_pywrapper.cdk_pywrapper import Compound
from scipy import spatial
from io import StringIO 

class Capturing(list):
  """List that collects, line by line, everything printed inside its ``with`` block.

  Entering the block swaps ``sys.stdout`` for an in-memory text buffer.
  Leaving it appends the buffered text (split into lines) to this list,
  drops the buffer and reinstates the stream that was active on entry.
  """

  def __enter__(self):
    # Remember the active stream so __exit__ can put it back.
    self._stdout = sys.stdout
    buffer = StringIO()
    sys.stdout = buffer
    self._stringio = buffer
    return self

  def __exit__(self, *args):
    printed_text = self._stringio.getvalue()
    for line in printed_text.splitlines():
      self.append(line)
    del self._stringio
    sys.stdout = self._stdout

class SmilesCharacteristics:
  """Descriptors and pairwise fingerprint similarity for a set of SMILES strings.

  Attributes populated by the constructor:
    data: dict keyed by the position of each SMILES in the input. Each value
      holds the isomeric/unique/absolute/generic SMILES, the InChI key, the
      InChI and the sorted integer array of set fingerprint bit indices.
    mapFinger: float array of shape (len(Smiles), n_bits); entry [i, b] is 1
      when bit ``b`` is set in the fingerprint of compound ``i``, else 0.
    simil: float array of shape (len(Smiles), len(Smiles)) with the cosine
      similarity between the rows of ``mapFinger``.
  """

  def __init__(self, Smiles, saveMapFinger=False, saveSimil=False):
    """Compute descriptors, the fingerprint matrix and the similarity matrix.

    Args:
      Smiles: sized iterable of SMILES strings (``len`` is taken of it).
      saveMapFinger: falsy to skip. ``True`` writes ``mapFinger`` as CSV to
        'mapFinger.csv' in the working directory; a string or path-like
        value writes it to that path instead.
      saveSimil: same convention as ``saveMapFinger`` for ``simil``, with
        'simil.csv' as the default file name.
    """
    self.data = {}
    for n, smile in enumerate(Smiles):
      cmpnd = Compound(compound_string=smile, identifier_type='smiles')
      # The bit set is recovered from what the call prints to stdout, not
      # from its return value.
      with Capturing() as captured:
        cmpnd.get_fingerprint().asBitSet()
      # NOTE(review): eval() runs on captured stdout. The second captured
      # line is presumably the printed bit set (e.g. "{1, 5, 9}") - confirm
      # against cdk_pywrapper and consider ast.literal_eval if so.
      bits = list(eval(captured[1]))
      self.data[n] = {
        'smiles_isomeric': cmpnd.get_smiles(smiles_type='isomeric'),
        'smiles_unique': cmpnd.get_smiles(smiles_type='unique'),
        'smiles_absolute': cmpnd.get_smiles(smiles_type='absolute'),
        'smiles_generic': cmpnd.get_smiles(smiles_type='generic'),
        'inchi_key': cmpnd.get_inchi_key(),
        'inchi': cmpnd.get_inchi(),
        # dtype=int keeps an empty fingerprint usable as an index array
        # (np.array([]) would default to float64).
        'fingerprint': np.array(sorted(bits), dtype=int),
        }
    self.metrics(Smiles)
    # Previously these flags only built a DataFrame and discarded it.
    if saveMapFinger:
      self._save_matrix(self.mapFinger, saveMapFinger, 'mapFinger.csv')
    if saveSimil:
      self._save_matrix(self.simil, saveSimil, 'simil.csv')

  @staticmethod
  def _save_matrix(matrix, target, default_name):
    """Write ``matrix`` as CSV to ``target`` if it is a path, else to ``default_name``."""
    is_path = isinstance(target, str) or hasattr(target, '__fspath__')
    pd.DataFrame(matrix).to_csv(target if is_path else default_name)

  def metrics(self, Smiles, n_bits=1024):
    """Fill ``self.mapFinger`` and ``self.simil`` from ``self.data``.

    Args:
      Smiles: only its length is used; ``self.data`` must hold the keys
        ``0 .. len(Smiles) - 1``.
      n_bits: number of columns of ``mapFinger``; every fingerprint bit
        index must be smaller than this. Defaults to 1024.

    Rows without any set bit give NaN similarities (0/0), as the pairwise
    cosine computation did before.
    """
    count = len(Smiles)
    self.mapFinger = np.zeros([count, n_bits])
    for n in range(count):
      # Coerce to int so an empty (float-typed) fingerprint is a valid index.
      bits = np.asarray(self.data[n]['fingerprint'], dtype=int)
      self.mapFinger[n, bits] = 1
    # Cosine similarity for all pairs at once: u.v / sqrt((u.u) * (v.v)).
    dots = self.mapFinger.dot(self.mapFinger.T)
    squared_norms = dots.diagonal()
    with np.errstate(divide='ignore', invalid='ignore'):
      self.simil = dots / np.sqrt(np.outer(squared_norms, squared_norms))

# Load the matched MIBiG/GNPS table, report its (rows, columns) shape and
# show the first five rows as the cell output.
map2 = pd.read_csv(filepath_or_buffer='/proposal/data/matched_mibig_gnps_update.csv')
print(map2.shape)
map2.head(n=5)

In [None]:
# Build descriptors for every SMILES in map2['mibig_smiles'].
# Entries are keyed by the row position in that column, so a row that cannot
# be processed leaves a gap in the keys of data_mibig; such rows are recorded
# in failed_mibig instead of being dropped without a trace.
data_mibig = {}
failed_mibig = {}  # row position -> exception raised while processing that row
for n, smile in enumerate(map2['mibig_smiles']):
  try:
    cmpnd = Compound(compound_string=smile, identifier_type='smiles')
    # The bit set is recovered from what the call prints to stdout.
    with Capturing() as fingerprint:
      cmpnd.get_fingerprint().asBitSet()
    # NOTE(review): eval() runs on captured stdout. The second captured line
    # is presumably the printed bit set (e.g. "{1, 5, 9}") - confirm against
    # cdk_pywrapper and consider ast.literal_eval if so.
    fingerprint = list(eval(fingerprint[1]))
    data_mibig[n]={
      'smiles_isomeric':cmpnd.get_smiles(smiles_type='isomeric'),
      'smiles_unique':cmpnd.get_smiles(smiles_type='unique'),
      'smiles_absolute':cmpnd.get_smiles(smiles_type='absolute'),
      'smiles_generic':cmpnd.get_smiles(smiles_type='generic'),
      'inchi_key':cmpnd.get_inchi_key(),
      'inchi':cmpnd.get_inchi(),
      # dtype=int keeps an empty fingerprint usable as an index array.
      'fingerprint':np.array(sorted(fingerprint), dtype=int),
      }
  except Exception as err:
    # Best effort: skip rows that fail, but keep the reason. Catching
    # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
    failed_mibig[n] = err
if failed_mibig:
  print('skipped {} SMILES that could not be processed (see failed_mibig)'.format(len(failed_mibig)))
len(data_mibig)

In [None]:
# Pairwise cosine similarity between the fingerprints stored in data_mibig,
# written to 'matrix.csv'.
# data_mibig is keyed by row position and may have gaps (rows that were
# skipped), so iterate over the keys that exist instead of range(len(...)),
# which raised KeyError on a missing key.
keys_mibig = sorted(data_mibig)
mapFinger_mibig = np.zeros([len(keys_mibig), 1024])
for row, key in enumerate(keys_mibig):
  # Coerce to int so an empty (float-typed) fingerprint is a valid index.
  mapFinger_mibig[row, np.asarray(data_mibig[key]['fingerprint'], dtype=int)] = 1

# Cosine similarity for all pairs at once: u.v / sqrt((u.u) * (v.v)).
# Rows without any set bit give NaN (0/0), as the pairwise computation did.
dots_mibig = mapFinger_mibig.dot(mapFinger_mibig.T)
squared_norms_mibig = dots_mibig.diagonal()
with np.errstate(divide='ignore', invalid='ignore'):
  simil_mibig = dots_mibig / np.sqrt(np.outer(squared_norms_mibig, squared_norms_mibig))

# Label rows/columns with the data_mibig keys; without gaps these are
# 0..n-1, i.e. the same labels a default DataFrame index would produce.
df_mibig = pd.DataFrame(simil_mibig, index=keys_mibig, columns=keys_mibig)
df_mibig.to_csv('matrix.csv')