## One hot encoding

In [1]:
import numpy as np

# define SMILES characters ----------------------------------------------------
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@',
                'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                't', 'u']
                
# define encoder and decoder --------------------------------------------------
smi2index = dict( (c,i) for i,c in enumerate( SMILES_CHARS ) )
index2smi = dict( (i,c) for i,c in enumerate( SMILES_CHARS ) )

def smiles_encoder( smiles, maxlen=120 ):
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X

def smiles_decoder( X ):
    smi = ''
    X = X.argmax( axis=-1 )
    for i in X:
        smi += index2smi[ i ]
    return smi

# get a taste of caffeine -----------------------------------------------------
caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'

caffeine_encoding = smiles_encoder(caffeine_smiles)

print(caffeine_encoding.shape) # (120, 56)

(120, 56)


In [2]:
caffeine_encoding

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
len(caffeine_smiles)

28

## Word embedding

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('./data/2020/df_smiles.csv')
df.drop(columns = ['Unnamed: 0'], inplace = True)
df.head()

Unnamed: 0,pdbcode,affinity,pocket_smiles
0,3zzf,0.4,CC(C)C[C@@H](C=O)NC(=O)[C@H](CO)NC(=O)[C@@H](N...
1,3gww,0.45,CC(C)C[C@H](N)C(=O)N[C@H](C=O)Cc1ccc(O)cc1.CC(...
2,1w8l,0.49,CC(C)C[C@@H](C=O)NC(=O)[C@@H](N)Cc1c[nH]c2cccc...
3,3fqa,0.49,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](C)...
4,1zsb,0.6,CC(C)C[C@@H](C=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@...


In [6]:
len(df.pocket_smiles[9000])

972

In [7]:
!pip install git+https://github.com/samoturk/mol2vec

Collecting git+https://github.com/samoturk/mol2vec
  Cloning https://github.com/samoturk/mol2vec to c:\users\мария\appdata\local\temp\pip-req-build-bqbnpuuu


  Running command git clone -q https://github.com/samoturk/mol2vec 'C:\Users\Мария\AppData\Local\Temp\pip-req-build-bqbnpuuu'
  fatal: unable to access 'https://github.com/samoturk/mol2vec/': Could not resolve host: github.com
ERROR: Command errored out with exit status 128: git clone -q https://github.com/samoturk/mol2vec 'C:\Users\Мария\AppData\Local\Temp\pip-req-build-bqbnpuuu' Check the logs for full command output.
You should consider upgrading via the 'C:\Users\Мария\Notebooks\course_work\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [9]:
from gensim.models import word2vec

In [10]:
# import rdkit/mol2vec/word2vec -----------------------------------------------
from rdkit import Chem
import mol2vec
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec

import numpy as np
import pandas as pd

#print(">>> create embedding from sentence ... ")
#df['embedding'] = [DfVec(x) for x in sentences2vec(df['sentence'], w2v_model)]

#print(">>> data columns = ", df.columns, "\n")

#df_mol2vec = np.array([x.vec for x in df['embedding']])
#df_mol2vec = pd.DataFrame(df_mol2vec)
#print(">>> df_mol2vec shape = ", df_mol2vec.shape)
#print(df_mol2vec)
#print()

In [11]:
len(df.pocket_smiles[0])

911

In [37]:
mol = Chem.MolFromSmiles(df.pocket_smiles[15])
sentence = MolSentence(mol2alt_sentence(mol, radius=1)) 

In [38]:
type(mol)

rdkit.Chem.rdchem.Mol

In [16]:
w2v_model = word2vec.Word2Vec.load('emb_model/model_300dim.pkl')

In [23]:
sentences2vec(sentence, w2v_model).shape

(724,)

In [21]:
tmp = sentences2vec(sentence, w2v_model)
tmp.shape

(724,)

In [68]:
embedding = DfVec(sentences2vec(sentence, w2v_model))

In [69]:
help(embedding)

Help on DfVec in module mol2vec.features object:

class DfVec(builtins.object)
 |  DfVec(vec)
 |  
 |  Helper class to store vectors in a pandas DataFrame
 |  
 |  Parameters  
 |  ---------- 
 |  vec: np.array
 |  
 |  Methods defined here:
 |  
 |  __init__(self, vec)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __len__(self)
 |  
 |  __repr__ = __str__(self)
 |  
 |  __str__(self)
 |      Return str(self).
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [29]:
print(">>> create mol from smiles ... ")
df['mol'] = df['pocket_smiles'].apply(lambda x: Chem.MolFromSmiles(x)) 

>>> create mol from smiles ... 


[17:06:46] Explicit valence for atom # 96 Mg, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 96 Mg, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 96 Mg, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 60 N, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 55 Ca, 4, is greater than permitted
[17:06:46] Explicit valence for atom # 48 Ca, 4, is greater than permitted
[17:06:46] Explicit valence for atom # 55 Ca, 4, is greater than permitted
[17:06:46] Explicit valence for atom # 97 N, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 342 N, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 119 N, 5, is greater than permitted
[17:06:46] Explicit valence for atom # 55 Ca, 4, is greater than permitted
[17:06:46] Explicit valence for atom # 18 Ca, 4, is greater than permitted
[17:06:46] Explicit valence for atom # 98 N, 5, is greater than permitted
[17:06:46] Explicit valence 

In [30]:
print(">>> create sentence from mol ... ")
df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], radius=1)), axis=1)

>>> create sentence from mol ... 


ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMorganFingerprint(NoneType, int)
did not match C++ signature:
    GetMorganFingerprint(class RDKit::ROMol mol, unsigned int radius, class boost::python::api::object invariants=[], class boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, bool useCounts=True, class boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False)

In [None]:
print(">>> load the word2vec model ... ")
w2v_model = word2vec.Word2Vec.load('model_300dim.pkl')

In [None]:
print(">>> create embedding from sentence ... ")
df['embedding'] = [DfVec(x) for x in sentences2vec(df['sentence'], w2v_model)]

In [None]:
print(">>> data columns = ", df.columns, "\n")

In [None]:
df_mol2vec = np.array([x.vec for x in df['embedding']])
df_mol2vec = pd.DataFrame(df_mol2vec)
print(">>> df_mol2vec shape = ", df_mol2vec.shape)
print(df_mol2vec)
print()