In [78]:
# Mount Google Drive into the Colab runtime; the data and model files used
# later in this notebook are read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install git+https://github.com/samoturk/mol2vec

In [None]:
!pip install rdkit-pypi



In [None]:
!pip install gensim



In [None]:
from rdkit import Chem
import mol2vec
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec
from gensim.models import word2vec

import numpy as np
import pandas as pd

In [79]:
# Read the SMILES table and discard the 'Unnamed: 0' column that comes with
# the CSV, leaving pdbcode / affinity / pocket_smiles.
df = (
    pd.read_csv('/content/drive/MyDrive/Course_work/notebooks/data/df_smiles.csv')
    .drop(columns=['Unnamed: 0'])
)
df.head()

Unnamed: 0,pdbcode,affinity,pocket_smiles
0,3zzf,0.4,CC(C)C[C@@H](C=O)NC(=O)[C@H](CO)NC(=O)[C@@H](N...
1,3gww,0.45,CC(C)C[C@H](N)C(=O)N[C@H](C=O)Cc1ccc(O)cc1.CC(...
2,1w8l,0.49,CC(C)C[C@@H](C=O)NC(=O)[C@@H](N)Cc1c[nH]c2cccc...
3,3fqa,0.49,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](C)...
4,1zsb,0.6,CC(C)C[C@@H](C=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@...


In [None]:
# Parse every pocket SMILES string with RDKit, keeping one entry per row of
# `df` (in the same order) so positions can be mapped back to the frame.
mol_list = list(map(Chem.MolFromSmiles, df.pocket_smiles))
len(mol_list)

19413

In [None]:
# Build one MolSentence (mol2alt_sentence with radius=1) per usable molecule.
# Entries of `mol_list` that are not an RDKit Mol (e.g. SMILES that could not
# be parsed) get no sentence; their positions are collected in
# `to_delete_list` so the matching rows can be removed from `df` later.
sentence_list = []
to_delete_list = []

for i, mol in enumerate(mol_list):
  if isinstance(mol, Chem.rdchem.Mol):
    sentence_list.append(MolSentence(mol2alt_sentence(mol, radius=1)))
  else:
    to_delete_list.append(i)

len(to_delete_list)

778

In [None]:
# Load the word2vec model from Drive (presumably the 300-dimensional Mol2vec
# model, going by the file name — TODO confirm).
# NOTE(review): gensim's load() unpickles the file, so only load model files
# that come from a trusted source.
w2v_model = word2vec.Word2Vec.load('/content/drive/MyDrive/Course_work/notebooks/data/model_300dim.pkl')

In [None]:
def sentences2vec(sentences, model, unseen=None):
    """Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
    sum of vectors for individual words.

    Parameters
    ----------
    sentences : list, array
        Iterable of sentences; every sentence is itself an iterable of words.
    model : word2vec.Word2Vec
        Gensim word2vec model. Both vocabulary APIs are supported:
        ``model.wv.key_to_index`` (gensim >= 4) and ``model.wv.vocab``
        (gensim < 4).
    unseen : None, str
        Keyword for unseen words. If None (or empty), those words are skipped;
        otherwise every out-of-vocabulary word is replaced by the vector of
        this keyword.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        One row per sentence. A sentence that contributes no word vector at
        all (empty, or made only of skipped words) yields an all-zero row
        instead of the scalar ``0``, so the rows always have the same length.
    """
    wv = model.wv
    # gensim 4 removed ``wv.vocab`` in favour of ``wv.key_to_index``; fall back
    # to the old attribute only when the new one is absent.
    vocab = getattr(wv, "key_to_index", None)
    if vocab is None:
        vocab = wv.vocab
    keys = set(vocab)

    vec = []
    if unseen:
        unseen_vec = wv.get_vector(unseen)

    for sentence in sentences:
        if unseen:
            word_vectors = [wv.get_vector(word) if word in keys else unseen_vec
                            for word in sentence]
        else:
            word_vectors = [wv.get_vector(word) for word in sentence
                            if word in keys]

        if word_vectors:
            vec.append(sum(word_vectors))
        else:
            # sum([]) would be the int 0 and make the result array ragged.
            vec.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(vec)

In [None]:
# Sum the word vectors of every molecule sentence into one vector per molecule.
s2v = sentences2vec(sentences=pd.Series(sentence_list), model=w2v_model)

In [None]:
# Wrap each summed sentence vector in a DfVec, in the same order as `s2v`.
# (No per-row progress printing: it only flooded the cell output.)
embedding_list = [DfVec(x) for x in s2v]

In [None]:
# Pull the raw vector out of every DfVec and stack them into a 2-D array,
# one row per molecule.
embedding_vectors = np.array([embedding.vec for embedding in embedding_list])

# One column per embedding dimension, default integer column labels.
df_mol2vec = pd.DataFrame(data=embedding_vectors)
df_mol2vec.shape

(18635, 300)

In [None]:
# Preview the first rows of the embedding table (features only at this point).
df_mol2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-33.215519,-4.911911,-58.069412,-21.501129,86.036827,-48.711243,-163.092056,-56.392216,177.129028,-0.203314,...,-2.319795,88.283302,118.411171,87.596359,-135.841919,-56.251205,12.59845,-148.720886,-178.943771,-31.27887
1,-49.548927,-5.533656,-78.656738,11.228692,96.537285,-45.226391,-198.04187,-50.896503,194.372955,13.70278,...,-29.208727,112.017204,160.358704,97.121597,-164.787857,-62.422604,18.535614,-147.918762,-208.952576,-46.084557
2,-23.400223,-7.932979,-35.621426,-2.120256,59.704235,-22.519524,-114.470078,-34.282997,111.387978,7.034303,...,-12.122244,69.097427,94.777184,61.178638,-83.110741,-31.502813,7.987552,-95.268074,-120.7575,-28.159447
3,-26.764908,-7.9374,-49.278858,-9.648705,69.367477,-31.425886,-146.403458,-37.342216,129.867706,14.527238,...,-16.427969,85.094315,105.859848,69.64447,-126.765694,-49.138592,1.389015,-115.341492,-158.340179,-35.511227
4,-33.097664,-24.753981,-71.645752,17.423645,76.952316,-45.259354,-184.551147,-47.502502,176.079025,4.592343,...,-17.541918,123.08963,142.476715,75.984627,-151.256699,-62.206776,0.313482,-147.893433,-202.49968,-42.562164


In [85]:
# Remove the rows whose molecule produced no sentence so that `df` lines up
# row-for-row with the embeddings. `to_delete_list` holds positions in the
# freshly loaded frame, so the drop is only valid while `df` still has one row
# per entry of `mol_list`; the guard makes re-running this cell a no-op
# instead of silently dropping the wrong rows after the index was reset.
if len(df) == len(mol_list):
  df = df.drop(index=to_delete_list)
elif len(df) != len(mol_list) - len(to_delete_list):
  raise ValueError(
      f"df has {len(df)} rows; expected {len(mol_list)} (before the drop) or "
      f"{len(mol_list) - len(to_delete_list)} (after it). Reload df and re-run."
  )
df.shape

(18635, 3)

In [91]:
# Renumber the remaining rows 0..n-1, discarding the old (gapped) index.
df = df.reset_index(drop=True)

In [92]:
# Sanity check: count missing affinity values in the filtered frame.
df['affinity'].isna().sum()

0

In [93]:
# Attach the identifier and the target to the embeddings by position.
# Assigning a Series would align on index labels and silently produce NaNs or
# shifted rows if `df` still carried its pre-drop index; positional assignment
# plus an explicit length check does not depend on the index at all.
if len(df) != len(df_mol2vec):
  raise ValueError(
      f"Row count mismatch: df has {len(df)} rows, df_mol2vec has {len(df_mol2vec)}."
  )
df_mol2vec['pdbcode'] = df.pdbcode.to_numpy()
df_mol2vec['affinity'] = df.affinity.to_numpy()

In [94]:
# Preview the table again, now with the pdbcode and affinity columns appended.
df_mol2vec.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,pdbcode,affinity
0,-33.215519,-4.911911,-58.069412,-21.501129,86.036827,-48.711243,-163.092056,-56.392216,177.129028,-0.203314,...,118.411171,87.596359,-135.841919,-56.251205,12.59845,-148.720886,-178.943771,-31.27887,3zzf,0.4
1,-49.548927,-5.533656,-78.656738,11.228692,96.537285,-45.226391,-198.04187,-50.896503,194.372955,13.70278,...,160.358704,97.121597,-164.787857,-62.422604,18.535614,-147.918762,-208.952576,-46.084557,3gww,0.45
2,-23.400223,-7.932979,-35.621426,-2.120256,59.704235,-22.519524,-114.470078,-34.282997,111.387978,7.034303,...,94.777184,61.178638,-83.110741,-31.502813,7.987552,-95.268074,-120.7575,-28.159447,1w8l,0.49
3,-26.764908,-7.9374,-49.278858,-9.648705,69.367477,-31.425886,-146.403458,-37.342216,129.867706,14.527238,...,105.859848,69.64447,-126.765694,-49.138592,1.389015,-115.341492,-158.340179,-35.511227,3fqa,0.49
4,-33.097664,-24.753981,-71.645752,17.423645,76.952316,-45.259354,-184.551147,-47.502502,176.079025,4.592343,...,142.476715,75.984627,-151.256699,-62.206776,0.313482,-147.893433,-202.49968,-42.562164,1zsb,0.6


In [95]:
# Sanity check: no affinity value should be missing after attaching the labels.
df_mol2vec['affinity'].isna().sum()

0

In [96]:
# Persist the embeddings together with pdbcode and affinity to Drive.
# NOTE(review): to_csv writes the index by default, so the file gets an
# unnamed first column that shows up as 'Unnamed: 0' when read back with
# pd.read_csv — confirm downstream readers expect that before changing it.
df_mol2vec.to_csv('/content/drive/MyDrive/Course_work/notebooks/data/df_mol2vec.csv')