### Notebook used to generate a Smi2Vec model from a corpus of SMILES (.smi) strings

In [None]:
import pandas as pd
import numpy as np
import gensim 
from gensim.models import Word2Vec 
import random 
from sklearn.decomposition import IncrementalPCA
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
import os 
import random
import seaborn as sns 


class SPVec:
    """Word2Vec embeddings over 3-character "words" cut from a CSV corpus.

    Each document (one string per CSV row) is split into consecutive,
    non-overlapping 3-character tokens, a skip-gram Word2Vec model is trained
    on those tokens, and per-document features are obtained by averaging the
    token vectors of each document.
    """

    def __init__(self, filename):
        """Store the path of the CSV file that holds the corpus."""
        self.filename = filename

    def read_data(self):
        """Return the corpus file parsed by ``pandas.read_csv`` as a DataFrame."""
        return pd.read_csv(self.filename)

    @staticmethod
    def _tokenize(documents):
        """Split every document into consecutive 3-character words."""
        # '.{3}' cannot match a trailing remainder of 1-2 characters, so that
        # remainder is dropped (unchanged from the original tokenisation).
        return [re.findall(r'.{3}', document) for document in documents]

    def word2vec(self, dims, window_size, negative_size):
        """Train Word2Vec on the corpus and return one row per token occurrence.

        Parameters
        ----------
        dims : int
            Dimensionality of the word vectors.
        window_size : int
            Context window passed to ``Word2Vec(window=...)``.
        negative_size : int
            Number of negative samples passed to ``Word2Vec(negative=...)``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Id`` (position of the document in the corpus), ``word``
            (the 3-character token) and ``vec_0`` .. ``vec_{dims-1}`` (the
            trained vector of that token).
        """
        data = self.read_data()
        # Iterating a DataFrame yields its column labels, not its rows, so the
        # documents are taken explicitly from a column.
        # NOTE(review): assumes the corpus strings live in the first CSV
        # column - confirm against the actual input files.
        documents = data.iloc[:, 0].tolist()
        texts = self._tokenize(documents)

        # Keyword names (`size`, `wv.vocab`) follow the gensim 3.x API that the
        # rest of this file uses.
        model = Word2Vec(texts, size=dims, window=window_size, min_count=1,
                         negative=negative_size, sg=1, sample=0.001, hs=1,
                         workers=4)
        vocabulary = list(model.wv.vocab)
        # Vectors are read through `model.wv`; indexing the model object
        # directly is deprecated.
        vectors = pd.DataFrame([model.wv[word] for word in vocabulary])
        vectors['Word'] = vocabulary

        # One row per token occurrence, tagged with the document it came from.
        word_vec = pd.DataFrame({
            'Id': [doc_id for doc_id, words in enumerate(texts) for _ in words],
            'Word': [word for words in texts for word in words],
        })
        # Left merge keeps the token order and attaches each token's vector.
        word_vec = word_vec.merge(vectors, on='Word', how='left')
        word_vec.columns = ['Id', 'word'] + ["vec_{0}".format(i) for i in range(dims)]

        return word_vec

    # Molecular Structure and Protein Sequence Representation
    def feature_embeddings(self, dims, window_size=6, negative_size=12):
        """Return one embedding per document: the mean of its token vectors.

        Parameters
        ----------
        dims : int
            Dimensionality of the word vectors.
        window_size : int, optional
            Forwarded to :meth:`word2vec`. Previously read from an undefined
            name; the default matches the value used elsewhere in this
            notebook.
        negative_size : int, optional
            Forwarded to :meth:`word2vec`; same remark as ``window_size``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Index`` (document position) and ``mean_ci_0`` ..
            ``mean_ci_{dims-1}``.
        """
        word_vec = self.word2vec(dims, window_size, negative_size)
        # `word2vec` names the token column 'word' (lower case).
        word_vec = word_vec.drop('word', axis=1)
        name = ["vec_{0}".format(i) for i in range(dims)]
        feature_embeddings = word_vec.groupby(['Id'])[name].agg('mean').reset_index()
        feature_embeddings.columns = ["Index"] + ["mean_ci_{0}".format(i) for i in range(dims)]
        return feature_embeddings

In [2]:
# Load the raw corpus; every line of the file becomes one list entry
# (trailing newlines are still attached at this point).
with open("mols.smi", "r") as f:
    l = list(f)

In [9]:
# Show the first record with leading/trailing whitespace removed.
l[0].strip()

'CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(O)=O)NC(=O)CNC(=O)[C@H](CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)CC1=CC=CC=C1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CC(C)C)C(O)=O'

In [10]:
# Strip surrounding whitespace (including the newline) from every record
# and report how many records the corpus holds.
data = list(map(str.strip, l))
print(len(data))

11172


In [11]:
import re

# Hyper-parameters: vector size, context window, number of negative samples.
dims, window_size, negative_size = 100, 6, 12

# Cut every record into consecutive, non-overlapping 3-character "words";
# a trailing remainder shorter than 3 characters is not matched.
texts = [re.findall(r'.{3}', document) for document in data]

# Train on the tokenised corpus (sg=1, hs=1; min_count=1 keeps every token).
model = Word2Vec(
    texts,
    size=dims,
    window=window_size,
    min_count=1,
    negative=negative_size,
    sg=1,
    sample=0.001,
    hs=1,
    workers=4,
)

In [12]:
# Persist the trained model to the file "word2vec.model".
model.save("word2vec.model")

In [7]:
"CC[" in model

  "CC[" in model


True