In [204]:
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from gensim.models import Word2Vec
from gensim.models import word2vec
from rdkit import Chem
import pandas as pd
import numpy as np
from keras.callbacks import ModelCheckpoint, Callback
import matplotlib.pyplot as plt

In [205]:
 df = pd.read_csv('/home/mamonteiro/source-code/Project-LEI/SIDER/sider.csv')    

In [206]:
df['smiles'][0:5]

0                                      C(CNCCNCCNCCN)N
1    CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...
2    CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...
3      CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34
4               C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O
Name: smiles, dtype: object

In [207]:
# Get SMILES from file
def getSMILES(filepath):
    df = pd.read_csv(filepath)    
    #smiles = list(df['SMILES or PubChem ID'].dropna())
    smiles_df = df[['smiles']].dropna()
    # some cases have 2 SMILES per compound, which I think are isomers (by looking at the molecular drawings)
    # keeping only the first SMILE string:
    #smiles = [x.split(';')[0] for x in smiles]
    return smiles_df

In [208]:
# Generate molecules from SMILES strings
def generateEmbeddings(smiles_df, trained_model):
    smiles = list(smiles_df['smiles'])
    smiles = [x.split(';')[0] for x in smiles]
    # SMILES to Mol
    molecules = [Chem.MolFromSmiles(x) for x in smiles]
    # Load previously trained mol2vec model
    model = Word2Vec.load(trained_model)
    # Convert molecules to sentences and then to embeddings
    sentences = [mol2alt_sentence(x, 1) for x in molecules]
    vectors = [DfVec(x) for x in sentences2vec(sentences, model, unseen='UNK')]
    vec_df = pd.DataFrame(data=np.array([x.vec for x in vectors]))
    vec_df.columns = ['mol2vec_' + str(x+1) for x in vec_df.columns.values]
    vec_df.index = smiles_df.index.values
    return pd.concat([smiles_df, vec_df], axis=1)

In [209]:
def createDataset(original_data, embeddings_df, output):
    original_df = pd.read_csv(original_data)
    df = original_df.merge(embeddings_df, how='outer', on="smiles")
    df.to_csv(output, index=False)    

In [210]:
smiles_from_side =getSMILES('/home/mamonteiro/source-code/Project-LEI/SIDER/sider.csv')
type(smiles_from_side)

pandas.core.frame.DataFrame

In [211]:
import os
 
dirpath = os.getcwd()
print("current directory is : " + dirpath)
foldername = os.path.basename(dirpath)
print("Directory name is : " + foldername)

current directory is : /home/mamonteiro/source-code/Project-LEI/SIDER
Directory name is : SIDER


In [212]:
emb_df = generateEmbeddings(smiles_from_side, '/home/mamonteiro/source-code/Project-LEI/mol2vec/examples/models/model_300dim.pkl')



In [213]:
emb_df.head()

Unnamed: 0,smiles,mol2vec_1,mol2vec_2,mol2vec_3,mol2vec_4,mol2vec_5,mol2vec_6,mol2vec_7,mol2vec_8,mol2vec_9,...,mol2vec_291,mol2vec_292,mol2vec_293,mol2vec_294,mol2vec_295,mol2vec_296,mol2vec_297,mol2vec_298,mol2vec_299,mol2vec_300
0,C(CNCCNCCNCCN)N,-0.990727,-1.723967,1.59608,0.336589,5.99587,1.602312,-7.89378,-0.770941,2.798226,...,1.845091,4.080578,5.290233,2.681949,-6.017433,0.732134,-0.16161,-7.355957,-6.514126,-4.416229
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0.596306,0.060384,-4.686247,4.381831,2.139633,-0.343262,-13.849467,0.780369,9.671047,...,-0.760953,9.614191,13.119958,-0.40857,-7.577562,-4.803534,-4.880173,-7.033062,-15.572207,-4.429869
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,1.854099,-4.819261,0.775984,3.286393,1.51957,-6.281527,-10.383826,5.000489,-0.742257,...,0.873612,15.422496,-1.34858,-1.571692,-17.224537,-6.963869,-12.192692,-3.912728,-8.017673,1.26565
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,2.140168,-5.169839,-0.039611,3.000724,0.787211,-6.880911,-10.38464,5.767844,-0.478307,...,0.627151,15.921182,-1.159732,-1.260444,-17.655016,-7.629984,-13.157971,-3.574889,-8.912206,1.509292
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,-0.227203,-2.800524,-1.099674,4.259158,-1.261016,-2.570959,-8.051775,1.025365,6.135835,...,-0.972749,9.117423,7.184863,-0.012285,-7.312431,-1.483516,-4.117091,-2.378627,-9.008883,-0.717169


In [214]:
emb_df.iloc[0][0]

'C(CNCCNCCNCCN)N'

In [215]:
createDataset('/home/mamonteiro/source-code/Project-LEI/SIDER/sider.csv', emb_df, '/home/mamonteiro/source-code/Project-LEI/SIDER/sider_embeddings.csv')

In [216]:
 df = pd.read_csv('/home/mamonteiro/source-code/Project-LEI/SIDER/sider_embeddings.csv')    

In [217]:
df.head()

Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,...,mol2vec_291,mol2vec_292,mol2vec_293,mol2vec_294,mol2vec_295,mol2vec_296,mol2vec_297,mol2vec_298,mol2vec_299,mol2vec_300
0,C(CNCCNCCNCCN)N,1,1,0,0,1,1,1,0,0,...,1.845092,4.080578,5.290233,2.681949,-6.017433,0.732134,-0.16161,-7.355957,-6.514126,-4.416229
1,CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...,0,1,0,0,1,1,1,0,0,...,-0.760953,9.614191,13.119958,-0.40857,-7.577562,-4.803534,-4.880173,-7.033062,-15.572207,-4.429869
2,CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...,0,1,0,1,1,0,1,0,1,...,0.873612,15.422496,-1.34858,-1.571692,-17.224537,-6.963869,-12.192692,-3.912728,-8.017673,1.26565
3,CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34,1,1,0,1,1,1,1,0,1,...,0.627151,15.921182,-1.159732,-1.260444,-17.655016,-7.629984,-13.157971,-3.574889,-8.912206,1.509292
4,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,1,1,0,1,1,1,1,0,1,...,-0.972749,9.117423,7.184863,-0.012285,-7.312431,-1.483516,-4.117091,-2.378627,-9.008883,-0.717169



# Multitask Networks On SIDER

In [240]:
import os
import deepchem as dc


current_dir = os.path.dirname(os.path.realpath("__file__"))
dataset_file = "medium_muv.csv.gz"


dataset = dc.utils.save.load_from_disk('/home/mamonteiro/source-code/Project-LEI/SIDER/sider_embeddings.csv')
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))

Columns of dataset: ['smiles' 'Hepatobiliary disorders' 'Metabolism and nutrition disorders'
 'Product issues' 'Eye disorders' 'Investigations'
 'Musculoskeletal and connective tissue disorders'
 'Gastrointestinal disorders' 'Social circumstances'
 'Immune system disorders' 'Reproductive system and breast disorders'
 'Neoplasms benign, malignant and unspecified (incl cysts and polyps)'
 'General disorders and administration site conditions'
 'Endocrine disorders' 'Surgical and medical procedures'
 'Vascular disorders' 'Blood and lymphatic system disorders'
 'Skin and subcutaneous tissue disorders'
 'Congenital, familial and genetic disorders'
 'Infections and infestations'
 'Respiratory, thoracic and mediastinal disorders' 'Psychiatric disorders'
 'Renal and urinary disorders'
 'Pregnancy, puerperium and perinatal conditions'
 'Ear and labyrinth disorders' 'Cardiac disorders'
 'Nervous system disorders'
 'Injury, poisoning and procedural complications' 'mol2vec_1' 'mol2vec_2'
 'mol2vec

The dataset that we must manipulate is thhe follwoing one: dataset

In this dataset has as columns the SMILES
                               The side efects
                               and finally the embeddings
        
We need to split this dataset in order to provide this data to multitask model

In [241]:
#dataset.reset_index(drop=True)
#dataset = dataset.set_index('smiles')
#dataset.head(10)
#dataset.shape
cena_a_passar = dataset['smiles']

In [242]:
dataset=dataset.drop(['smiles'], axis=1)

In [243]:
dataset.head(2)

Unnamed: 0,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,...,mol2vec_291,mol2vec_292,mol2vec_293,mol2vec_294,mol2vec_295,mol2vec_296,mol2vec_297,mol2vec_298,mol2vec_299,mol2vec_300
0,1,1,0,0,1,1,1,0,0,0,...,1.845092,4.080578,5.290233,2.681949,-6.017433,0.732134,-0.16161,-7.355957,-6.514126,-4.416229
1,0,1,0,0,1,1,1,0,0,1,...,-0.760953,9.614191,13.119958,-0.40857,-7.577562,-4.803534,-4.880173,-7.033062,-15.572207,-4.429869


In [244]:
#sideEfects=
dataset.iloc[:,0:27].head()

Unnamed: 0,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,1,1,0,0,1,1,1,0,0,0,...,0,0,1,1,0,0,1,1,1,0
1,0,1,0,0,1,1,1,0,0,1,...,0,1,1,0,0,0,1,0,1,0
2,0,1,0,1,1,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,0
3,1,1,0,1,1,1,1,0,1,1,...,1,1,1,1,1,1,0,0,1,1
4,1,1,0,1,1,1,1,0,1,0,...,0,1,1,1,0,0,1,0,1,0


In [245]:
sideEfects= dataset.iloc[:,0:27]

In [246]:
sideEfects.head()

Unnamed: 0,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,...,"Congenital, familial and genetic disorders",Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications"
0,1,1,0,0,1,1,1,0,0,0,...,0,0,1,1,0,0,1,1,1,0
1,0,1,0,0,1,1,1,0,0,1,...,0,1,1,0,0,0,1,0,1,0
2,0,1,0,1,1,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,0
3,1,1,0,1,1,1,1,0,1,1,...,1,1,1,1,1,1,0,0,1,1
4,1,1,0,1,1,1,1,0,1,0,...,0,1,1,1,0,0,1,0,1,0


In [247]:
embeddings=dataset.iloc[:,28:]

In [248]:
embeddings.head()

Unnamed: 0,mol2vec_2,mol2vec_3,mol2vec_4,mol2vec_5,mol2vec_6,mol2vec_7,mol2vec_8,mol2vec_9,mol2vec_10,mol2vec_11,...,mol2vec_291,mol2vec_292,mol2vec_293,mol2vec_294,mol2vec_295,mol2vec_296,mol2vec_297,mol2vec_298,mol2vec_299,mol2vec_300
0,-1.723967,1.59608,0.336589,5.99587,1.602312,-7.89378,-0.770941,2.798226,-4.712134,0.880806,...,1.845092,4.080578,5.290233,2.681949,-6.017433,0.732134,-0.16161,-7.355957,-6.514126,-4.416229
1,0.060384,-4.686247,4.381831,2.139633,-0.343262,-13.849467,0.780369,9.671047,4.445226,-4.031947,...,-0.760953,9.614191,13.119958,-0.40857,-7.577562,-4.803534,-4.880173,-7.033062,-15.572207,-4.429869
2,-4.819261,0.775984,3.286393,1.51957,-6.281527,-10.383826,5.000489,-0.742258,0.57926,5.422467,...,0.873612,15.422496,-1.34858,-1.571692,-17.224537,-6.963869,-12.192692,-3.912728,-8.017673,1.26565
3,-5.169839,-0.039611,3.000724,0.787211,-6.880911,-10.38464,5.767844,-0.478307,0.19178,5.250709,...,0.627151,15.921182,-1.159732,-1.260444,-17.655016,-7.629984,-13.157971,-3.574889,-8.912206,1.509292
4,-2.800524,-1.099673,4.259158,-1.261016,-2.570959,-8.051775,1.025365,6.135835,2.401863,-2.238856,...,-0.972749,9.117423,7.184863,-0.012285,-7.312431,-1.483516,-4.117091,-2.378627,-9.008883,-0.717169


In [249]:
embeddings.isnull().any().any()

False

In [250]:
sideEfects.index

RangeIndex(start=0, stop=1427, step=1)

In [251]:
sideEfects.isnull().any().any()

False

In [252]:
NCA1 = 100
NCA2 = 100
DROPRATE = 0.2
EP = 50
BATCH_SIZE = 128
VAL_RATIO = 0.1
TEST_RATIO = 0.1

In [253]:
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

In [254]:
from deepchem.data.datasets import NumpyDataset # import NumpyDataset

dataset_with_weights = NumpyDataset(sideEfects) # creates numpy dataset object


Iterating over NumpyDataset
¶
In order to iterate over NumpyDataset, we use itersamples method. 

We iterate over 4 quantities, namely X, y, w and ids. 

The first three quantities are the same as discussed above and ids is the id of the data instance. 

By default the id is given in order starting from 1


In [160]:
for x, y, w, id in dataset_with_weights.itersamples():
    print(x, y)

[-1.7239671e+00  1.5960803e+00  3.3658930e-01  5.9958700e+00
  1.6023117e+00 -7.8937800e+00 -7.7094054e-01  2.7982256e+00
 -4.7121340e+00  8.8080620e-01  5.7754984e+00 -2.8670790e+00
  3.3877957e+00 -7.7692330e+00  3.2428293e+00  4.6839010e+00
 -9.3364280e+00 -2.8563542e+00  5.9865130e+00 -1.3733228e+00
  3.9655754e-01  7.4127490e+00  4.9267488e+00 -3.4972591e+00
 -1.0492841e+00 -5.3407946e+00 -1.1383237e+01 -1.6772937e+00
 -8.2752690e-01  1.6313319e+00 -4.6640115e+00 -9.8346674e-01
 -4.0846690e+00  2.2031412e+00 -1.2725309e+00 -4.2912607e+00
  4.2230990e+00  5.3510220e+00  7.3572583e+00  3.7113924e+00
 -3.6405510e-01 -1.6309372e+00 -3.0457702e+00 -2.6984859e+00
 -1.1270791e+00  3.1987698e+00  8.7437280e+00 -2.8527646e+00
  3.8667452e+00  1.9813349e+00 -7.3851776e+00 -8.0614710e-01
 -4.6606307e+00 -4.7950478e+00 -4.4748870e+00  3.2990320e-01
 -8.0016080e-02 -2.5293500e+00 -9.2334497e-01 -3.7377100e-01
 -4.5521680e+00 -2.8777063e+00  2.5117996e+00 -8.0622500e+00
 -2.7247202e+00  1.18204

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




[-6.69363360e+00 -6.16778950e+00  6.29378900e+00 -2.20340430e-01
 -2.64833350e+00 -1.10143650e+01 -2.41853170e+00  2.23784500e+00
  2.28047230e+00 -4.44420960e+00  8.96244500e-01 -2.30968170e+00
  2.53615760e+00 -8.02255000e+00  8.25638300e-02  1.11695440e+00
 -4.84331370e+00 -5.09367850e+00  6.92474700e+00  3.85821530e+00
  6.74791530e+00  1.52393870e+01  7.54998600e+00 -8.37157100e+00
  2.68138800e+00 -1.07439920e+00 -8.50890300e+00 -3.74774270e+00
 -9.48408600e-01  1.45844760e+01 -8.39220900e+00 -6.63747930e+00
 -7.97054620e+00  4.11734250e-01 -2.49382380e+00  3.33114040e-02
 -6.07703030e-02  1.27854130e+01  6.40987350e+00 -3.25526700e+00
 -2.55945200e+00 -4.11971300e+00  6.97271050e-01 -1.19172680e+01
  6.49486540e+00  1.49410080e+00  9.91092500e+00 -8.01281700e+00
  7.22966500e+00  7.55563400e-01 -7.64813470e+00 -4.03570270e+00
 -6.28120040e+00 -1.20058110e+01 -6.63841340e+00 -8.46384000e+00
  2.18185020e+00  7.13027760e+00  5.92283770e+00 -2.98649760e+00
 -9.33139800e+00 -6.0551

In [176]:
teste = sideEfects

In [197]:
for x, y, w, id in dataset_with_weights.itersamples():
    for index, row in teste.iterrows():
        id = row['teste']

KeyboardInterrupt: 

You can also extract the ids by dataset.ids. This would return a numpy array consisting of the ids of the data instances.

dataset.ids

In [238]:
dataset_with_weights.X # Extracts the data (X) from the NumpyDataset Object


array([[1, 1, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 1],
       [0, 1, 0, ..., 1, 1, 1],
       [1, 1, 0, ..., 1, 1, 1]])

In [81]:
dataset_with_weights

<deepchem.data.datasets.NumpyDataset at 0x7faf99e2bcc0>

In [164]:
from deepchem.splits.splitters import IndexSplitter
splitter=IndexSplitter()
train_data,valid_data,test_data=splitter.split(dataset_with_weights,frac_train=0.7,frac_valid=0.2,frac_test=0.1)


In [165]:
train_data=[i for i in train_data]
valid_data=[i for i in valid_data]
test_data=[i for i in test_data]

In [166]:
len(train_data),len(valid_data),len(test_data)

(998, 286, 143)

In [76]:
dataset_with_weights

<deepchem.data.datasets.NumpyDataset at 0x7faf99e2bcc0>

In [255]:
train_embeddings_dataset = dc.data.NumpyDataset(embeddings,
                                                dataset_with_weights.y,
                                                dataset_with_weights.w,
                                                cena_a_passar)

In [172]:
valid_embeddings_dataset = dc.data.NumpyDataset(valid_data,
                                                dataset_with_weights.y,
                                                dataset_with_weights.w,
                                                dataset_with_weights.ids)

In [None]:
classifier = dc.models.MultitaskClassifier(n_tasks=27,
                                           n_features=200,
                                           layer_sizes=[1])
classifier.fit(train_embeddings_dataset, nb_epoch=1)

In [None]:
valid_embeddings = model.predict_embeddings(valid_smiles)
valid_embeddings_dataset = dc.data.NumpyDataset(valid_embeddings,
                                                valid_dataset.y,
                                                valid_dataset.w,
                                                valid_dataset.ids)