### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from urllib.request import urlopen
from bs4 import BeautifulSoup

import json
import pickle
import time

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

import pubchempy as pcp

### Read data

In [2]:
df = pd.read_excel('pesticides02.xlsx')

In [3]:
df.sample(5).head()

Unnamed: 0,PesticideID,CAS,Name,Warning
1479,Pesticide1480,66332-96-5,flutolanil,False
2096,Pesticide2097,566191-89-7,aminopyralid-tripromine,False
900,Pesticide901,14816-18-3,phoxim,False
1115,Pesticide1116,28730-17-8,methfuroxam,False
1692,Pesticide1693,89269-64-7,ferimzone,False


### Retrieve SMILES, based on CAS registry numbers

#### Using the National Cancer Institute's CADD Group cheminformatics tools

In [4]:
t0 = time.time()

for index, row in df.iterrows():
    try:
        url = "https://cactus.nci.nih.gov/chemical/structure/"+row['CAS']+"/smiles"
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html')
        smi = soup.get_text()
        df.loc[index, 'SMILES'] = smi
    except:
        df.loc[index, 'SMILES'] = 'None'
        
t1 = time.time()
elapsed = t1-t0
print(elapsed)

1057.4930028915405


In [5]:
df.sample(5).head()

Unnamed: 0,PesticideID,CAS,Name,Warning,SMILES
26,Pesticide27,57-74-9,chlordane,False,ClC1CC2C(C1Cl)C3(Cl)C(=C(Cl)C2(Cl)C3(Cl)Cl)Cl
93,Pesticide94,85-00-7,diquat dibromide,False,[Br-].[Br-].C1C[n+]2ccccc2c3cccc[n+]13
235,Pesticide236,135-58-0,mesulfen,False,Cc1ccc2Sc3ccc(C)cc3Sc2c1
1847,Pesticide1848,122453-73-0,chlorfenapyr,False,CCOCn1c(c2ccc(Cl)cc2)c(C#N)c(Br)c1C(F)(F)F
1735,Pesticide1736,99485-76-4,cumyluron,False,


#### Using the National Library of Medicine's data tools

In [6]:
t0 = time.time()

for index, row in df.iterrows():
    try:
        url = "https://chem.nlm.nih.gov/api/data/structure/exact/"+row['CAS']+"?data=smiles"
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html')
        d = json.loads(soup.get_text())
        df.loc[index, 'nlmSMILES'] = d['results'][0]['structureDetails']['s']
    except:
        df.loc[index, 'nlmSMILES'] = 'None'
        
t1 = time.time()
elapsed = t1-t0
print(elapsed)

504.8670017719269


In [7]:
df.sample(5).head()

Unnamed: 0,PesticideID,CAS,Name,Warning,SMILES,nlmSMILES
987,Pesticide988,21564-17-0,benthiazole,False,N#CSCSc1sc2ccccc2n1,N#CSCSc1nc2ccccc2s1
89,Pesticide90,83-59-0,propyl isome,False,CCCOC(=O)C1C(C)Cc2cc3OCOc3cc2C1C(=O)OCCC,CCCOC(=O)C1c2cc3c(cc2CC(C1C(=O)OCCC)C)OCO3
2002,Pesticide2003,180409-60-3,cyflufenamid,False,,Fc1ccc(c(\C(=N\OCC2CC2)\NC(=O)Cc3ccccc3)c1F)C(...
878,Pesticide879,13684-63-4,phenmedipham,False,COC(=O)Nc1cccc(OC(=O)Nc2cccc(C)c2)c1,Cc1cccc(c1)NC(=O)Oc2cccc(c2)NC(=O)OC
1764,Pesticide1765,104206-82-8,mesotrione,False,C[S](=O)(=O)c1ccc(C(=O)C2C(=O)CCCC2=O)c(c1)[N+...,C1(C(C(CCC1)=O)C(c1c(cc(cc1)S(=O)(=O)C)[N+](=O...


#### Using PubChem's toolkit

In [8]:
t0 = time.time()

for index, row in df.iterrows():
    try:
        results = pcp.get_compounds(row['CAS'], 'name')
        df.loc[index, 'PubChemSMILES'] = results[0].canonical_smiles
    except:
        df.loc[index, 'PubChemSMILES'] = 'None'
        
t1 = time.time()
elapsed = t1-t0
print(elapsed)

913.698602437973


In [9]:
df.sample(5).head()

Unnamed: 0,PesticideID,CAS,Name,Warning,SMILES,nlmSMILES,PubChemSMILES
2044,Pesticide2045,240494-71-7,epsilon-metofluthrin,False,,,CC=CC1C(C1(C)C)C(=O)OCC2=C(C(=C(C(=C2F)F)COC)F)F
1577,Pesticide1578,76280-91-6,tecloftalam,False,OC(=O)c1c(Cl)c(Cl)c(Cl)c(Cl)c1C(=O)Nc2cccc(Cl)...,c1(c(c(c(Cl)c(c1Cl)Cl)Cl)C(O)=O)C(Nc1c(c(ccc1)...,C1=CC(=C(C(=C1)Cl)Cl)NC(=O)C2=C(C(=C(C(=C2Cl)C...
2147,Pesticide2148,951659-40-8,flupyradifurone,False,,c1cc(ncc1CN(CC(F)F)C2=CC(=O)OC2)Cl,C1C(=CC(=O)O1)N(CC2=CN=C(C=C2)Cl)CC(F)F
339,Pesticide340,572-48-5,coumithoate,False,CCO[P](=S)(OCC)Oc1ccc2C3=C(CCCC3)C(=O)Oc2c1,CCOP(=S)(OCC)Oc1ccc2C3=C(CCCC3)C(=O)Oc2c1,CCOP(=S)(OCC)OC1=CC2=C(C=C1)C3=C(CCCC3)C(=O)O2
463,Pesticide464,1786-03-4,"naphthylindane-1,3-diones",False,O=C1C(C(=O)c2ccccc12)c3cccc4ccccc34,O=C1C(C(=O)c2ccccc12)c3cccc4ccccc34,C1=CC=C2C(=C1)C=CC=C2C3C(=O)C4=CC=CC=C4C3=O


In [10]:
for index, row in df.iterrows():
    if row['SMILES'] == 'None': df.loc[index, 'SMILES'] = df.loc[index, 'nlmSMILES']
        
for index, row in df.iterrows():
    if row['SMILES'] == 'None': df.loc[index, 'SMILES'] = df.loc[index, 'PubChemSMILES']

In [11]:
df2 = df[['PesticideID', 'CAS', 'Name', 'Warning', 'SMILES']]

### Remove entries that do not have SMILES strings

In [12]:
df2 = df2[df2['SMILES'] != 'None']

In [13]:
df2.shape

(2146, 5)

### Use SMILES string to generate molecule record

In [14]:
for index, row in df2.iterrows():
    m = Chem.MolFromSmiles(row['SMILES'])
    if m is not None:
        df2.loc[index, 'ROMol'] = m
    else:
        df2.loc[index, 'ROMol'] = 'None'

In [15]:
df3 = df2[df2['ROMol'] != 'None']

In [16]:
df3.shape

(2130, 6)

In [17]:
df3.sample(5).head()

Unnamed: 0,PesticideID,CAS,Name,Warning,SMILES,ROMol
1190,Pesticide1191,35554-44-0,imazalil,False,Clc1ccc(C(Cn2ccnc2)OCC=C)c(Cl)c1,<rdkit.Chem.rdchem.Mol object at 0x0000000013B...
160,Pesticide161,107-18-6,allyl alcohol,False,OCC=C,<rdkit.Chem.rdchem.Mol object at 0x0000000013D...
1756,Pesticide1757,103782-08-7,allosamidin,False,CN(C)C1=N[C@@H]2[C@@H](O)[C@H](O[C@@H]3O[C@H](...,<rdkit.Chem.rdchem.Mol object at 0x0000000013B...
1932,Pesticide1933,143807-66-3,chromafenozide,False,Cc1cc(C)cc(c1)C(=O)N(NC(=O)c2ccc3OCCCc3c2C)C(C...,<rdkit.Chem.rdchem.Mol object at 0x0000000013B...
1494,Pesticide1495,67564-91-4,fenpropimorph,False,CC(CN1C[C@H](C)O[C@H](C)C1)Cc2ccc(cc2)C(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x0000000013B...


### Save this dataset

In [18]:
with open('pesticides.pickle', 'wb') as f:
    pickle.dump(df3, f, pickle.HIGHEST_PROTOCOL)

#### Reload dataset

In [2]:
file = open('pesticides.pickle', 'rb')
df3 = pickle.load(file)
file.close()

### Calculate Morgan fingerprints (radius = 2)

In [3]:
fps = []
for index, row in df3.iterrows():
    fps.append(AllChem.GetMorganFingerprintAsBitVect(row['ROMol'], 2))

In [4]:
# the list for the dataframe
qu, ta, sim = [], [], []

In [5]:
# compare all fp pairwise without duplicates
for n in range(len(fps)-1): # -1 so the last fp will not be used
    s = DataStructs.BulkTanimotoSimilarity(fps[n], fps[n+1:]) # +1 compare with the next to the last fp
#     print(pesticides.loc[n, 'Name'], pesticides.loc[n+1:, 'Name']) # comparison pairs
#     # collect the SMILES and values
    for m in range(len(s)):
#         qu.append(pesticides.loc[n, 'Name'])
#         ta.append(pesticides.loc[n+1:, 'Name'][m])
        sim.append(s[m])
# print()

In [6]:
DataStructs.FingerprintSimilarity(fps[0], fps[1])

0.018518518518518517

In [7]:
sim[0]

0.018518518518518517

In [6]:
# columns = list(df3['Name'])
sim_matrix = pd.DataFrame()

In [7]:
t0 = time.time()

q = 0
for k in range(0, len(fps)):
    sim_matrix.loc[k, k] = 1
for i in range(0, (len(fps)-1)):
    for j in range((i+1), len(fps)):
        sim_matrix.loc[i, j] = sim[q]
        sim_matrix.loc[j, i] = sim[q]
        if q % 5000 == 0: print(q)
        q = q + 1
            
print(q)

t1 = time.time()
elapsed = t1-t0
print(elapsed)

# q = 0
# for k in range(0, 5):
#     sim_matrix2.loc[k, k] = 1
# for i in range(0, (5-1)):
#     for j in range((i+1), 5):
#         sim_matrix2.loc[i, j] = sim[q]
#         sim_matrix2.loc[j, i] = sim[q]
#         if q % 1000 == 0: print(q)
#         q = q + 1
            
# print(q)

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000
450000
455000
460000
465000
470000
475000
480000
485000
490000
495000
500000
505000
510000
515000
520000
525000
530000
535000
540000
545000
550000
555000
560000
565000
570000
575000
580000
585000
590000
595000
600000
605000
610000
615000
620000
625000
630000
635000
640000
645000
650000
655000
660000
665000
670000
675000
680000
685000
690000
695000
700000
705000
710000
715000
720000
725000
730

In [9]:
with open('pesticidesSimilarityMatrix02.pickle', 'wb') as f:
    pickle.dump(sim_matrix, f, pickle.HIGHEST_PROTOCOL)

In [61]:
sim[2267384]

0.14423076923076922

In [15]:
len(fps)

2130

In [34]:
columns = list(df3['Name'])

In [41]:
sim_matrix = pd.DataFrame(columns = columns)

In [34]:
df3['Name'].to_csv('names.csv')

  """Entry point for launching an IPython kernel.


### Generate a distance matrix, based on the Tanimoto similarity calculated between each pair of molecules

In [49]:
# generate distance matrix
dist_matrix = []
num_fps = len(fps)
for i in range(1, num_fps):
    similarities = DataStructs.BulkTanimotoSimilarity(fps[i],fps[:i])
    dist_matrix.extend([1-x for x in similarities])

In [69]:
len(dist_matrix)

2267385

In [75]:
def utri2mat(utri):
    n = int(-1 + np.sqrt(1 + 8*len(utri))) // 2
    iu1 = np.triu_indices(n)
    ret = np.empty((n, n))
    ret[iu1] = utri
    ret.T[iu1] = utri
    return ret

In [76]:
simMatrix = utri2mat(dist_matrix)

In [77]:
simMatrix.shape

(2129, 2129)

In [82]:
similarity = pd.DataFrame(simMatrix)

In [83]:
similarity.to_csv('PesticideSimilarityMatrix.csv')