In [1]:
# You need to be in this venv .\venv\Scripts\Activate.ps1

## Load the data

1. Loads data from pyTDC 'ADME' 
2. Loads data from drug bank
3. loads data from bio snap


In [2]:
from tdc.single_pred import ADME
import pandas as pd
data = ADME(name = 'CYP2C19_Veith') # this load the datset. 

split = data.get_split()

Found local copy...
Loading...
Done!


In [3]:

df = pd.read_csv('data\cyp2c19_veith.tab', sep='\t')
df

Unnamed: 0,Drug_ID,Drug,Y
0,6602638.0,CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl,0
1,644675.0,CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1,1
2,644851.0,Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1,1
3,644890.0,COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1,1
4,644968.0,COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1,1
...,...,...,...
12660,16758814.0,CCN1C(=O)[C@@H]2[C@@H](CC[C@@H]3C(=O)C=C[C@@H]...,0
12661,16758816.0,O=C1[C@H]2CC=C3[C@@H]([C@H](O)[C@H]4O[C@@H]4C3...,0
12662,16758817.0,CCN1C(=O)[C@H]2CC=C3[C@@H]([C@H](O)[C@H]4O[C@@...,1
12663,16758818.0,C[C@H](c1ccccc1)N1C(=O)[C@@H]2[C@@H](CC[C@@H]3...,0


In [4]:
cleaned_df = df[['Drug', 'Y']]
cleaned_df.columns = ['SMILE','Target']
print(cleaned_df.head())
print(cleaned_df.shape)
cleaned_df.to_csv('data\cleaned_raw_data.csv',index=False)


                                             SMILE  Target
0           CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl       0
1       CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1       1
2                 Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1       1
3  COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1       1
4             COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1       1
(12665, 2)


### data/cleaned_raw_data.csv is the all of the SMILE, target Pairs. 

# Load a bunch of smiles to a .csv file that are similar to the training dataset. 

https://www.ebi.ac.uk/chembl/api/data/similarity/CN1C(=O)C=C(c2cccc(Cl)c2)c3cc(ccc13)[C@@](N)(c4ccc(Cl)cc4)c5cncn5C/80 
https://chembl.gitbook.io/chembl-interface-documentation/web-services/chembl-data-web-services

In [5]:
import requests
import json

call = f'https://www.ebi.ac.uk/chembl/api/data/similarity/CN1C(=O)C=C(c2cccc(Cl)c2)c3cc(ccc13)[C@@](N)(c4ccc(Cl)cc4)c5cncn5C/80?format=json'

response = requests.get(call)

In [6]:
resp_dict =json.loads(response.text)
resp_dict.keys()

dict_keys(['molecules', 'page_meta'])

In [7]:
molecules = resp_dict['molecules']
print(type(molecules))
print(len(molecules))

for i in molecules[:1]:
    print(type(molecules[0]))
    print(molecules[0].keys())

<class 'list'>
8
<class 'dict'>


In [8]:
molecules[0]['molecule_structures']['canonical_smiles']

'Cn1cncc1[C@](N)(c1ccc(Cl)cc1)c1ccc2c(c1)c(-c1cccc(Cl)c1)cc(=O)n2C'

In [9]:
def get_similar_smiles_from_chembl(smile:str, similarity_threshold=70):
    """
        Returns some number of a smile  for each molecule that is similar to smile that is above a similarity_threshold% tanimoto similarity cut off.
        Based on the molecules in the chembl database
    """
    try:
        call = f'https://www.ebi.ac.uk/chembl/api/data/similarity/{smile}/{similarity_threshold}?format=json'
        response = requests.get(call)
        molecules = json.loads(response.text)['molecules']
        similar_smile_strings = [drug['molecule_structures']['canonical_smiles'] for drug in molecules]
        return similar_smile_strings
    except:
        print('Request failed: error code: ' +str(response.status_code))
        print(smile)
        print(call)
        return []


In [10]:
training_df = cleaned_df.loc[:10000,:] # this is about 80% of the data reserved for training. The other 20% is not included. You might want to random shuffle it later
validaiton_df = cleaned_df.loc[10000:,:]
training_df.to_csv('data\in_training_data.csv',index=False)
validaiton_df.to_csv('data\in_validation_data.csv',index=False)


# This will go through all of the smiles in the entire dataset

To avoid leakage you might want to break this up into training and target datasets at this point so you don't let the data in the validation set overwhelm it.

In [11]:
train_df  = pd.read_csv('data\in_training_data.csv')
print(train_df.head())
print(train_df.shape)


                                             SMILE  Target
0           CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl       0
1       CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1       1
2                 Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1       1
3  COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1       1
4             COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1       1
(10001, 2)


In [12]:
smile_similar_pairs = [(smile, get_similar_smiles_from_chembl(smile)) for smile in list(train_df['SMILE'])]
# Creat a list of tupels of smile, (list) the smiles that are similar

# This should take a few minutes to run. using threshold of 70< I choose this arbritarily


Request failed: error code: 404
CCOC(=O)CSC1=C(C#N)C(C)C2=C(CCCC2=O)N1
https://www.ebi.ac.uk/chembl/api/data/similarity/CCOC(=O)CSC1=C(C#N)C(C)C2=C(CCCC2=O)N1/70?format=json
Request failed: error code: 404
CCSC1=C(C#N)C2(CCCCC2)C(C#N)=C(N)N1
https://www.ebi.ac.uk/chembl/api/data/similarity/CCSC1=C(C#N)C2(CCCCC2)C(C#N)=C(N)N1/70?format=json
Request failed: error code: 404
CN(CCC#N)CC(=O)Nc1ccc(NC(=O)CN(C)CCC#N)cc1
https://www.ebi.ac.uk/chembl/api/data/similarity/CN(CCC#N)CC(=O)Nc1ccc(NC(=O)CN(C)CCC#N)cc1/70?format=json
Request failed: error code: 404
N#CC1=C(N)OC2=C(C(=O)CCC2)C12CCC1(CC2)OCCO1
https://www.ebi.ac.uk/chembl/api/data/similarity/N#CC1=C(N)OC2=C(C(=O)CCC2)C12CCC1(CC2)OCCO1/70?format=json
Request failed: error code: 404
N#C/C(=C/N1CCOCC1)c1ccccc1
https://www.ebi.ac.uk/chembl/api/data/similarity/N#C/C(=C/N1CCOCC1)c1ccccc1/70?format=json
Request failed: error code: 404
CCn1c(SCc2ccc(C#N)cc2)nnc1-c1ccc(S(=O)(=O)N2CCCCC2)cc1
https://www.ebi.ac.uk/chembl/api/data/similarity/CCn1c(

In [15]:
# there is a much more elegant way to do this since you only need to write them to the file
with open('data/similar_smiles.csv','x') as f:
    for smile in list(train_df['SMILE']):
        smilar = get_similar_smiles_from_chembl(smile)
        f.writelines(smilar)
        print(f'Wrote {len(smilar)} lines')


Wrote 8 lines
Wrote 4 lines
Wrote 1 lines
Wrote 5 lines
Wrote 1 lines
Wrote 5 lines
Wrote 1 lines
Wrote 8 lines
Wrote 2 lines
Wrote 0 lines
Wrote 4 lines
Wrote 6 lines
Wrote 0 lines
Wrote 2 lines
Wrote 4 lines
Wrote 2 lines
Wrote 1 lines
Wrote 5 lines
Request failed: error code: 404
CCOC(=O)CSC1=C(C#N)C(C)C2=C(CCCC2=O)N1
https://www.ebi.ac.uk/chembl/api/data/similarity/CCOC(=O)CSC1=C(C#N)C(C)C2=C(CCCC2=O)N1/70?format=json
Wrote 0 lines
Wrote 2 lines
Wrote 1 lines
Request failed: error code: 404
CCSC1=C(C#N)C2(CCCCC2)C(C#N)=C(N)N1
https://www.ebi.ac.uk/chembl/api/data/similarity/CCSC1=C(C#N)C2(CCCCC2)C(C#N)=C(N)N1/70?format=json
Wrote 0 lines
Wrote 1 lines
Wrote 3 lines
Wrote 2 lines
Wrote 3 lines
Wrote 4 lines
Wrote 5 lines
Wrote 1 lines
Wrote 1 lines
Wrote 3 lines
Wrote 1 lines
Wrote 7 lines
Wrote 2 lines
Wrote 1 lines
Wrote 10 lines
Wrote 3 lines
Wrote 8 lines
Wrote 5 lines
Wrote 2 lines
Wrote 2 lines
Wrote 8 lines
Wrote 2 lines
Wrote 3 lines
Wrote 7 lines
Wrote 4 lines
Wrote 1 lines

## Save a bunch of similar smiles to the the data/similar_smiles.csv file. You will use this to extract common patterns to generate a dictionary for 1 hot feature encoding.