In [16]:
# Activate the project virtual environment before running this notebook: .\venv\Scripts\Activate.ps1

## Load the data

1. Loads data from PyTDC `ADME`
2. Loads data from DrugBank
3. Loads data from BioSNAP


In [17]:
from tdc.single_pred import ADME
import pandas as pd
# Load the CYP2C19_Veith dataset through the TDC ADME loader.
data = ADME(name='CYP2C19_Veith')

# Ask the loader for its default split of the dataset.
split = data.get_split()

Found local copy...
Loading...
Done!


In [18]:

# Read the tab-separated CYP2C19 file from the local data folder.
# A forward slash is used for the path: '\c' is not a valid escape sequence in a
# normal string literal (recent Python versions warn about it), and '/' is accepted
# as a separator on both Windows and POSIX systems.
df = pd.read_csv('data/cyp2c19_veith.tab', sep='\t')
df

Unnamed: 0,Drug_ID,Drug,Y
0,6602638.0,CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl,0
1,644675.0,CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1,1
2,644851.0,Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1,1
3,644890.0,COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1,1
4,644968.0,COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1,1
...,...,...,...
12660,16758814.0,CCN1C(=O)[C@@H]2[C@@H](CC[C@@H]3C(=O)C=C[C@@H]...,0
12661,16758816.0,O=C1[C@H]2CC=C3[C@@H]([C@H](O)[C@H]4O[C@@H]4C3...,0
12662,16758817.0,CCN1C(=O)[C@H]2CC=C3[C@@H]([C@H](O)[C@H]4O[C@@...,1
12663,16758818.0,C[C@H](c1ccccc1)N1C(=O)[C@@H]2[C@@H](CC[C@@H]3...,0


In [19]:
# Keep only the SMILES string and the label, renamed to 'SMILE' / 'Target'.
# `rename` builds the renamed frame in one expression instead of reassigning
# `.columns` on a column selection of `df`.
cleaned_df = df[['Drug', 'Y']].rename(columns={'Drug': 'SMILE', 'Y': 'Target'})
print(cleaned_df.head())
print(cleaned_df.shape)
# Forward slash instead of a backslash: '\c' is an invalid escape sequence in a
# normal string literal, and '/' works on both Windows and POSIX systems.
cleaned_df.to_csv('data/cleaned_raw_data.csv', index=False)


                                             SMILE  Target
0           CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl       0
1       CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1       1
2                 Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1       1
3  COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1       1
4             COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1       1
(12665, 2)


### `data/cleaned_raw_data.csv` contains all of the (SMILE, Target) pairs.

# Gather SMILES that are similar to the SMILES in the training data.

https://www.ebi.ac.uk/chembl/api/data/similarity/CN1C(=O)C=C(c2cccc(Cl)c2)c3cc(ccc13)[C@@](N)(c4ccc(Cl)cc4)c5cncn5C/80 
https://chembl.gitbook.io/chembl-interface-documentation/web-services/chembl-data-web-services

In [20]:
import requests
import json

# Example similarity query against ChEMBL: the SMILES is embedded in the path and
# 80 is the similarity threshold. A plain string is used because nothing is interpolated.
call = 'https://www.ebi.ac.uk/chembl/api/data/similarity/CN1C(=O)C=C(c2cccc(Cl)c2)c3cc(ccc13)[C@@](N)(c4ccc(Cl)cc4)c5cncn5C/80?format=json'

# The timeout keeps the cell from hanging indefinitely if the service never answers.
response = requests.get(call, timeout=60)

In [21]:
# Decode the JSON body of the similarity response into a Python dict.
resp_dict = json.loads(response.text)

# Show the top-level keys of the decoded payload as the cell output.
resp_dict.keys()

dict_keys(['molecules', 'page_meta'])

In [22]:
# The 'molecules' entry of the decoded response holds the returned hits.
molecules = resp_dict['molecules']
print(type(molecules))
print(len(molecules))

# Peek at the first hit only. Slicing (rather than indexing) means an empty list
# simply prints nothing, and the loop variable is used directly instead of
# re-indexing molecules[0] inside the loop.
for molecule in molecules[:1]:
    print(type(molecule))
    print(molecule.keys())

<class 'list'>
8
<class 'dict'>


In [23]:
# Canonical SMILES stored under the structures entry of the first returned molecule.
first_structures = molecules[0]['molecule_structures']
first_structures['canonical_smiles']

'Cn1cncc1[C@](N)(c1ccc(Cl)cc1)c1ccc2c(c1)c(-c1cccc(Cl)c1)cc(=O)n2C'

In [24]:
def get_similar_smiles_from_chembl(base_smile:str, similarity_threshold=70):
    """
    Return the canonical SMILES of ChEMBL molecules similar to ``base_smile``.

    Calls the ChEMBL similarity web service described here:
    https://chembl.gitbook.io/chembl-interface-documentation/web-services/chembl-data-web-services

    Parameters
    ----------
    base_smile : str
        The SMILES string you want similar SMILES for.
    similarity_threshold : int, optional
        The minimum Tanimoto similarity of the SMILES returned, passed to the
        service as-is in the URL. Defaults to 70.

    Returns
    -------
    list
        The ``canonical_smiles`` value of every entry under ``'molecules'`` in the
        response. Entries that carry no structure or no SMILES are skipped.
        An empty list is returned (and a message printed) when the request or
        the decoding of the response fails.

    Notes
    -----
    Only the response of a single request is read.
    NOTE(review): the payload also contains a ``'page_meta'`` key, so results are
    presumably paginated -- confirm whether further pages need to be fetched.
    """
    from urllib.parse import quote

    # SMILES use characters that have a special meaning inside a URL ('#', '/', '%', '+', ...),
    # so the string is percent-encoded before it is placed in the request path.
    encoded_smile = quote(str(base_smile), safe='')
    call = f'https://www.ebi.ac.uk/chembl/api/data/similarity/{encoded_smile}/{similarity_threshold}?format=json'

    try:
        # The timeout prevents one unanswered request from stalling a long batch of lookups.
        response = requests.get(call, timeout=60)
        response.raise_for_status()
        molecules = json.loads(response.text)['molecules']

        similar_smile_strings = []
        for drug in molecules:
            # A molecule without structure information must not discard the other hits.
            structures = drug.get('molecule_structures') or {}
            smiles = structures.get('canonical_smiles')
            if smiles:
                similar_smile_strings.append(smiles)
        return similar_smile_strings
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as error:
        # Best effort: report which SMILES failed and why, then carry on with no results.
        print('Request Failed:' + str(base_smile) + ' (' + repr(error) + ')')
        return []


In [25]:
# Positional split point: the first 10001 rows (about 80% of the data) are reserved for
# training and everything after them is held out for validation. You might want to
# random shuffle it later.
#
# The previous label-based slices `.loc[:10000]` and `.loc[10000:]` are inclusive on both
# ends, so row 10000 was written to the training AND the validation file. `.iloc` slices
# are half-open, which keeps the same training rows (given the default 0..n-1 index shown
# above) while making the two parts disjoint.
n_train_rows = 10001
training_df = cleaned_df.iloc[:n_train_rows]
validation_df = cleaned_df.iloc[n_train_rows:]

# Forward slashes: '\i' is an invalid escape sequence in a normal string literal, and '/'
# works as a path separator on both Windows and POSIX systems.
training_df.to_csv('data/in_training_data.csv', index=False)
validation_df.to_csv('data/in_validation_data.csv', index=False)

# Remove every in-memory reference to the validation rows so they cannot leak into later cells.
del training_df, validation_df, split, cleaned_df, df

# This will go through all of the SMILES in the training data

To avoid leakage, I will only use the data in `train_df`.

In [26]:
# Reload the training split from disk so that only training rows are in memory from here on.
# Forward slash instead of a backslash: '\i' is an invalid escape sequence in a normal
# string literal, and '/' works on both Windows and POSIX systems.
train_df = pd.read_csv('data/in_training_data.csv')
print(train_df.head())
print(train_df.shape)

                                             SMILE  Target
0           CCCC(=O)Nc1ccc(N2CCN(CC)CC2)c(Cl)c1.Cl       0
1       CC(=O)N(c1ccc2oc(=O)sc2c1)S(=O)(=O)c1cccs1       1
2                 Clc1ccccc1-c1nc(-c2ccccc2)n[nH]1       1
3  COc1ccccc1C(c1nnnn1C(C)(C)C)N1CCN(Cc2ccncc2)CC1       1
4             COc1ccccc1CNC(=O)Cn1nnc(-c2ccncc2)n1       1
(10001, 2)


In [27]:
# similar_smiles = [get_similar_smiles_from_chembl(smile,70) for smile in list(train_df['SMILE'])]
# unique_smiles = set()

# for row in similar_smiles:
#     for s in row:
#         unique_smiles.add(s)

# smiles_to_write = pd.Series(list(unique_smiles))
# # write all the unique smiles to similar_smiles.csv
# smiles_to_write.to_csv('data\similar_smiles.csv', index=False, header=['similar_smiles'])

# TODO: The similar-SMILES collection above is currently broken and still needs to be fixed.

In [28]:
# unique_smiles = set()

# for row in similar_smiles:
#     for s in row:
#         unique_smiles.add(s)

# smiles_to_write = pd.Series(list(unique_smiles))
# # write all the unique smiles to similar_smiles.csv
# smiles_to_write.to_csv('data\similar_smiles.csv', index=False, header=['similar_smiles'])

## The DeepChem Library has a variety of Molecule Featurizers.

Here I am setting up helper methods for a few different featurizers.

Docs on molecule Featurizers 

https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#molecule-featurizers


