In [1]:
import pandas as pd
from tqdm import tqdm
import os
import requests
import zipfile
from DeepPurpose import oneliner
from DeepPurpose.dataset import *

## Ligand conversion

In [2]:
DATA_FOLDER = "./molecules/"
url = 'https://storage.googleapis.com/indaba-challenge/molecules.zip'
r = requests.get(url, allow_redirects=True)
open('molecules.zip', 'wb').write(r.content)

# # Extract all the files with 
with zipfile.ZipFile('molecules.zip', 'r') as zip_ref:
    zip_ref.extractall(DATA_FOLDER)

In [3]:
extension = ".params"

In [4]:
import os
lines = None
smiles = 'SMILES'
DATA_FOLDER = "molecules/"
data = []
list_ligand_issues = []
start = -20
name = None
for filename in tqdm(os.listdir(DATA_FOLDER)):
    if filename.endswith(extension):
        filename = ".".join(filename.split('.')[:-1])+'.sdf'
        try:
            lines = open('{}{}'.format(DATA_FOLDER, filename)).read().split('\n')
        except:
            continue
        for num, line in enumerate(lines[start:]):
            if 'NAME' in line:
                name = lines[len(lines) + num + start + 1]
                continue
            if smiles in line:
                temp_smile = lines[len(lines) + num + start + 1]
                break
        if name is None:
            name = ".".join(filename.split('.')[:-1])
            
        name = "_".join(name.split()) + "_" + ".".join(filename.split('.')[:-1])
        if temp_smile is not None:
            data.append((name, temp_smile, filename))
        else:
            list_ligand_issues.append(DATA_FOLDER + filename)
        temp_smile = None
        name = None

100%|██████████| 12095/12095 [00:00<00:00, 39783.70it/s]


In [5]:
data = pd.DataFrame(data=data, columns=['Name', 'Smiles', 'Filename'])

In [6]:
data.head()

Unnamed: 0,Name,Smiles,Filename
0,pipethanate_RZWPJFMNFATBEG-UHFFFAOYSA-N,O=C(OCCN1CCCCC1)C(O)(c1ccccc1)c1ccccc1,RZWPJFMNFATBEG-UHFFFAOYSA-N.sdf
1,carboquone_SHHKQEUPHAENFK-SECBINFHSA-N,COC(COC(N)=O)C1=C(N2CC2)C(=O)C(C)=C(N2CC2)C1=O,SHHKQEUPHAENFK-SECBINFHSA-N.sdf
2,propoxycaine_CAJIGINSTLKQMM-UHFFFAOYSA-N,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC,CAJIGINSTLKQMM-UHFFFAOYSA-N.sdf
3,trimetazidine_UHWVSEOVJBQKBE-UHFFFAOYSA-N,COc1ccc(CN2CCNCC2)c(OC)c1OC,UHWVSEOVJBQKBE-UHFFFAOYSA-N.sdf
4,tafluprost_WSNODXPBBALQOF-VEJSHDCNSA-N,CC(C)OC(=O)CCC/C=C\C[C@H]1[C@@H](O)C[C@@H](O)[...,WSNODXPBBALQOF-VEJSHDCNSA-N.sdf


In [7]:
data.to_csv('Ligand_smiles.csv', index=False)

In [8]:
len(data)

4020

# Sample Drugs and Targets

In [13]:
targets = pd.read_csv('receptors.csv')[6:20]
ligands = pd.read_csv('Ligand_smiles.csv')

In [14]:
ligands.head()

Unnamed: 0,Name,Smiles,Filename
0,pipethanate_RZWPJFMNFATBEG-UHFFFAOYSA-N,O=C(OCCN1CCCCC1)C(O)(c1ccccc1)c1ccccc1,RZWPJFMNFATBEG-UHFFFAOYSA-N.sdf
1,carboquone_SHHKQEUPHAENFK-SECBINFHSA-N,COC(COC(N)=O)C1=C(N2CC2)C(=O)C(C)=C(N2CC2)C1=O,SHHKQEUPHAENFK-SECBINFHSA-N.sdf
2,propoxycaine_CAJIGINSTLKQMM-UHFFFAOYSA-N,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC,CAJIGINSTLKQMM-UHFFFAOYSA-N.sdf
3,trimetazidine_UHWVSEOVJBQKBE-UHFFFAOYSA-N,COc1ccc(CN2CCNCC2)c(OC)c1OC,UHWVSEOVJBQKBE-UHFFFAOYSA-N.sdf
4,tafluprost_WSNODXPBBALQOF-VEJSHDCNSA-N,CC(C)OC(=O)CCC/C=C\C[C@H]1[C@@H](O)C[C@@H](O)[...,WSNODXPBBALQOF-VEJSHDCNSA-N.sdf


In [15]:
targets.head()

Unnamed: 0,receptor,amino acid sequence
6,E9AJZ0_0_apo,MQPAQSPPVAPPSVPAAAPKKTPIDISALKLKMSPSVRATLAAAGV...
7,Q9GPZ9_0_apo,MKVENSKMGVKREQSHSNEDEEINEEDLNWWEQENLRIAMKGERRW...
8,Q9NGZ9_0_apo,DTVVGCCSLRVEHIQLMPDNIVRFDFLGKDSIRYQNDVAVLPEVYA...
9,A0A504XCW5_0_apo,MKVENSKMGVKREQSHSNEDEEINEEDLNWWEQENLRIAMKGERRW...
10,A4HPA0_0_apo,MVQTKEIALEQLALTLTGDASWSSGPIYVVCDVGGTSARVGFSQAS...


# Drug Selection

In [17]:
def formatLigand(ligands, filename):
    with open(filename, 'w') as f:
        for line in ligands.iterrows():
            f.write('{} {}\n'.format(line[1]['Name'], line[1]['Smiles']))

def formatTarget(target, filename):
    with open(filename, 'w') as f:
        for line in target.iterrows():
            f.write('{} {}\n'.format(line[1]['receptor'].split()[0], line[1]['amino acid sequence']))

def BatchDrugRepurposing(ligands, target, step, filenameLigands='repurpose.txt', filenameTarget='target.txt'):
    formatLigand(ligands, filenameLigands)
    formatTarget(target, filenameTarget)
    targetName = target.iloc[0]['receptor']
    oneliner.repurpose(*read_file_target_sequence(filenameTarget), *read_file_repurposing_library(filenameLigands))
    outputFilenameRepurposing ="{}_{}".format(targetName, step)
    os.rename(r'./save_folder/results_aggregation/repurposing.txt', r'./save_folder/results_aggregation/' + targetName + str(step) + '.txt')
    
def DrurgRepurposing(ligands, target, batchSize = 100):
    ligandsSize = len(ligands)
    nStep = ligandsSize // batchSize
    remain = ligandsSize % batchSize != 0
    for step in tqdm(range(nStep), position=0, leave=True):
        start, end = step * batchSize, (step + 1) * batchSize
        batchLigands = ligands[start:end]
        BatchDrugRepurposing(batchLigands, target, step)
    
    if remain:
        start, end = nStep * batchSize, ligandsSize
        batchLigands = ligands[start:end]
        BatchDrugRepurposing(batchLigands, target, nStep)

In [18]:
listTargets = targets['receptor'].tolist()

In [None]:
%%time
for target in tqdm(listTargets, position=0, leave=True):
    target = targets[targets['receptor'] == target]
    DrurgRepurposing(ligands, target)

  0%|          | 0/40 [00:00<?, ?it/s]

Save path not found or given and set to default: './save_folder/'. 
Loading customized repurposing dataset...
Beginning Downloading Pretrained Model...
Note: if you have already download the pretrained model before, please stop the program and set the input parameter 'pretrained_dir' to the path
Downloading finished... Beginning to extract zip file...
pretrained model Successfully Downloaded...
Using pretrained model and making predictions...
repurposing...
Drug Target Interaction Prediction Mode...
in total: 100 drug-target pairs
encoding drug...
unique drugs: 100
encoding protein...
unique target sequence: 1
Done.
predicting...
---------------
Predictions from model 1 with drug encoding MPNN and target encoding CNN are done...
-------------
repurposing...
Drug Target Interaction Prediction Mode...
in total: 100 drug-target pairs
encoding drug...
unique drugs: 100
encoding protein...
unique target sequence: 1
Done.
predicting...


## Data Merging

In [31]:
rootPath = './save_folder/results_aggregation/'

In [33]:
def readFile(rootPath, filename):
    data = []
    with open(rootPath+filename) as file:
        lines = file.readlines()[3:-1]
        for line in lines:
            line = line.replace(' ', '')
            items = line.split('|')
            data.append([items[2], items[3], items[4], None])
    os.system('rm {}'.format(rootPath, filename))
    return data

def mergeFile(targetName, rootPath, original=True):
    data = []
    for filename in tqdm(os.listdir(rootPath), position=0, leave=True):
        if targetName in filename:
            data.extend(readFile(rootPath, filename))
    data = pd.DataFrame(data=data, columns=['LigandName', 'TargetName', 'BindingScore', 'InterfE'])
    data = data.sort_values(by = ['BindingScore'])
    data.to_csv("Ligand_{}.csv".format(targetName), index=False)
    return data

In [34]:
for target in tqdm(listTargets, position=0, leave=True):
    mergeFile(target, rootPath)

100%|██████████| 411/411 [00:00<00:00, 9043.29it/s]
100%|██████████| 411/411 [00:00<00:00, 9449.69it/s]
100%|██████████| 411/411 [00:00<00:00, 10538.71it/s]
100%|██████████| 411/411 [00:00<00:00, 9281.23it/s]
100%|██████████| 411/411 [00:00<00:00, 8195.08it/s]
100%|██████████| 411/411 [00:00<00:00, 9327.74it/s]
100%|██████████| 411/411 [00:00<00:00, 8467.26it/s]
100%|██████████| 411/411 [00:00<00:00, 8388.20it/s]
100%|██████████| 411/411 [00:00<00:00, 10015.97it/s]
100%|██████████| 411/411 [00:00<00:00, 11126.98it/s]
100%|██████████| 411/411 [00:00<00:00, 1805087.90it/s]
100%|██████████| 411/411 [00:00<00:00, 1773517.43it/s]
100%|██████████| 411/411 [00:00<00:00, 1396968.35it/s]
100%|██████████| 13/13 [00:00<00:00, 18.43it/s]


In [48]:
for file in os.listdir('.'):
    if file.endswith('lig.csv') or file.endswith('apo.csv'):
        data = pd.read_csv(file).sort_values(by=['BindingScore'])
        data.to_csv(file, index=False)

In [49]:
result = pd.read_csv('Ligand_D0VWU6.0.apo.csv')

In [50]:
result.head()

Unnamed: 0,LigandName,TargetName,BindingScore,InterfE
0,mometasone_furoate_WOFMFGQZHJDGCX-ZULDAHANSA-N,D0VWU6.0.apo,4.29,
1,ulobetasol_propionate_BDSYKGHYMJNPAB-LICBFIPMSA-N,D0VWU6.0.apo,7.76,
2,clobetasone_butyrate_FBRAWBYQGRLCEK-AVVSTMBFSA-N,D0VWU6.0.apo,8.15,
3,clobetasol_propionate_CBGUOGMQLZIXBE-XGQKBEPLSA-N,D0VWU6.0.apo,8.16,
4,flumetasone_WXURHACBFYSXBI-GQKYHHCASA-N,D0VWU6.0.apo,10.53,


In [39]:
result[-1:]

Unnamed: 0,LigandName,TargetName,BindingScore,InterfE
1135,glucose_WQZGKKKJIJFFOK-VFUOTHLCSA-N,D0VWU6.0.apo,1829498.66,
