In [1]:
import os
import pandas as pd

In [5]:
# Root folder that holds the input datasets
DATA_DIR = 'data/data'

# Hint: call os.listdir(DATA_DIR) to see which files the folder contains

# Read the A2AR ligand table: tab-separated, column names on the first row,
# with 'NA' / 'nan' / 'NaN' passed as additional missing-value markers
df_molecules = pd.read_csv(
    f'{DATA_DIR}/A2AR_LIGANDS.tsv',
    sep='\t',
    header=0,
    na_values=('NA', 'nan', 'NaN'),
)
print(df_molecules)

                      Activity_ID Quality    source                       CID  \
0     AACWUFIIMOHGSO_on_P29274_WT    High  ChEMBL31   ChEMBL31.compound.91968   
1     AAEYTMMNWWKSKZ_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.131451   
2     AAGFKZWKWAMJNP_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.100375   
3     AANUKDYJZPKTKN_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.123484   
4     AASXHCGIIQCKEE_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.419145   
...                           ...     ...       ...                       ...   
4077  ZYXGKENMDDPQIE_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.399078   
4078  ZYZWFDVXMLCIOU_on_P29274_WT    High  ChEMBL31   ChEMBL31.compound.94689   
4079  ZZBZWSYDXUPJCT_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.131461   
4080  ZZMIPZLRKFEGIA_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.202018   
4081  ZZXIVHBZTITUIW_on_P29274_WT    High  ChEMBL31  ChEMBL31.compound.248509   

                           

In [6]:
from qsprpred.data import MoleculeTable

# Build a MoleculeTable called "A2AR" from the ligand TSV file and show
# the data frame it exposes
molecule_data = MoleculeTable.fromTableFile(
    name="A2AR",
    filename=f'{DATA_DIR}/A2AR_LIGANDS.tsv',
)
print(molecule_data.getDF())

                           Activity_ID Quality    source  \
QSPRID                                                     
A2AR_0000  AACWUFIIMOHGSO_on_P29274_WT    High  ChEMBL31   
A2AR_0001  AAEYTMMNWWKSKZ_on_P29274_WT    High  ChEMBL31   
A2AR_0002  AAGFKZWKWAMJNP_on_P29274_WT    High  ChEMBL31   
A2AR_0003  AANUKDYJZPKTKN_on_P29274_WT    High  ChEMBL31   
A2AR_0004  AASXHCGIIQCKEE_on_P29274_WT    High  ChEMBL31   
...                                ...     ...       ...   
A2AR_4077  ZYXGKENMDDPQIE_on_P29274_WT    High  ChEMBL31   
A2AR_4078  ZYZWFDVXMLCIOU_on_P29274_WT    High  ChEMBL31   
A2AR_4079  ZZBZWSYDXUPJCT_on_P29274_WT    High  ChEMBL31   
A2AR_4080  ZZMIPZLRKFEGIA_on_P29274_WT    High  ChEMBL31   
A2AR_4081  ZZXIVHBZTITUIW_on_P29274_WT    High  ChEMBL31   

                                CID  \
QSPRID                                
A2AR_0000   ChEMBL31.compound.91968   
A2AR_0001  ChEMBL31.compound.131451   
A2AR_0002  ChEMBL31.compound.100375   
A2AR_0003  ChEMBL31.comp

In [7]:
from qsprpred.data.descriptors.fingerprints import MorganFP
from scaffviz.clustering.manifold import TSNE
from scaffviz.depiction.plot import Plot

# Morgan fingerprint descriptor (radius 3, 2048 bits), attached to the
# molecule table with recalculate=False
morgan_fingerprint = MorganFP(radius=3, nBits=2048)
molecule_data.addDescriptors([morgan_fingerprint], recalculate=False)

# Plot object backed by a t-SNE manifold, reused by the plotting cell
tsne_plot = Plot(TSNE())

In [8]:
# Options shared by both chemical-space plots: colour each point by its
# median pChEMBL value using the "rdylgn" colour scale
shared_plot_options = {
    "color_by": "pchembl_value_Median",
    "color_continuous_scale": "rdylgn",
}

# Plot 1: non-interactive rendering
tsne_plot.plot(molecule_data, interactive=False, **shared_plot_options)

# Plot 2: interactive rendering on port 9292, with the activity and source
# columns passed as card data
tsne_plot.plot(
    molecule_data,
    interactive=True,
    card_data=["pchembl_value_Median", "source"],
    port=9292,
    viewport_height=800,
    **shared_plot_options,
)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [9]:
from drugex.training.generators import SequenceRNN
from drugex.data.corpus.vocabulary import VocSmiles

# Folder of the pretrained SMILES RNN and the GPU ids handed to the model
MODEL_PATH = "data/models/pretrained/smiles-rnn/Papyrus05.5_smiles_rnn_PT/"
GPU_IDS = [0]

# Vocabulary shipped with the pretrained model
vocab_file = os.path.join(MODEL_PATH, "Papyrus05.5_smiles_rnn_PT.vocab")
vocab = VocSmiles.fromFile(vocab_file, encode_frags=False)

# Create the generator with is_lstm=True and load the pretrained states
weights_file = os.path.join(MODEL_PATH, "Papyrus05.5_smiles_rnn_PT.pkg")
rnn_model = SequenceRNN(vocab, is_lstm=True, use_gpus=GPU_IDS)
rnn_model.loadStatesFromFile(weights_file)

In [11]:
# Sample 10,000 molecules from the pretrained generator and print the result
generated_molecules = rnn_model.generate(num_samples=10_000)
print(generated_molecules)


Generating molecules:   0%|          | 0/10000 [00:00<?, ?it/s]

                                                 SMILES  Valid
0     CC1CC(N(C(=O)CCCCCn2cc(Cl)cn2)c2ccc(C(=O)O)cc2...    1.0
1                    COCCN1CCc2nc(N)nc(-c3ccccc3)c2C1=O    1.0
2                     OC(CCCN1CCCCC1)(c1ccccc1)c1ccccc1    1.0
3     CN(c1ncnc2[nH]ccc12)C1CC(CS(=O)(=O)c2cc(Cl)ccc...    1.0
4                      COC(=O)C1C2CCC(CC1c1ccc(F)cc1)N2    1.0
...                                                 ...    ...
9995  O=C(NCC(c1cccs1)N1CCOCC1)c1ccc([N+](=O)[O-])cc...    1.0
9996     COCCN(CCOC)c1cc(C)nc2c(-c3c(C)cc(C)cc3C)cccc12    1.0
9997  CCC(NC(=O)c1cc(C(=O)NC(CC)c2ccc(F)cc2)n2c1COCC...    1.0
9998                   CCOc1ccccc1NC(=O)COn1nnc2ccccc21    1.0
9999                     O=C(C=Cc1cccs1)NNC(=O)c1ccncc1    1.0

[10000 rows x 2 columns]


In [12]:
from utils import smilesToGrid

# Draw the generated SMILES as a molecule grid (5 per row, 5 rows requested)
smilesToGrid(generated_molecules["SMILES"], molsPerRow=5, n_rows=5)

MolGridWidget()

In [14]:
from drugex.data.processing import Standardization, CorpusEncoder, RandomTrainTestSplitter
from drugex.data.corpus.corpus import SequenceCorpus
from drugex.data.datasets import SmilesDataSet
from drugex.logs import logger

# Output folder for the encoded RNN datasets; created if it is missing
STANDARDIZATION_DIR = "data/datasets/encoded/rnn/"
os.makedirs(STANDARDIZATION_DIR, exist_ok=True)

# Set the DrugEx logger level to ERROR
logger.setLevel('ERROR')

In [15]:
# Run the SMILES column of the ligand table through the DrugEx standardizer
# (12 processes, chunks of 1000) and preview the first ten results
smiles_standardizer = Standardization(n_proc=12, chunk_size=1000)
standardized_smiles = smiles_standardizer.apply(df_molecules["SMILES"])
print(standardized_smiles[:10])

Standardizing molecules (batch processing):   0%|          | 0/1 [00:00<?, ?it/s]

['Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)n1', 'Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12)CCCC3', 'O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1', 'CNC(=O)C12CC1C(n1cnc3c(NCc4cccc(Cl)c4)nc(C#CCCCCC(=O)OC)nc31)C(O)C2O', 'CCCn1c(=O)c2c(nc3cc(OC)ccn32)n(CCCNC(=O)c2ccc(S(C)(=O)=O)cc2)c1=O', 'Cn1c(-n2nccn2)nc2c(N)nc(CCc3ccccc3)nc21', 'Nc1nc(-c2ccccc2)cn2cc(-c3ccco3)nc12', 'CCCn1c(=O)c2c(nc3n2CCCN3c2ccc(OCCN3CCCC3)cc2)n(CCC)c1=O', 'N#Cc1c(-c2ccccc2)cc(-c2ccco2)nc1N', 'CCCn1c(=O)c2nc(-c3ccccc3)[nH]c2n(CCCOC)c1=O']


In [16]:
# Encode the standardized SMILES with the pretrained vocabulary.
# The options dict is forwarded to SequenceCorpus; 'update_voc' is False and
# 'throw' is True (presumably: keep the vocabulary fixed and raise on
# molecules it cannot encode — confirm against the DrugEx docs).
corpus_options = {'vocabulary': vocab, 'update_voc': False, 'throw': True}
smiles_encoder = CorpusEncoder(
    SequenceCorpus,
    corpus_options,
    n_proc=12,
    chunk_size=1000,
)

# Collect the encoded sequences in a dataset file, rewriting any previous one
corpus_file = os.path.join(STANDARDIZATION_DIR, 'ligand_corpus.tsv')
smiles_dataset = SmilesDataSet(corpus_file, rewrite=True)
smiles_encoder.apply(standardized_smiles, collector=smiles_dataset)

SequenceCorpus (batch processing):   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# Randomly split the encoded dataset into training and test sets.
# The first argument (0.1) is the test fraction, i.e. 10% of the data.
# NOTE(review): the second argument (1e4) is, per the DrugEx API
# (RandomTrainTestSplitter(test_size, max_test_size)), an upper bound on the
# number of test samples and not a random seed — confirm against the
# installed DrugEx version. No seed is set here, so the split may differ
# between runs.
data_splitter = RandomTrainTestSplitter(0.1, 1e4)
train_set, test_set = data_splitter(smiles_dataset.getData())

# Write each subset to its own tab-separated file in the encoded-data folder
split_columns = smiles_dataset.getColumns()
for name, dataset in (('train', train_set), ('test', test_set)):
    split_file = os.path.join(STANDARDIZATION_DIR, f'ligand_{name}.tsv')
    pd.DataFrame(dataset, columns=split_columns).to_csv(
        split_file,
        header=True,
        index=False,
        sep='\t',
    )

# Keep a copy of the vocabulary next to the encoded datasets
vocab.toFile(os.path.join(STANDARDIZATION_DIR, 'pretrained.vocab'))

In [18]:
# Batch size used by both data loaders
BATCH_SIZE = 256

# Training split -> dataset -> loader
train_file = os.path.join(STANDARDIZATION_DIR, 'ligand_train.tsv')
train_data_set = SmilesDataSet(train_file, voc=vocab)
train_data_loader = train_data_set.asDataLoader(batch_size=BATCH_SIZE)

# Test split -> dataset -> loader
test_file = os.path.join(STANDARDIZATION_DIR, 'ligand_test.tsv')
test_data_set = SmilesDataSet(test_file, voc=vocab)
test_data_loader = test_data_set.asDataLoader(batch_size=BATCH_SIZE)

In [19]:
from drugex.training.monitors import FileMonitor

# Where the fine-tuned model and its monitor files are written
FINE_TUNED_MODEL_DIR = "data/models/finetuned/rnn"
finetuned_model_path = os.path.join(FINE_TUNED_MODEL_DIR, 'a2ar_finetuned')

# Start from the pretrained states so training continues from them
pretrained_weights = os.path.join(MODEL_PATH, 'Papyrus05.5_smiles_rnn_PT.pkg')
finetuned_model = SequenceRNN(vocab, is_lstm=True, use_gpus=GPU_IDS)
finetuned_model.loadStatesFromFile(pretrained_weights)

# File-based monitor (save_smiles=True, reset_directory=True), then fit for
# 10 epochs on the training and test loaders
monitor = FileMonitor(finetuned_model_path, save_smiles=True, reset_directory=True)
finetuned_model.fit(
    train_data_loader,
    test_data_loader,
    epochs=10,
    monitor=monitor,
)

Fitting model:   0%|          | 0/10 [00:00<?, ?it/s]

In [20]:
# Save the vocabulary alongside the fine-tuned model. The target folder is
# created explicitly (as is done for STANDARDIZATION_DIR) so this write does
# not depend on another component having created it as a side effect.
os.makedirs(FINE_TUNED_MODEL_DIR, exist_ok=True)
vocab.toFile(os.path.join(FINE_TUNED_MODEL_DIR, 'finetuned.vocab'))