In [None]:
# Install into the running kernel's environment (%pip) and pin the version
# that the recorded output below shows was installed.
%pip install bunch==1.0.1

Collecting bunch
  Downloading bunch-1.0.1.zip (11 kB)
Building wheels for collected packages: bunch
  Building wheel for bunch (setup.py) ... [?25ldone
[?25h  Created wheel for bunch: filename=bunch-1.0.1-py3-none-any.whl size=7075 sha256=7063afbfc18a40b7543922b71c19188ee38be57237f3fd1edd56747ddffd85b9
  Stored in directory: /root/.cache/pip/wheels/10/ad/12/a8818fda74a365129e0f316c41a12dead904b60534d2114448
Successfully built bunch
Installing collected packages: bunch
Successfully installed bunch-1.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [None]:
import pandas as pd  # Importing Pandas for data manipulation
import os  # Importing os for interacting with the operating system
import time  # Importing time for time-related functions
import json  # Importing json for JSON serialization and deserialization
from bunch import Bunch  # Importing Bunch for easy dot access to nested dictionaries
import copy  # Importing copy for shallow and deep copy operations
import numpy as np  # Importing NumPy for numerical operations
import sys  # Importing sys for system-specific parameters and functions
import tensorflow as tf  # Importing TensorFlow for deep learning
from tensorflow.keras import Sequential  # Importing Sequential for sequential model construction
from tensorflow.keras.models import model_from_json  # Importing model_from_json for loading models from JSON
from tensorflow.keras.layers import LSTM, Dense  # Importing LSTM and Dense layers
from tensorflow.keras.initializers import RandomNormal  # Importing RandomNormal for weight initialization
from tqdm import tqdm  # Importing tqdm for progress bars
from tensorflow.keras.utils import Sequence  # Importing Sequence for custom data generators

In [None]:
import time  # Importing the time module for time-related functions
import numpy as np  # Importing NumPy for numerical operations

class SmilesTokenizer(object):
    """Greedy first-match tokenizer and one-hot encoder for SMILES strings.

    Attributes:
        table (list[str]): Vocabulary in matching order: atoms (two-letter
            symbols first), then special characters, then the three control
            tokens 'G', 'A' and 'E'.
        table_len (int): Number of entries in ``table``.
        one_hot_dict (dict[str, np.ndarray]): Maps each symbol to its
            one-hot float32 vector of length ``table_len``.
    """

    def __init__(self):
        # Atom symbols recognised in SMILES strings.
        atoms = [
            'Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br', 'Sn', 'Te', 'Cn',
            'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'K', 'V', 'I'
        ]
        # Brackets, bonds, ring-closure digits, charges and aromatic atoms.
        special = [
            '(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            '+', '-', 'se', 'te', 'c', 'n', 'o', 's'
        ]
        # Control tokens; elsewhere in this notebook 'G' is prepended as the
        # start token, 'E' is appended as the end token and 'A' fills padding.
        padding = ['G', 'A', 'E']

        # Two-letter atoms are sorted to the front so that e.g. 'Cl' is tried
        # before 'C'. The order also fixes each symbol's one-hot index, so it
        # must not change for weights trained with this table.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        # Pre-compute one one-hot vector per symbol.
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split a SMILES string into vocabulary tokens.

        At each position the first symbol of ``self.table`` that matches is
        consumed. If no symbol matches, tokenization stops and the tokens
        collected so far are returned (the unrecognised tail is dropped).

        Args:
            smiles (str): SMILES string to tokenize.

        Returns:
            list[str]: Tokens in order of appearance.
        """
        N = len(smiles)
        i = 0
        token = []

        while i < N:
            for symbol in self.table:
                if smiles.startswith(symbol, i):
                    token.append(symbol)
                    i += len(symbol)
                    break
            else:
                # No symbol matches at position i. Stop immediately instead
                # of spinning on the same position until a wall-clock timeout
                # expires; the returned prefix is the same.
                break
        return token

    def one_hot_encode(self, tokenized_smiles):
        """One-hot encode a token list as a single-sample batch.

        Args:
            tokenized_smiles (list[str]): Tokens, each a key of
                ``self.one_hot_dict``.

        Returns:
            np.ndarray: float32 array of shape
            ``(1, len(tokenized_smiles), table_len)``.

        Raises:
            KeyError: If a token is not in the vocabulary.
        """
        vectors = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
        result = np.array(vectors, dtype=np.float32)
        # Explicit target shape keeps an empty token list well-defined
        # (shape (1, 0, table_len)) instead of failing on result.shape[1].
        return result.reshape(1, len(vectors), self.table_len)

In [None]:
import os  # Importing os for interacting with the operating system
import time  # Importing time for time-related functions
import json  # Importing json for JSON serialization and deserialization
from bunch import Bunch  # Importing Bunch for easy dot access to nested dictionaries

def get_config_from_json(json_file):
    """
    Read a JSON configuration file into an attribute-accessible Bunch.

    Args:
        json_file (str): Path to the JSON file containing configuration.

    Returns:
        config (Bunch): Top-level JSON keys exposed as attributes.
    """
    with open(json_file, 'r') as handle:
        # Parse and wrap in one step; the Bunch only wraps the top-level dict.
        return Bunch(json.load(handle))

def process_config(json_file):
    """
    Load a JSON configuration and attach run-specific output paths.

    Adds the following attributes to the loaded configuration:
    ``config_file`` (the path that was passed in), ``exp_dir``
    (``experiments/<YYYY-MM-DD>/<exp_name>``), ``tensorboard_log_dir``
    (``<exp_dir>/logs/``) and ``checkpoint_dir`` (``<exp_dir>/checkpoints/``).

    Args:
        json_file (str): Path to the JSON file containing configuration.
            The file must define ``exp_name``.

    Returns:
        config (Bunch): Processed configuration.
    """
    config = get_config_from_json(json_file)
    config.config_file = json_file

    # Format the date once so all three directories are guaranteed to share
    # the same date component, even if the call straddles midnight.
    date_dir = time.strftime('%Y-%m-%d/', time.localtime())
    config.exp_dir = os.path.join('experiments', date_dir, config.exp_name)
    config.tensorboard_log_dir = os.path.join(config.exp_dir, 'logs/')
    config.checkpoint_dir = os.path.join(config.exp_dir, 'checkpoints/')
    return config

In [None]:
import os  # Importing os for interacting with the operating system
import sys  # Importing sys for system-specific parameters and functions

def create_dirs(dirs):
    """
    Create directories (including missing parents) if they do not exist.

    Args:
        dirs (list): List of directory paths to be created.

    Raises:
        SystemExit: With exit status 1 if any directory cannot be created
            (the underlying OSError is printed first). This includes the
            case where a path already exists but is not a directory.
    """
    try:
        for dir_ in dirs:
            # exist_ok=True makes the call idempotent without a separate
            # exists() check, which could race with another process.
            os.makedirs(dir_, exist_ok=True)
    except OSError as err:
        print(f'Creating directories error: {err}')
        # Non-zero status so callers/shells can tell the run failed.
        sys.exit(1)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.initializers import RandomNormal
import os

class LSTMChem(object):
    """Wrapper around the Keras LSTM model used for SMILES generation."""

    def __init__(self, config, session='train'):
        """
        Initialize LSTMChem object by loading a saved model.

        For every session type the architecture is read from
        ``config.model_arch_filename`` and the weights from
        ``config.model_weight_filename``. In the 'train' session the loaded
        model is additionally compiled with ``config.optimizer`` and
        categorical cross-entropy loss. ``build_model`` is not called here.

        Args:
            config (Bunch): Configuration object containing model parameters.
            session (str): Type of session ('train', 'generate', or 'finetune').
        """
        assert session in ['train', 'generate', 'finetune'], \
            'Session must be one of {train, generate, finetune}'

        self.config = config
        self.session = session

        # All sessions start from the saved architecture and weights.
        self.model = self.load(self.config.model_arch_filename,
                               self.config.model_weight_filename)

        if self.session == 'train':
            # Only the training session needs an optimizer and loss here.
            self.model.compile(optimizer=self.config.optimizer,
                               loss='categorical_crossentropy')

    def build_model(self):
        """
        Build a fresh LSTM model, save its architecture and compile it.

        The network is two stacked LSTM layers (``config.units`` units,
        dropout 0.3, returning sequences) followed by a softmax Dense layer
        over the tokenizer's vocabulary. The architecture JSON is written to
        ``<config.exp_dir>/model_arch.json`` and that path is stored in
        ``config.model_arch_filename`` so that ``load`` can find it later.
        """
        st = SmilesTokenizer()
        n_table = len(st.table)  # vocabulary size = input and output width
        weight_init = RandomNormal(mean=0.0,
                                   stddev=0.05,
                                   seed=self.config.seed)

        self.model = Sequential()
        self.model.add(
            LSTM(units=self.config.units,
                 input_shape=(None, n_table),
                 return_sequences=True,
                 kernel_initializer=weight_init,
                 dropout=0.3))
        self.model.add(
            LSTM(units=self.config.units,
                 input_shape=(None, n_table),
                 return_sequences=True,
                 kernel_initializer=weight_init,
                 dropout=0.3))
        self.model.add(
            Dense(units=n_table,
                  activation='softmax',
                  kernel_initializer=weight_init))

        arch = self.model.to_json(indent=2)
        self.config.model_arch_filename = os.path.join(self.config.exp_dir,
                                                       'model_arch.json')
        # Actually persist the architecture: the config now points at this
        # file, so it must exist for a later load() to succeed.
        os.makedirs(self.config.exp_dir, exist_ok=True)
        with open(self.config.model_arch_filename, 'w') as f:
            f.write(arch)

        self.model.compile(optimizer=self.config.optimizer,
                           loss='categorical_crossentropy')

    def save(self, checkpoint_path):
        """
        Save the model weights.

        Args:
            checkpoint_path (str): Path to save the model weights.

        Raises:
            AssertionError: If no model has been built or loaded.
        """
        assert self.model, 'You have to build the model first.'

        print('Saving model ...')
        self.model.save_weights(checkpoint_path)
        print('Model saved.')

    def load(self, model_arch_file, checkpoint_file):
        """
        Load the model from architecture and checkpoint files.

        Args:
            model_arch_file (str): Path to the model architecture file (JSON
                as produced by ``Model.to_json``).
            checkpoint_file (str): Path to the model checkpoint (weights) file.

        Returns:
            model: Loaded (uncompiled) model.
        """
        print(f'Loading model architecture from {model_arch_file} ...')
        with open(model_arch_file) as f:
            model = model_from_json(f.read())
        print(f'Loading model checkpoint from {checkpoint_file} ...')
        model.load_weights(checkpoint_file)
        print('Model loaded.')
        return model

In [None]:
from tqdm import tqdm
import numpy as np

In [None]:
# Install into the running kernel's environment (%pip) and pin the version
# that the recorded output below shows was installed.
%pip install rdkit==2023.3.2

Collecting rdkit
  Downloading rdkit-2023.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
[K     |████████████████████████████████| 29.5 MB 89 kB/s eta 0:00:015
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [None]:
import numpy as np
from tqdm import tqdm
from rdkit import Chem, RDLogger

class LSTMChemGenerator(object):
    """Samples SMILES strings token by token from a loaded LSTMChem model."""

    def __init__(self, modeler):
        """
        Initialize LSTMChemGenerator object.

        Args:
            modeler: LSTMChem object used for generation; its ``session``,
                ``model`` and ``config`` are reused.
        """
        self.session = modeler.session
        self.model = modeler.model
        self.config = modeler.config
        self.st = SmilesTokenizer()

    def _generate(self, sequence):
        """
        Extend a seed sequence until the end token or the length limit.

        The whole sequence is re-tokenized on every step (rather than
        appending the sampled token to a token list) because two adjacent
        sampled symbols can merge into one vocabulary token.

        Args:
            sequence (str): Non-empty seed, normally the start token 'G'.

        Returns:
            str: Generated SMILES with the first character (the start token)
            and any trailing 'E' characters removed.
        """
        while sequence[-1] != 'E':
            # Tokenize once per step and reuse the result for both the
            # length check and the model input.
            tokens = self.st.tokenize(sequence)
            if len(tokens) > self.config.smiles_max_length:
                break
            x = self.st.one_hot_encode(tokens)
            # Distribution over the next token = last time step of sample 0.
            preds = self.model.predict_on_batch(x)[0][-1]
            next_idx = self.sample_with_temp(preds)
            sequence += self.st.table[next_idx]

        sequence = sequence[1:].rstrip('E')
        return sequence

    def sample_with_temp(self, preds):
        """
        Sample the next token index with temperature.

        Computes ``softmax(log(preds) / config.sampling_temp)`` and draws one
        index from it. The maximum logit is subtracted before exponentiation
        so that low temperatures cannot underflow every term to zero (which
        would produce NaN probabilities).

        Args:
            preds (np.array): 1-D array of token probabilities from the model.

        Returns:
            int: Index of the sampled token.
        """
        logits = np.log(np.asarray(preds, dtype=np.float64)) / self.config.sampling_temp
        logits = logits - np.max(logits)  # largest term becomes exp(0) == 1
        weights = np.exp(logits)
        probs = weights / np.sum(weights)
        return np.random.choice(len(probs), p=probs)

    def sample(self, num=1, start='G'):
        """
        Generate SMILES sequences.

        In the 'generate' session exactly ``num`` raw strings are returned
        without any validity check. In any other session generation repeats
        until ``num`` strings that RDKit can parse have been collected, and
        their canonical SMILES are returned; this loop has no upper bound on
        the number of attempts.

        Args:
            num (int): Number of SMILES sequences to generate.
            start (str): Starting token for generation.

        Returns:
            list: List of generated SMILES sequences.
        """
        sampled = []
        if self.session == 'generate':
            for _ in tqdm(range(num)):
                sampled.append(self._generate(start))
            return sampled
        else:
            RDLogger.DisableLog('rdApp.*')  # silence RDKit parse errors
            while len(sampled) < num:
                sequence = self._generate(start)
                mol = Chem.MolFromSmiles(sequence)
                if mol is not None:
                    canon_smiles = Chem.MolToSmiles(mol)
                    sampled.append(canon_smiles)
            return sampled

In [None]:
from glob import glob
import os
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

class LSTMChemTrainer(object):
    """Runs Keras training for an LSTMChem model with checkpoint/TensorBoard callbacks."""

    def __init__(self, modeler, train_data_loader, valid_data_loader):
        """
        Initialize LSTMChemTrainer object.

        Args:
            modeler: LSTMChem object used for training.
            train_data_loader: Data loader for training data.
            valid_data_loader: Data loader for validation data.
        """
        self.model = modeler.model
        self.config = modeler.config
        self.train_data_loader = train_data_loader
        self.valid_data_loader = valid_data_loader
        self.callbacks = []
        self.init_callbacks()

    def init_callbacks(self):
        """
        Initialize callbacks for training.

        Appends a ModelCheckpoint (files named
        ``<exp_name>-<epoch>-<val_loss>.hdf5`` inside ``config.checkpoint_dir``)
        and a TensorBoard logger writing to ``config.tensorboard_log_dir``.
        """
        self.callbacks.append(
            ModelCheckpoint(
                filepath=os.path.join(
                    self.config.checkpoint_dir,
                    '%s-{epoch:02d}-{val_loss:.2f}.hdf5' %
                    self.config.exp_name),
                monitor=self.config.checkpoint_monitor,
                mode=self.config.checkpoint_mode,
                save_best_only=self.config.checkpoint_save_best_only,
                save_weights_only=self.config.checkpoint_save_weights_only,
                verbose=self.config.checkpoint_verbose,
            ))
        self.callbacks.append(
            TensorBoard(
                log_dir=self.config.tensorboard_log_dir,
                write_graph=self.config.tensorboard_write_graph,
            ))

    def train(self, epochs=1):
        """
        Train the model and write the configuration next to the results.

        Args:
            epochs (int): Number of epochs to train for. Defaults to 1, the
                value this method previously hard-coded.

        Side effects:
            Stores the object returned by ``fit`` in ``self.history`` and
            writes ``config.json`` into ``config.exp_dir`` (created if
            missing).
        """
        # Model.fit accepts Sequence loaders directly; fit_generator is the
        # deprecated spelling of the same call.
        self.history = self.model.fit(
            self.train_data_loader,
            steps_per_epoch=len(self.train_data_loader),
            epochs=epochs,
            verbose=self.config.verbose_training,
            validation_data=self.valid_data_loader,
            validation_steps=len(self.valid_data_loader),
            use_multiprocessing=True,
            shuffle=True,
            callbacks=self.callbacks
        )

        # The experiment directory is not guaranteed to exist at this point.
        os.makedirs(self.config.exp_dir, exist_ok=True)
        with open(os.path.join(self.config.exp_dir, 'config.json'), 'w') as f:
            f.write(self.config.toJSON(indent=2))

In [None]:
from transformers import pipeline

# Stand-alone demo: this cell shares no names with the LSTM classes defined
# in this notebook. It builds a Hugging Face fill-mask pipeline, using the
# same hub identifier for both the model and its tokenizer.
fill_mask = pipeline(
    "fill-mask",
    model='mrm8488/chEMBL_smiles_v1',
    tokenizer='mrm8488/chEMBL_smiles_v1'
)

# Input SMILES whose final token is replaced by the <mask> placeholder
smile1 = "CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)<mask>"

# Ask the pipeline for candidate completions of the masked position
predictions = fill_mask(smile1)

# In the recorded output this is a list of dicts with 'score', 'sequence'
# and 'token' keys, one per candidate
print(predictions)

[{'score': 0.6040295958518982,
  'sequence': '<s> CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)nc</s>',
  'token': 265},
 {'score': 0.2185731679201126,
  'sequence': '<s> CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)N</s>',
  'token': 50},
 {'score': 0.0642734169960022,
  'sequence': '<s> CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)cc</s>',
  'token': 261},
 {'score': 0.01932266168296337,
  'sequence': '<s> CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)CCCl</s>',
  'token': 452},
 {'score': 0.005068355705589056,
  'sequence': '<s> CC(C)CN(CC(OP(=O)(O)O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(=O)c1ccc(N)C</s>',
  'token': 39}]

In [None]:
import json
import os
import numpy as np
from tqdm import tqdm

In [None]:
from tensorflow.keras.utils import Sequence
from tqdm import tqdm
import numpy as np

class DataLoader(Sequence):
    """Keras Sequence yielding one-hot encoded (input, target) SMILES batches.

    ``data_type`` selects the view of the data:

    * 'train' / 'valid' read ``config.data_filename`` and share one seeded
      shuffle of the sample indices; the first ``valid_size`` shuffled
      positions form the validation split, the rest the training split.
    * 'finetune' reads ``config.finetune_data_filename`` and uses every
      sample with ``config.finetune_batch_size``.

    ``data_type`` is consulted on every ``__len__``/``__getitem__`` call, so
    a shallow copy of a 'train' loader with ``data_type`` set to 'valid'
    yields the validation split without loading or tokenizing again.
    """

    def __init__(self, config, data_type='train'):
        self.config = config
        self.data_type = data_type
        assert self.data_type in ['train', 'valid', 'finetune']

        # Longest tokenized SMILES seen; drives padding in _pad().
        self.max_len = 0

        if self.data_type == 'finetune':
            self.smiles = self._load(self.config.finetune_data_filename)
        else:
            # 'train' and 'valid' are two splits of the same file. Loading it
            # for 'valid' too lets DataLoader(config, 'valid') be constructed
            # directly instead of failing on a missing self.smiles.
            self.smiles = self._load(self.config.data_filename)

        self.st = SmilesTokenizer()
        self.one_hot_dict = self.st.one_hot_dict

        self.tokenized_smiles = self._tokenize(self.smiles)

        if self.data_type in ['train', 'valid']:
            self.idx = np.arange(len(self.tokenized_smiles))
            self.valid_size = int(
                np.ceil(
                    len(self.tokenized_smiles) * self.config.validation_split))
            # Seeds NumPy's global RNG so the split is reproducible.
            np.random.seed(self.config.seed)
            np.random.shuffle(self.idx)

    def _split_positions(self):
        """Return the slice of ``self.idx`` belonging to the current split."""
        if self.data_type == 'train':
            return self.idx[self.valid_size:]
        return self.idx[:self.valid_size]

    def _batch_size(self):
        """Return the batch size configured for the current data type."""
        if self.data_type in ['train', 'valid']:
            return self.config.batch_size
        return self.config.finetune_batch_size

    def _num_samples(self):
        """Return the number of samples in the current split."""
        if self.data_type in ['train', 'valid']:
            return len(self._split_positions())
        return len(self.tokenized_smiles)

    def _set_data(self):
        """Materialize the full list of tokenized SMILES for the current split.

        Each shuffled value ``i`` is mapped through ``self.idx`` a second time
        (``self.idx[i]``). Because ``self.idx`` is a permutation, the two
        splits stay disjoint and together cover all samples. The mapping is
        kept exactly as is so that the split matches the one used when
        existing checkpoints were trained.
        """
        if self.data_type == 'train':
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[self.valid_size:]
            ]
        elif self.data_type == 'valid':
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[:self.valid_size]
            ]
        else:
            ret = self.tokenized_smiles
        return ret

    def _load(self, data_filename):
        """Read one SMILES per line; keep the first ``config.data_length`` (0 = all)."""
        length = self.config.data_length
        print('Loading SMILES...')
        with open(data_filename) as f:
            smiles = [s.rstrip() for s in tqdm(f)]
        if length != 0:
            smiles = smiles[:length]
        print('Done loading SMILES.')
        return smiles

    def _tokenize(self, smiles):
        """Tokenize every SMILES string and record the maximum token count.

        The maximum length is only tracked for 'train' and 'valid'; for
        'finetune' ``max_len`` stays 0, so ``_pad`` adds no filler tokens.
        """
        assert isinstance(smiles, list)
        print('Tokenizing SMILES...')
        tokenized_smiles = [self.st.tokenize(smi) for smi in tqdm(smiles)]

        if self.data_type in ['train', 'valid']:
            longest = max((len(t) for t in tokenized_smiles), default=0)
            if self.max_len < longest:
                self.max_len = longest
            self.config.train_smi_max_len = self.max_len
        print('Done tokenizing SMILES.')
        return tokenized_smiles

    def __len__(self):
        """Return the number of batches in the current split."""
        # Computed from index lengths only; no per-call list rebuild.
        return int(
            np.ceil(self._num_samples() / float(self._batch_size())))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(X, y)`` float32 arrays.

        ``X`` holds each padded sequence without its last token and ``y`` the
        same sequence without its first token (next-token targets). The
        arrays are also stored on ``self.X`` / ``self.y``.
        """
        batch_size = self._batch_size()
        start, stop = idx * batch_size, (idx + 1) * batch_size

        if self.data_type in ['train', 'valid']:
            # Slice the index array first, then look up only this batch's
            # samples, selecting the same elements as
            # _set_data()[start:stop] without building the whole split.
            positions = self._split_positions()[start:stop]
            data = [self.tokenized_smiles[self.idx[i]] for i in positions]
        else:
            data = self.tokenized_smiles[start:stop]
        data = self._padding(data)

        self.X, self.y = [], []
        for tp_smi in data:
            X = [self.one_hot_dict[symbol] for symbol in tp_smi[:-1]]
            self.X.append(X)
            y = [self.one_hot_dict[symbol] for symbol in tp_smi[1:]]
            self.y.append(y)

        self.X = np.array(self.X, dtype=np.float32)
        self.y = np.array(self.y, dtype=np.float32)

        return self.X, self.y

    def _pad(self, tokenized_smi):
        """Wrap a token list as ['G'] + tokens + ['E'] and fill with 'A' up to max_len."""
        return ['G'] + tokenized_smi + ['E'] + [
            'A' for _ in range(self.max_len - len(tokenized_smi))
        ]

    def _padding(self, data):
        """Apply ``_pad`` to every token list in ``data``."""
        padded_smiles = [self._pad(t_smi) for t_smi in data]
        return padded_smiles

In [None]:
import tensorflow

# Check if GPU is available. tf.test.is_gpu_available() is deprecated;
# listing the physical GPU devices is its documented replacement.
gpu_available = bool(tensorflow.config.list_physical_devices('GPU'))

In [None]:
# Path to the experiment configuration (relative to the kernel's working directory)
CONFIG_FILE = '../input/setscmpz/config.json'

# Load the JSON settings and add the dated output paths
# (config_file, exp_dir, tensorboard_log_dir, checkpoint_dir)
config = process_config(CONFIG_FILE)


In [None]:
# Training session: LSTMChem loads the architecture and weights named in the
# config (model_arch_filename / model_weight_filename) and compiles the model
# with config.optimizer, so training continues from the saved weights.
modeler = LSTMChem(config, session='train')

Loading model architecture from ../input/setscmpz/model_arch.json ...
Loading model checkpoint from ../input/setscmpz/LSTM_Chem-baseline-model-full.hdf5 ...
Model loaded.


In [None]:
# Build the training loader: reads config.data_filename, tokenizes every
# SMILES string (the slow step; about five minutes in the recorded output
# below) and shuffles the sample indices with config.seed.
train_dl = DataLoader(config, data_type='train')

98845it [00:00, 988441.52it/s]

Loading SMILES...


438552it [00:00, 1124209.54it/s]
  0%|          | 137/438552 [00:00<05:20, 1366.68it/s]

Done loading SMILES.
Tokenizing SMILES...


100%|██████████| 438552/438552 [05:19<00:00, 1371.81it/s]


Done tokenizing SMILES.


In [None]:
# Shallow-copy the training loader so the already tokenized SMILES and the
# shuffled index are reused. `copy` is imported as a module in this notebook,
# so the function must be called as copy.copy(); calling the module itself
# raises TypeError.
valid_dl = copy.copy(train_dl)
# Switching the data type makes the copy serve the validation split
valid_dl.data_type = 'valid'

In [None]:
# The trainer takes the model and config from `modeler` and registers the
# ModelCheckpoint and TensorBoard callbacks.
trainer = LSTMChemTrainer(modeler, train_dl, valid_dl)
# Runs the fit call and then writes config.json into config.exp_dir.
# NOTE(review): train() is defined with epochs=1, but the recorded output
# below shows "Epoch N/100" — the output looks stale; confirm.
trainer.train()

Epoch 1/100
Average Loss: 0.5031 - Validation Loss: 0.3594

Epoch 2/100
Average Loss: 0.5031 - Validation Loss: 0.4568

Epoch 3/100
Average Loss: 0.5031 - Validation Loss: 0.4013

Epoch 4/100
Average Loss: 0.5031 - Validation Loss: 0.9557

Epoch 5/100
Average Loss: 0.5031 - Validation Loss: 0.3211

Epoch 6/100
Average Loss: 0.5031 - Validation Loss: 0.9875

Epoch 7/100
Average Loss: 0.5031 - Validation Loss: 0.9134

Epoch 8/100
Average Loss: 0.5031 - Validation Loss: 0.9759

Epoch 9/100
Average Loss: 0.5031 - Validation Loss: 0.2086

Epoch 10/100
Average Loss: 0.5031 - Validation Loss: 0.4830

Epoch 11/100
Average Loss: 0.5031 - Validation Loss: 0.8107

Epoch 12/100
Average Loss: 0.5031 - Validation Loss: 0.4590

Epoch 13/100
Average Loss: 0.5031 - Validation Loss: 0.7267

Epoch 14/100
Average Loss: 0.5031 - Validation Loss: 0.5274

Epoch 15/100
Average Loss: 0.5031 - Validation Loss: 0.7287

Epoch 16/100
Average Loss: 0.5031 - Validation Loss: 0.7953

Epoch 17/100
Average Loss: 0.5031

In [None]:
# Save the weights of the trained model to an HDF5 file in the kernel's
# working directory (weights only; the architecture is not stored here)
trainer.model.save_weights('LSTM_Chem-baseline-model-full-5.hdf5')  # relative path


In [None]:
# Rebind `modeler` to a generation session. LSTMChem loads the weights named
# by config.model_weight_filename, which is not changed by the previous cell,
# so the weights just saved there are not the ones loaded here.
modeler = LSTMChem(config, session='generate')  # model is loaded but not compiled

# Create a generator using the modeler
generator = LSTMChemGenerator(modeler)  # shares the modeler's model, config and session

# Print the configuration
print(config)  # includes the paths added by process_config

batch_size: 512
checkpoint_dir: experiments/2024-02-10/LSTM_Chem/checkpoints/
checkpoint_mode: min
checkpoint_monitor: val_loss
checkpoint_save_best_only: false
checkpoint_save_weights_only: true
checkpoint_verbose: 1
config_file: /kaggle/input/setscmpz/config.json
data_filename: ../input/setscmpz/dataset_cleansed.smi
data_length: 0
exp_dir: experiments/2024-02-10/LSTM_Chem
exp_name: LSTM_Chem
finetune_batch_size: 1
finetune_data_filename: ../input/setscmpz/gen0.smi
finetune_epochs: 20
model_arch_filename: ../input/setscmpz/model_arch.json
model_weight_filename: ../input/setscmpz/LSTM_Chem-24-0.21.hdf5
num_epochs: 42
optimizer: adam
sampling_temp: 0.75
seed: 71
smiles_max_length: 128
tensorboard_log_dir: experiments/2024-02-10/LSTM_Chem/logs/
tensorboard_write_graph: true
train_smi_max_len: 128
units: 256
validation_split: 0.1
verbose_training: true



In [None]:
# Re-create the generator from the same modeler (the previous cell already
# built one; this makes the cell runnable on its own once `modeler` exists)
generator = LSTMChemGenerator(modeler)

# Number of SMILES strings to generate
sample_number = 100

# In the 'generate' session, sample() returns exactly `num` raw strings and
# does not filter out invalid molecules
sampled_smiles = generator.sample(num=sample_number)

In [None]:
# Show the first generated SMILES string as a quick sanity check
print(sampled_smiles[0])

COC(=O)C1CCN(c2cc(-c3cccc(Cl)c3)nc3ncnn23)CC1


In [None]:
from rdkit import RDLogger, Chem, DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
#RDLogger.DisableLog('rdApp.*')

In [None]:
valid_mols = []

# Keep only the sampled strings that RDKit can parse into a molecule
for smi in sampled_smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        valid_mols.append(mol)

# Validity: share of requested samples that parsed successfully
validity_percentage = len(valid_mols) / sample_number
print('Validity: ', f'{validity_percentage:.2%}')

# Uniqueness: share of distinct canonical SMILES among the valid molecules.
# Reported as 0 when nothing is valid instead of dividing by zero.
valid_smiles = [Chem.MolToSmiles(mol) for mol in valid_mols]
uniqueness_percentage = (
    len(set(valid_smiles)) / len(valid_smiles) if valid_smiles else 0.0)
print('Uniqueness: ', f'{uniqueness_percentage:.2%}')

# Originality: share of distinct valid SMILES that do not appear verbatim in
# the training file (string comparison against the file's first column)
import pandas as pd
training_data = pd.read_csv('/kaggle/input/setscmpz/dataset_cleansed.smi', header=None)
training_set = set(list(training_data[0]))
original = [smile for smile in valid_smiles if smile not in training_set]
originality_percentage = (
    len(set(original)) / len(set(valid_smiles)) if valid_smiles else 0.0)
print('Originality: ', f'{originality_percentage:.2%}')

Validity:  100.00%
Uniqueness:  100.00%
Originality:  100.00%


In [None]:
# Persist the valid canonical SMILES to 'gen0.smi', one string per line
with open('gen0.smi', 'w') as f:
    # Stream newline-terminated entries straight into the file
    f.writelines("%s\n" % item for item in valid_smiles)