In [3]:
!pip install rdkit 

Collecting rdkit
  Downloading rdkit-2023.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
[K     |████████████████████████████████| 29.5 MB 11.3 MB/s eta 0:00:01
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [14]:
import pandas as pd  # Importing pandas library for data manipulation
from rdkit import Chem, DataStructs  # Importing RDKit for chemical informatics
import random  # Importing random module for generating random numbers
import numpy as np  # Importing numpy library for numerical operations
import rdkit.Chem.PropertyMol  # Importing RDKit module for molecular properties

# Reading a file containing SMILES representations of molecules
gen0_table = pd.read_csv('../input/setscmpz/gen0.smi', sep=',', header=None)

# Extracting the SMILES strings from the dataframe and converting them into a list
gen0 = list(gen0_table[0])[0:10000]

# Printing the number of molecules loaded
print("Number of molecules in gen0:", len(gen0))

Number of molecules in gen0: 9707


In [15]:
from rdkit import Chem  # Importing RDKit for chemical informatics

def validate_mols(list_of_smiles):
    """
    Validates a list of SMILES strings and returns a list of valid RDKit molecule objects.

    Parameters:
    list_of_smiles (list): List of SMILES strings representing molecules.

    Returns:
    list: List of valid RDKit molecule objects.
    """
    valid_mols = []  # List to store valid RDKit molecule objects
    for smi in list_of_smiles:
        mol = Chem.MolFromSmiles(smi)  # Attempt to create a molecule object from SMILES
        if mol is not None:
            valid_mols.append(mol)  # If molecule object is valid, append to list
    return valid_mols

def convert_mols_to_smiles(list_of_mols):
    """
    Converts a list of RDKit molecule objects to a list of corresponding SMILES strings.

    Parameters:
    list_of_mols (list): List of RDKit molecule objects.

    Returns:
    list: List of SMILES strings.
    """
    valid_smiles = [Chem.MolToSmiles(mol) for mol in list_of_mols]  # Convert each molecule to SMILES
    return valid_smiles

# These functions provide utilities to validate SMILES representations and convert between
# RDKit molecule objects and SMILES strings, which are common tasks in drug discovery and
# molecular analysis workflows.

In [16]:
# Validate SMILES representations of molecules in gen0 list
gen0_mols = validate_mols(gen0)

# Print the number of valid molecules obtained after validation
print("Number of valid molecules in gen0:", len(gen0_mols))


Number of valid molecules in gen0: 9707


In [17]:
from rdkit.Chem import RDKFingerprint
from rdkit import DataStructs
import numpy as np

def initialize_generation_from_mols(list_of_mols, desired_length):
    """
    Initialize a generation of molecules from a list of RDKit molecule objects.

    Parameters:
    list_of_mols (list): List of RDKit molecule objects.
    desired_length (int): Desired length of the generation.

    Returns:
    list: List of selected RDKit molecule objects.
    """
    assert desired_length > 30  # Ensure the desired length is greater than 30
    
    # Shuffle the list of molecules twice to introduce randomness
    random.shuffle(list_of_mols)
    random.shuffle(list_of_mols)
    
    # Prepare fingerprints for similarity calculations
    mol_fingerprints = [RDKFingerprint(mol) for mol in list_of_mols]
    
    # Initialize lists for selected and remaining molecules and fingerprints
    selected_mols = list_of_mols[:30]  # Select the first 30 molecules
    selected_fingerprints = mol_fingerprints[:30]
    remaining_mols = list_of_mols[30:]
    remaining_fingerprints = mol_fingerprints[30:]
    
    similarity_threshold = 0.05  # Initial similarity threshold
    
    # Loop until desired_length is reached
    while len(selected_mols) < desired_length:
        # Iterate over remaining molecules and fingerprints
        for fingerprint, mol in zip(remaining_fingerprints, remaining_mols):
            # Calculate maximum similarity with selected molecules
            max_similarity = np.max(DataStructs.BulkTanimotoSimilarity(fingerprint, selected_fingerprints))
            # Check if the molecule passes similarity threshold and not identical
            if max_similarity <= similarity_threshold and max_similarity < 1:
                selected_fingerprints.append(fingerprint)
                selected_mols.append(mol)
        # Print status update
        print("Completed loop with threshold at:", similarity_threshold, ". Length is currently:", len(selected_mols))
        # Increase similarity threshold for the next iteration
        similarity_threshold += 0.05
    
    return selected_mols


In [18]:
# Initialize a new generation of molecules from the gen0_mols list with a desired length of 1000
gen0_mols = initialize_generation_from_mols(gen0_mols, 1000)

# Print the length of the generated molecule generation
print("Length of the generated molecule generation:", len(gen0_mols))

Completed loop with threshold at: 0.05 . Length is currently: 31
Completed loop with threshold at: 0.1 . Length is currently: 32
Completed loop with threshold at: 0.15000000000000002 . Length is currently: 33
Completed loop with threshold at: 0.2 . Length is currently: 40
Completed loop with threshold at: 0.25 . Length is currently: 82
Completed loop with threshold at: 0.3 . Length is currently: 215
Completed loop with threshold at: 0.35 . Length is currently: 542
Completed loop with threshold at: 0.39999999999999997 . Length is currently: 1133
Length of the generated molecule generation: 1133


In [19]:
# Read the CSV file 'master_results_table.csv' into a DataFrame
master_table = pd.read_csv('../input/setscmpz/master_results_table.csv', sep=',')

# Get the number of rows in the DataFrame
num_rows = master_table.shape[0]

# Print the number of rows
print(num_rows)

1


In [20]:
def iterate_alpha(alpha_code):
    """
    Generate the next alpha code in lexicographic order.

    Parameters:
    alpha_code (str): The alpha code to iterate.

    Returns:
    str: The next alpha code.
    
    Raises:
    ValueError: If the length of the alpha code exceeds four characters.
    """
    numbers = []
    for letter in alpha_code:
        number = ord(letter)  # Convert letter to ASCII code
        numbers.append(number)
    
    # Check if the next letter exceeds 'Z' (90 in ASCII)
    if numbers[3] + 1 > 90:
        if numbers[2] + 1 > 90:
            if numbers[1] + 1 > 90:
                if numbers[0] + 1 > 90:
                    raise ValueError('Too long for alpha code')  # Raise an error if alpha code length exceeds four characters
                else:
                    numbers[3] = 65
                    numbers[2] = 65
                    numbers[1] = 65
                    numbers[0] = numbers[0] + 1
            else:
                numbers[3] = 65
                numbers[2] = 65
                numbers[1] = numbers[1] + 1
        else:
            numbers[3] = 65
            numbers[2] = numbers[2] + 1
    else:
        numbers[3] = numbers[3] + 1  # Increment the last letter by one if it doesn't exceed 'Z'

    # Convert ASCII codes back to letters
    new_code = ""
    for number in numbers:
        new_code += chr(number)
    
    return new_code

# Example usage:
print(iterate_alpha('AAAA'))  # Output: 'AAAB'

AAAB


In [21]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PropertyMol

def append_to_tracking_table(master_table, mols_to_append, source, generation):
    """
    Append new molecules to a tracking table and assign unique IDs to each.

    Parameters:
    master_table (DataFrame): The existing tracking table.
    mols_to_append (list): List of RDKit molecule objects to append.
    source (str): Source of the molecules (e.g., 'hiv', 'manual', 'baseline', etc.).
    generation (int): Generation number of the molecules.

    Returns:
    DataFrame: Updated tracking table with the appended molecules.
    list: List of PropertyMol objects for molecules to export.

    """
    mols_to_export = []  # List to store PropertyMol objects for molecules to export
    rows_list = []  # List to store dictionaries representing each molecule for DataFrame creation
    
    # Extract the rows corresponding to the given generation from the master table
    master_table_gen = master_table[master_table['gen'] == generation]
    
    # Determine the starting ID code for the new molecules based on the existing entries
    if master_table_gen.shape[0] == 0:
        id_code = 'AAAA'
    else:
        master_table_gen_ids = master_table_gen.sort_values('id', ascending=True)
        master_table_gen_max_id = master_table_gen_ids.tail(1)
        key = master_table_gen_max_id['id'].keys()[0]
        id_code = iterate_alpha(str(master_table_gen_max_id['id'][key]))
    
    # Load the training dataset for reference
    training_data = pd.read_csv('../input/setscmpz/dataset_cleansed.smi', header=None)
    training_set = set(list(training_data[0]))
    
    # Iterate over the molecules to append
    for mol in mols_to_append:
        pm = PropertyMol.PropertyMol(mol)  # Create PropertyMol object for the molecule
        title = 'id' + str(id_code) + 'gen' + str(generation)  # Generate a unique title for tracking
        pm.SetProp('Title', title)  # Set the 'Title' property for the molecule
        mols_to_export.append(pm)  # Append the PropertyMol object to the export list
        
        # Create a dictionary representing the molecule for DataFrame entry
        mol_dict = {}
        mol_dict['id'] = id_code
        mol_dict['gen'] = generation
        smile = Chem.MolToSmiles(mol)
        assert type(smile) == type('string')
        mol_dict['smile'] = smile
        
        # Determine the source of the molecule and assign accordingly
        if (source != 'hiv' and source != 'manual' and source != 'baseline') and (smile in training_set):
            mol_dict['source'] = 'training'
        else:
            mol_dict['source'] = source
        mol_dict['score'] = 99.9  # Placeholder score value
        
        rows_list.append(mol_dict)  # Append the dictionary to the list of rows
        id_code = iterate_alpha(id_code)  # Generate the next ID code
        
    df = pd.DataFrame(rows_list)  # Create a DataFrame from the list of rows
    return df, mols_to_export  # Return the updated DataFrame and the list of PropertyMol objects

In [22]:
# Append new molecules from gen0_mols to the master_table and prepare molecules for export
new_mols_to_test = append_to_tracking_table(master_table, gen0_mols, 'generated', 0)

# Extract DataFrame representing new molecules for further processing
mols_for_pd = new_mols_to_test[0]

# Extract list of PropertyMol objects representing molecules for export
mols_for_export = new_mols_to_test[1]

# Append the DataFrame of new molecules to the master_table
master_table = master_table.append(mols_for_pd)

# Print the number of molecules for export
print("Number of molecules for export:", len(mols_for_export))

Number of molecules for export: 1133


In [23]:
# Reset the index of the master_table DataFrame and drop the previous index column
master_table = master_table.reset_index(drop=True)

# Save the master_table DataFrame to a CSV file named 'master_results_table.csv' without including the index column
master_table.to_csv(r'master_results_table.csv', index=False)

In [24]:
# Read the HIV inhibitors dataset and extract SMILES strings
hiv_smiles = pd.read_csv('../input/setscmpz/hiv_inhibitors_cleaned.smi', sep=',', header=None)
hiv_smiles = list(hiv_smiles[0])

# Validate the SMILES representations to get RDKit molecule objects
hiv_mols = validate_mols(hiv_smiles)

# Read the master table from the CSV file
master_table = pd.read_csv('./master_results_table.csv', sep=',')

# Append the HIV inhibitors to the master table and prepare molecules for export
new_mols_to_test = append_to_tracking_table(master_table, hiv_mols, 'hiv', 0)
mols_for_pd = new_mols_to_test[0]  # DataFrame representing new molecules
mols_for_export += new_mols_to_test[1]  # List of PropertyMol objects for export

# Append the DataFrame of new molecules to the master table
master_table = master_table.append(mols_for_pd)

# Reset the index of the master table
master_table = master_table.reset_index(drop=True)

# Save the updated master table to a CSV file
master_table.to_csv(r'./master_results_table.csv', index=False)

In [25]:
# Read the manual testing dataset and extract SMILES strings
manual_smiles = pd.read_csv('../input/setscmpz/manual_testing_cleaned.smi', sep=',', header=None)
manual_smiles = list(manual_smiles[0])

# Validate the SMILES representations to get RDKit molecule objects
manual_mols = validate_mols(manual_smiles)

# Read the master table from the CSV file
master_table = pd.read_csv('./master_results_table.csv', sep=',')

# Append the manual testing molecules to the master table and prepare molecules for export
new_mols_to_test = append_to_tracking_table(master_table, manual_mols, 'manual', 0)
mols_for_pd = new_mols_to_test[0]  # DataFrame representing new molecules
mols_for_export += new_mols_to_test[1]  # List of PropertyMol objects for export

# Append the DataFrame of new molecules to the master table
master_table = master_table.append(mols_for_pd)

# Reset the index of the master table
master_table = master_table.reset_index(drop=True)

# Save the updated master table to a CSV file
master_table.to_csv(r'./master_results_table.csv', index=False)

In [27]:
def write_gen_to_sdf(mols_for_export, generation, batch_size):
    """
    Write molecules to SDF files in batches.

    Parameters:
    mols_for_export (list): List of PropertyMol objects to export.
    generation (int): Generation number of the molecules.
    batch_size (int): Size of each batch.

    Returns:
    list: List of PropertyMol objects.
    """
    if len(mols_for_export) > batch_size:
        # Calculate the number of batches needed
        batches = (len(mols_for_export) // batch_size) + 1
        for i in range(batches):
            # Extract a batch of molecules
            batch_to_export = mols_for_export[i * batch_size:(i + 1) * batch_size]
            # Write the batch to an SDF file
            w = Chem.SDWriter('gen' + str(generation) + '_batch_' + str(i + 1) + '.sdf')
            for m in batch_to_export:
                w.write(m)
    else:
        # Write all molecules to a single SDF file if the number is smaller than batch_size
        w = Chem.SDWriter('gen' + str(generation) + '.sdf')
        for m in mols_for_export:
            w.write(m)
    
    # Addressing the issue where the last line item of an SDF write may not be written correctly
    w = Chem.SDWriter('test.sdf')
    w.write(m)  # Arbitrarily write a molecule to ensure the last line item is written correctly
    
    return mols_for_export

# Call the function to write molecules to SDF files
write_gen_to_sdf(mols_for_export, generation=0, batch_size=1000)

[<rdkit.Chem.PropertyMol.PropertyMol at 0x7d1e02043670>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb439b70>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb4398f0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1e02031070>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb582630>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb40f7b0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb40fb30>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb3197f0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319870>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb40fcb0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319970>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb3198f0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb3199f0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319a70>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319af0>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319b70>,
 <rdkit.Chem.PropertyMol.PropertyMol at 0x7d1dfb319bf0>,
 <rdkit.Chem.PropertyMol.Proper

In [26]:
def write_gen_to_sdf(mols_for_export, generation_number, max_molecules):
    """
    Write molecules to an SDF file.

    Parameters:
    mols_for_export (list): List of PropertyMol objects to export.
    generation_number (int): Generation number of the molecules.
    max_molecules (int): Maximum number of molecules to export.

    Returns:
    None
    """
    # Ensure not exporting more molecules than the maximum allowed
    num_molecules = min(len(mols_for_export), max_molecules)
    # Open the SDF file for writing
    with Chem.SDWriter(f"gen{generation_number}_exported.sdf") as writer:
        # Write each molecule to the SDF file
        for i in range(num_molecules):
            writer.write(mols_for_export[i])

# Call the function to write molecules to an SDF file
write_gen_to_sdf(mols_for_export, 0, 2000)

# Print 'ok' to indicate successful execution
print('ok')

ok
