In [None]:
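# Note: This notebook requires the following libraries:
# - mordred
# - rdkit
# - pandas
# - numpy (older than 1.24: the output of the last cell shows Mordred failing with
#   "module 'numpy' has no attribute 'float'", an alias that NumPy 1.24 removed)

# To install dependencies:
# pip install mordred rdkit pandas "numpy<1.24"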

Molecular Weight: 46.069
LogP: -0.0014000000000000123
Number of Hydrogen Bond Donors: 1
Number of Hydrogen Bond Acceptors: 1
Number of Rotatable Bonds: 0
Topological Polar Surface Area: 20.23
Number of Rings: 0


In [None]:
# Cell to test functionality
# Import necessary modules from RDKit
from rdkit import Chem
from rdkit.Chem import Descriptors

def generate_molecular_properties(smiles):
    """
    Compute a fixed set of RDKit descriptors for one molecule.

    Parameters:
    -----------
    smiles : str
        SMILES string describing the molecule

    Returns:
    --------
    dict
        Maps a human-readable property label to the value returned by the
        corresponding ``Descriptors`` function, in the order listed below

    Raises:
    -------
    ValueError
        If ``Chem.MolFromSmiles`` returns None for the given string
    """
    molecule = Chem.MolFromSmiles(smiles)

    # Guard clause: RDKit signals an unparsable SMILES by returning None
    if molecule is None:
        raise ValueError("Invalid SMILES string")

    # (label, descriptor function) pairs; the dict keeps this reporting order
    descriptor_table = (
        ('Molecular Weight', Descriptors.MolWt),
        ('LogP', Descriptors.MolLogP),
        ('Number of Hydrogen Bond Donors', Descriptors.NumHDonors),
        ('Number of Hydrogen Bond Acceptors', Descriptors.NumHAcceptors),
        ('Number of Rotatable Bonds', Descriptors.NumRotatableBonds),
        ('Topological Polar Surface Area', Descriptors.TPSA),
        ('Number of Rings', Descriptors.RingCount),
    )

    return {label: compute(molecule) for label, compute in descriptor_table}

# Demonstration: compute the properties of ethanol and list them one per line
smiles = "CCO"  # Ethanol
properties = generate_molecular_properties(smiles)
for prop in properties:
    value = properties[prop]
    print(f"{prop}: {value}")

In [3]:
# Cell to validate that every column name listed in descriptors.txt is a known Mordred descriptor
import sys
sys.path.append('.')

from mordred import Calculator, descriptors
import pandas as pd
import numpy as np

def main(descriptor_file='descriptors.txt'):
    """
    Report how many column names in a descriptor file are Mordred descriptors.

    Reads the first line of the file as tab-separated column names, compares
    them with the names of the descriptors held by ``Calculator(descriptors)``
    and prints the counts plus up to 20 names that did not match.

    Parameters:
    -----------
    descriptor_file : str, optional
        Path to the file whose first line holds the column names
        (defaults to 'descriptors.txt' in the working directory)
    """
    # Only the header line is needed; the context manager closes the handle
    # instead of leaving it to the garbage collector.
    with open(descriptor_file, 'r') as f:
        wanted_columns = f.readline().strip().split('\t')
    print(f"Total columns in the file: {len(wanted_columns)}")

    # Get all available Mordred descriptor names
    calc = Calculator(descriptors)
    all_descriptor_names = [str(desc) for desc in calc.descriptors]
    print(f"Total Mordred descriptors available: {len(all_descriptor_names)}")

    # A set gives constant-time membership checks for the two passes below
    known_names = set(all_descriptor_names)

    # Find how many of the file columns match Mordred descriptors
    matching_columns = [col for col in wanted_columns if col in known_names]
    print(f"Columns matching Mordred descriptors: {len(matching_columns)}")

    # Print some diagnostic information
    print("\nSample non-matching columns:")
    non_matching = [col for col in wanted_columns if col not in known_names]
    print(non_matching[:20])  # Print first 20 non-matching columns

# Run the column check when this code is executed directly (in a notebook
# kernel __name__ is '__main__'); skip it when the code is imported as a module.
if __name__ == '__main__':
    main()

Total columns in the file: 849
Total Mordred descriptors available: 1826
Columns matching Mordred descriptors: 849

Sample non-matching columns:
[]


In [19]:
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
import numpy as np

def generate_mordred_descriptors(smiles_list, columns=None):
    """
    Generate Mordred descriptors for a list of SMILES strings.

    Parameters:
    -----------
    smiles_list : list
        List of SMILES strings to calculate descriptors for
    columns : list, optional
        Names of the descriptors to compute. Names that do not correspond to
        a descriptor of the calculator are ignored. When omitted or empty,
        every descriptor of the calculator is computed.

    Returns:
    --------
    pandas.DataFrame
        One row per successfully processed SMILES string, with a 'SMILES'
        column followed by the computed descriptors. SMILES strings that
        cannot be parsed, or whose calculation raises, are reported on stdout
        and left out of the result.
    """
    # Create a calculator with all available descriptors
    calc = Calculator(descriptors)

    # If columns are specified, keep only the matching descriptors
    if columns:
        # Set lookup: avoids rescanning the whole column list for every descriptor
        wanted = set(columns)
        filtered_descriptors = [desc for desc in calc.descriptors if str(desc) in wanted]
        calc = Calculator(filtered_descriptors)

    results = []

    for smiles in smiles_list:
        # Convert SMILES to RDKit molecule; None means it could not be parsed
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"Invalid SMILES string: {smiles}")
            continue

        # Best effort per molecule: a failure is reported and the loop goes on
        try:
            desc_values = calc(mol)
            # Convert to dictionary, adding SMILES as first column
            results.append({'SMILES': smiles, **dict(desc_values)})
        except Exception as e:
            print(f"Error calculating descriptors for {smiles}: {e}")

    # Build the frame once from the collected row dictionaries
    return pd.DataFrame(results)

def read_descriptors(file_path):
    """
    Load column names from the header line of a tab-separated file.

    Parameters:
    -----------
    file_path : str
        Path to the file containing column names

    Returns:
    --------
    list
        Column names taken from the first line, after stripping surrounding
        whitespace and splitting on tab characters
    """
    # Only the first line is consumed; the rest of the file is never read
    with open(file_path, 'r') as handle:
        header = handle.readline()

    names = header.strip().split('\t')
    return names

def main():
    """
    Compute the descriptors listed in descriptors.txt for three sample molecules.

    Reads the wanted column names with ``read_descriptors``, asks
    ``generate_mordred_descriptors`` for exactly those descriptors, then
    prints the resulting frame and the number of descriptor columns.
    """
    # Example usage
    smiles_list = [
        'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
        'CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO'  # Salbutamol
    ]

    # Read the wanted column names from descriptors.txt. The local name is
    # deliberately not `descriptors`, which this notebook also uses for the
    # imported mordred module.
    descriptors_file_path = 'descriptors.txt'
    wanted_columns = read_descriptors(descriptors_file_path)

    # Apply the filter by passing the names to the generator, so only the
    # requested descriptors are computed (previously the file was read but
    # the filter was never applied).
    filtered_descriptors_df = generate_mordred_descriptors(smiles_list, wanted_columns)

    # Print the frame and basic info; 'SMILES' is not a descriptor
    print(filtered_descriptors_df)
    descriptor_count = len([col for col in filtered_descriptors_df.columns if col != 'SMILES'])
    print("\nTotal descriptors calculated:", descriptor_count)

#if __name__ == '__main__':
#    main()


In [22]:
# Descriptor names to compute, taken from the header line of descriptors.txt
features_file_path = 'descriptors.txt'
features = read_descriptors(features_file_path)

# Molecules used for the demonstration
smiles_list = [
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
    'CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO'  # Salbutamol
]

# Compute only the requested descriptors; the bare last expression displays the frame
descriptors_df = generate_mordred_descriptors(smiles_list, features)
descriptors_df

Unnamed: 0,SMILES,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,CC(=O)OC1=CC=CC=C1C(=O)O,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,1,0,6,6,21,13,0,...,0.0,9.151333,43.556121,180.042259,8.573441,246,16,60.0,66.0,2.972222
1,CN1C=NC2=C1C(=O)N(C(=O)N2C)C,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,9,10,24,14,0,...,6.842683,9.824498,60.521485,194.080376,8.086682,258,25,76.0,94.0,3.027778
2,CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,6,6,38,17,0,...,0.0,9.519662,49.23903,239.152144,6.293477,560,22,82.0,90.0,3.763889
