# IMPORTS

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors


# LOAD FILE

In [2]:
# Path to the input CSV, given relative to the notebook's working directory
# (two directory levels up).
file_path = '../../real350_aromatic_pyrimidine.csv'

# Load the CSV file into a DataFrame and show it.
# NOTE(review): the rendered output has an 'Unnamed: 0' column next to
# 'SMILES' and 'ID', which suggests the file was saved together with its
# index; reading with index_col=0 may be intended -- confirm.
pyrimidine = pd.read_csv(file_path)
display(pyrimidine)

Unnamed: 0,SMILES,ID
0,CS(=O)(=O)CCSc1ncncc1Br,Z1439298257
1,Brc1cncnc1SC1CCOCC1,Z1517711890
2,Cc1nnc(Sc2ncncc2Br)s1,Z1439143680
3,CCOC(=O)CSc1ncncc1Br,Z1422087508
4,Cn1nnnc1Sc1ncncc1Br,Z1439139352
...,...,...
16803384,O=C(CC1=Cc2ccccc2C1)N[C@H]1CCN(C(=O)c2cncnc2)C1,PV-003411773915
16803385,O=C(CC1=Cc2ccccc2C1)NC1CCN(C(=O)c2cncnc2)C1,PV-003411773916
16803386,N#Cc1cnc(CNC(=O)Nc2cccc3c2CN(C2CC2)C3)nc1N,PV-009425038621
16803387,Cc1ncncc1C(=O)N1CC(NC(=O)c2ccc3c(c2)CCC=C3)C1,PV-000493923085


# HELPER FUNCTIONS

In [6]:
def RDkit_descriptors(df, smiles_column='smiles'):
    """Append every RDKit descriptor in ``Descriptors._descList`` to ``df``.

    Each SMILES string is parsed once, explicit hydrogens are added and the
    descriptors are calculated. Descriptor rows stay aligned with the row the
    SMILES came from: rows whose SMILES cannot be parsed (or is not a string,
    e.g. a missing value) get NaN in every descriptor column instead of
    shifting the remaining rows upwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    smiles_column : str, default 'smiles'
        Name of the column that holds the SMILES strings.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with one extra column per descriptor (same row order and same
        index as ``df``), or ``None`` if no SMILES could be parsed.
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [desc[0] for desc in Descriptors._descList]
    )
    desc_names = list(calc.GetDescriptorNames())

    # Stream over the column so only one molecule is alive at a time, and
    # remember the position of every successfully processed row.
    descriptor_rows = []
    valid_positions = []
    for position, smiles in enumerate(df[smiles_column]):
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            continue
        descriptor_rows.append(calc.CalcDescriptors(Chem.AddHs(mol)))
        valid_positions.append(position)

    if not descriptor_rows:
        print("No valid molecules found.")
        return None

    # Index by position first, then expand to the full length so that rows
    # with invalid SMILES become NaN rows at the right place.
    descriptors_df = pd.DataFrame(
        descriptor_rows, columns=desc_names, index=valid_positions
    ).reindex(range(len(df)))
    # Adopt the caller's index so the column-wise concat is a pure
    # side-by-side join, even for non-default or duplicated labels.
    descriptors_df.index = df.index

    # Concatenate the original DataFrame with the descriptors DataFrame
    return pd.concat([df, descriptors_df], axis=1)

# # Example usage:
# # Assuming you have a DataFrame named 'your_dataframe' with a column named 'smiles'
# your_dataframe = pd.DataFrame({'smiles': ['CCO', 'CCN', 'CCOCC', 'C1=CC=CC=C1']})
# your_dataframe_with_descriptors = RDkit_descriptors(your_dataframe)

# # Print the DataFrame with descriptors
# display(your_dataframe_with_descriptors)

In [None]:
# Compute descriptors for the loaded frame; its SMILES strings are in the
# upper-case "SMILES" column, so the default column name is overridden.
# NOTE(review): the displayed frame has row labels up to 16803387, so this
# sequential call is presumably very slow on the full data -- confirm.
df_descriptors = RDkit_descriptors(pyrimidine, smiles_column="SMILES")

In [None]:
# Show the result of the descriptor calculation (RDkit_descriptors returns
# None when no SMILES could be parsed).
display(df_descriptors)

In [None]:
###

In [4]:
from concurrent.futures import ThreadPoolExecutor

def compute_descriptors(smiles):
    """Calculate all RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. a missing
        value) are treated as invalid.

    Returns
    -------
    tuple or None
        Descriptor values in the order of ``Descriptors._descList``, or
        ``None`` if the input is not a string or cannot be parsed.
    """
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Build the calculator once and reuse it: constructing it for every
    # molecule is loop-invariant work. It is cached on the function object;
    # if two threads race on the first call each builds its own instance
    # and the last assignment wins, which is harmless.
    calc = getattr(compute_descriptors, "_calculator", None)
    if calc is None:
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        )
        compute_descriptors._calculator = calc

    # Explicit hydrogens are added before the descriptors are calculated.
    return calc.CalcDescriptors(Chem.AddHs(mol))

def RDkit_descriptors_parallel(df, smiles_column='smiles', num_workers=4):
    """Append RDKit descriptors to ``df``, computing them in a thread pool.

    ``compute_descriptors`` is mapped over the SMILES column. Descriptor rows
    stay aligned with the row they were computed from: rows for which
    ``compute_descriptors`` returned ``None`` get NaN in every descriptor
    column instead of shifting the remaining rows upwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    smiles_column : str, default 'smiles'
        Name of the column that holds the SMILES strings.
    num_workers : int, default 4
        Number of worker threads.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with one extra column per descriptor (same row order and same
        index as ``df``), or ``None`` if no molecule was valid.
    """
    smiles_list = df[smiles_column].tolist()

    # executor.map preserves input order, so results[i] belongs to row i.
    # NOTE(review): descriptor calculation is presumably CPU-bound, so
    # threads may give little speed-up -- confirm by timing.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(executor.map(compute_descriptors, smiles_list))

    # Positions of the rows that produced descriptors (invalid ones are None).
    valid_positions = [pos for pos, res in enumerate(results) if res is not None]

    if not valid_positions:
        print("No valid molecules found.")
        return None

    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )

    # Index by position, then expand to full length so invalid rows become
    # NaN rows at the right place.
    descriptors_df = pd.DataFrame(
        [results[pos] for pos in valid_positions],
        columns=desc_names,
        index=valid_positions,
    ).reindex(range(len(df)))
    # Adopt the caller's index so the column-wise concat is a pure
    # side-by-side join, even for non-default or duplicated labels.
    descriptors_df.index = df.index

    # Concatenate the original DataFrame with the descriptors DataFrame
    return pd.concat([df, descriptors_df], axis=1)

# # Example usage
# df = pd.DataFrame({'smiles': ['CCO', 'CCCC', 'CCN', 'O=C=O', 'C#N']})
# result = RDkit_descriptors_parallel(df)
# print(result)


  smiles  MaxAbsEStateIndex  MaxEStateIndex  MinAbsEStateIndex  \
0    CCO           6.520833        6.520833           2.854167   
1   CCCC           7.005208        7.005208           3.314583   
2    CCN           6.687500        6.687500           0.458333   
3  O=C=O           8.125000        8.125000           0.250000   
4    C#N           6.986111        6.986111           1.000000   

   MinEStateIndex       qed   MolWt  HeavyAtomMolWt  ExactMolWt  \
0       -2.880208  0.406808  46.069          40.021   46.041865   
1       -3.411458  0.431024  58.124          48.044   58.078250   
2       -2.947917  0.406237  45.085          38.029   45.057849   
3        0.250000  0.364167  44.009          44.009   43.989829   
4        1.000000  0.369797  27.026          26.018   27.010899   

   NumValenceElectrons  ...  fr_sulfide  fr_sulfonamd  fr_sulfone  \
0                   20  ...           0             0           0   
1                   26  ...           0             0         

In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def compute_descriptors(smiles):
    """Calculate all RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. a missing
        value) are treated as invalid.

    Returns
    -------
    tuple or None
        Descriptor values in the order of ``Descriptors._descList``, or
        ``None`` if the input is not a string or cannot be parsed.
    """
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Build the calculator once and reuse it: constructing it for every
    # molecule is loop-invariant work. It is cached on the function object;
    # if two threads race on the first call each builds its own instance
    # and the last assignment wins, which is harmless.
    calc = getattr(compute_descriptors, "_calculator", None)
    if calc is None:
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        )
        compute_descriptors._calculator = calc

    # Explicit hydrogens are added before the descriptors are calculated.
    return calc.CalcDescriptors(Chem.AddHs(mol))

def RDkit_descriptors_parallel(df, smiles_column='smiles', num_workers=4, batch_size=100000):
    """Append RDKit descriptors to ``df``, computed batch by batch in threads.

    The SMILES column is processed in slices of ``batch_size`` rows;
    ``compute_descriptors`` is mapped over each slice with a thread pool and
    progress is printed after every batch. Descriptor rows stay aligned with
    the row they were computed from: rows for which ``compute_descriptors``
    returned ``None`` get NaN in every descriptor column instead of shifting
    the remaining rows upwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    smiles_column : str, default 'smiles'
        Name of the column that holds the SMILES strings.
    num_workers : int, default 4
        Number of worker threads.
    batch_size : int, default 100000
        Number of rows handled per batch; must be at least 1.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with one extra column per descriptor (same row order and same
        index as ``df``), or ``None`` if no molecule was valid (including an
        empty ``df``).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_smiles = len(df)
    print(f"Total SMILES in DataFrame: {total_smiles}")

    # The descriptor names do not depend on the batch, so look them up once.
    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )

    all_descriptors = []
    # One pool for the whole run instead of a new pool per batch.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for start in range(0, total_smiles, batch_size):
            batch_number = start // batch_size + 1
            smiles_list = df[smiles_column].iloc[start:start + batch_size].tolist()

            # executor.map preserves input order, so results[k] belongs to
            # row ``start + k`` of the frame.
            results = list(executor.map(compute_descriptors, smiles_list))

            # Keep only valid results, remembering their global row position.
            valid_positions = [
                start + offset
                for offset, res in enumerate(results)
                if res is not None
            ]
            if not valid_positions:
                print(f"No valid molecules found in batch {batch_number}.")
                continue

            valid_results = [res for res in results if res is not None]
            all_descriptors.append(
                pd.DataFrame(valid_results, columns=desc_names, index=valid_positions)
            )

            print(f"Processed batch {batch_number} ({min(start + batch_size, total_smiles)} out of {total_smiles} SMILES).")

    # pd.concat raises on an empty list, so handle "nothing valid" explicitly.
    if not all_descriptors:
        print("No valid molecules found.")
        return None

    # Concatenate all descriptor DataFrames and expand to full length so
    # invalid rows become NaN rows at the right place.
    final_descriptors_df = pd.concat(all_descriptors).reindex(range(total_smiles))
    # Adopt the caller's index so the column-wise concat is a pure
    # side-by-side join, even for non-default or duplicated labels.
    final_descriptors_df.index = df.index

    # Concatenate the original DataFrame with the descriptors DataFrame
    return pd.concat([df, final_descriptors_df], axis=1)

# Example usage: run the batched function on synthetic data.
# Five SMILES repeated 20,000 times give 100,000 rows, i.e. exactly one batch
# at the default batch_size of 100000.
df = pd.DataFrame({'smiles': ['CCO', 'CCCC', 'CCN', 'O=C=O', 'C#N'] * 20000})  # sample DataFrame with 100,000 SMILES
result = RDkit_descriptors_parallel(df)
print(result)


Total SMILES in DataFrame: 100000
