# IMPORTS

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors


# LOAD FILE

In [11]:
# Input CSV path; a relative path is resolved against the notebook's working directory.
# Alternative input kept for reference:
# file_path = '../../real350_aromatic_pyrimidine.csv'
file_path = 'test_descriptors.csv'

# Load the CSV file into a DataFrame and render it as the cell output.
# NOTE(review): the saved preview shows an 'Unnamed: 0' column, which usually means the file was
# written together with its index -- confirm whether index_col=0 is wanted here.
pyrimidine = pd.read_csv(file_path)
display(pyrimidine)

Unnamed: 0,SMILES,ID,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,Z1346689858,C=C(Br)Cn1cnccc1=O,11.673693,11.673693,0.208866,-2.690972,0.741147,215.050,207.994,213.974175,...,0,0,0,0,0,0,0,0,0,0
1,Z1672297658,CNC(=O)NCc1cncs1,11.580737,11.580737,0.363727,-3.148273,0.683239,171.225,162.153,171.046633,...,0,0,0,0,0,1,0,0,0,1
2,Z371870842,CNC(=O)Nc1ncc(C)s1,11.542029,11.542029,0.030440,-3.065184,0.666767,171.225,162.153,171.046633,...,0,0,0,0,0,1,0,0,0,1
3,Z1430356395,CCNC(=O)Nc1ccsc1,11.814080,11.814080,0.129190,-3.353591,0.698809,170.237,160.157,170.051384,...,0,0,0,0,0,0,0,1,0,1
4,Z432794230,CNC(=O)NCc1ccsc1,11.649992,11.649992,0.418264,-3.185236,0.687130,170.237,160.157,170.051384,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21003,Z2771727414,Cc1cc(N2CCC([C@H](C)O)CC2)ccn1,8.508176,8.508176,0.489782,-4.377418,0.827705,220.316,200.156,220.157563,...,0,0,0,0,0,0,0,0,0,0
21004,Z2830079023,C[C@H](O)C1CCN(Cc2ccncc2)CC1,8.483040,8.483040,0.742097,-4.424028,0.841568,220.316,200.156,220.157563,...,0,0,0,0,0,0,0,0,0,0
21005,Z2830079015,C[C@H](O)C1CCN(Cc2ccco2)CC1,8.434394,8.434394,0.708903,-4.381245,0.825332,209.289,190.137,209.141579,...,0,0,0,0,0,0,0,0,0,0
21006,Z2739442718,N#Cc1cc(Cl)ccc1N1CC(CN)C1,9.292218,9.292218,0.112724,-3.438976,0.825689,221.691,209.595,221.071975,...,0,0,0,0,0,0,0,0,0,0


# HELPER FUNCTIONS

In [6]:
def RDkit_descriptors(df, smiles_column='smiles'):
    """Append one column per RDKit descriptor to ``df``.

    Every row of ``df`` keeps its own descriptors: the descriptor table is built with
    ``df``'s index and has one entry per input row, so a SMILES that cannot be parsed
    yields NaN in the descriptor columns of that row instead of shifting the
    descriptors of all later molecules up by one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the SMILES strings.
    smiles_column : str
        Name of the column in ``df`` that contains the SMILES strings.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with the descriptor columns concatenated on the right, or ``None``
        (after printing a message) when no SMILES could be parsed.
    """
    # Parse each SMILES exactly once. Non-string cells (e.g. NaN from a blank CSV field)
    # are treated as unparseable rather than being handed to RDKit.
    mols = [
        Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        for smiles in df[smiles_column]
    ]

    if all(mol is None for mol in mols):
        print("No valid molecules found.")
        return None

    calc = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors._descList])
    desc_names = list(calc.GetDescriptorNames())

    # Placeholder row for molecules that failed to parse.
    missing_row = (float('nan'),) * len(desc_names)
    Mol_descriptors = [
        missing_row if mol is None else calc.CalcDescriptors(Chem.AddHs(mol))
        for mol in mols
    ]

    # Reusing df.index makes the column-wise concat a pure side-by-side join,
    # whatever index df carries.
    descriptors_df = pd.DataFrame(Mol_descriptors, columns=desc_names, index=df.index)

    # Concatenate the original DataFrame with the descriptors DataFrame
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

    return df_with_descriptors

# # Example usage:
# # Assuming you have a DataFrame named 'your_dataframe' with a column named 'smiles'
# your_dataframe = pd.DataFrame({'smiles': ['CCO', 'CCN', 'CCOCC', 'C1=CC=CC=C1']})
# your_dataframe_with_descriptors = RDkit_descriptors(your_dataframe)

# # Print the DataFrame with descriptors
# display(your_dataframe_with_descriptors)

In [None]:
# Compute descriptors for the loaded table, reading the SMILES strings from its "SMILES" column.
# NOTE(review): in the saved table preview the values under "SMILES" look like catalogue IDs and
# the values under "ID" look like SMILES -- confirm the column labels before trusting the result.
df_descriptors = RDkit_descriptors(pyrimidine, smiles_column="SMILES")

In [None]:
# Render whatever RDkit_descriptors returned as this cell's output.
display(df_descriptors)

In [None]:
###

In [4]:
from concurrent.futures import ThreadPoolExecutor

def compute_descriptors(smiles):
    """Return the RDKit descriptor values for a single SMILES string.

    The molecule is built with ``Chem.MolFromSmiles``, explicit hydrogens are added,
    and every descriptor listed in ``Descriptors._descList`` is evaluated.
    Returns ``None`` when ``Chem.MolFromSmiles`` returns ``None``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        hydrogenated = Chem.AddHs(parsed)
        names = [entry[0] for entry in Descriptors._descList]
        calculator = MoleculeDescriptors.MolecularDescriptorCalculator(names)
        return calculator.CalcDescriptors(hydrogenated)
    return None

def RDkit_descriptors_parallel(df, smiles_column='smiles', num_workers=4):
    """Append RDKit descriptor columns to ``df``, computing them in a thread pool.

    Each SMILES is passed to ``compute_descriptors``. Results are kept in input order
    with one entry per row, so a SMILES that cannot be parsed yields NaN in the
    descriptor columns of that row instead of shifting later rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the SMILES strings.
    smiles_column : str
        Name of the column in ``df`` that contains the SMILES strings.
    num_workers : int
        ``max_workers`` for the ``ThreadPoolExecutor``.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with the descriptor columns concatenated on the right, or ``None``
        (after printing a message) when no SMILES produced descriptors.
    """
    smiles_list = df[smiles_column].tolist()

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Executor.map yields results in input order, so results[k] belongs to row k of df.
        results = list(executor.map(compute_descriptors, smiles_list))

    if all(res is None for res in results):
        print("No valid molecules found.")
        return None

    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )

    # Keep a NaN placeholder for invalid molecules so row positions are preserved.
    missing_row = (float('nan'),) * len(desc_names)
    rows = [missing_row if res is None else res for res in results]

    # Reusing df.index makes the column-wise concat a pure side-by-side join.
    descriptors_df = pd.DataFrame(rows, columns=desc_names, index=df.index)

    # Concatenate the original DataFrame with the descriptors DataFrame
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

    return df_with_descriptors

# # Example usage
# df = pd.DataFrame({'smiles': ['CCO', 'CCCC', 'CCN', 'O=C=O', 'C#N']})
# result = RDkit_descriptors_parallel(df)
# print(result)


  smiles  MaxAbsEStateIndex  MaxEStateIndex  MinAbsEStateIndex  \
0    CCO           6.520833        6.520833           2.854167   
1   CCCC           7.005208        7.005208           3.314583   
2    CCN           6.687500        6.687500           0.458333   
3  O=C=O           8.125000        8.125000           0.250000   
4    C#N           6.986111        6.986111           1.000000   

   MinEStateIndex       qed   MolWt  HeavyAtomMolWt  ExactMolWt  \
0       -2.880208  0.406808  46.069          40.021   46.041865   
1       -3.411458  0.431024  58.124          48.044   58.078250   
2       -2.947917  0.406237  45.085          38.029   45.057849   
3        0.250000  0.364167  44.009          44.009   43.989829   
4        1.000000  0.369797  27.026          26.018   27.010899   

   NumValenceElectrons  ...  fr_sulfide  fr_sulfonamd  fr_sulfone  \
0                   20  ...           0             0           0   
1                   26  ...           0             0         

In [6]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def compute_descriptors(smiles):
    """Evaluate every descriptor in ``Descriptors._descList`` for one SMILES string.

    Gives back ``None`` if ``Chem.MolFromSmiles`` returns ``None``; otherwise the
    descriptor values computed on the molecule after explicit hydrogens are added.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        molecule_with_h = Chem.AddHs(molecule)
        descriptor_names = [item[0] for item in Descriptors._descList]
        return MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names).CalcDescriptors(molecule_with_h)
    return None

def RDkit_descriptors_parallel(df, smiles_column='smiles', num_workers=4, batch_size=100000):
    """Append RDKit descriptor columns to ``df``, working through it in batches.

    Each batch of SMILES is mapped over ``compute_descriptors`` in a thread pool and
    progress is printed per batch. One descriptor row is kept per input row (NaN for
    SMILES that could not be parsed), so descriptors always stay next to the molecule
    they were computed from.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the SMILES strings.
    smiles_column : str
        Name of the column in ``df`` that contains the SMILES strings.
    num_workers : int
        ``max_workers`` for the ``ThreadPoolExecutor``.
    batch_size : int
        Number of rows processed between progress messages.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with the descriptor columns concatenated on the right, or ``None``
        (after printing a message) when no batch contained a parseable SMILES.
    """
    total_smiles = len(df)
    print(f"Total SMILES in DataFrame: {total_smiles}")

    # The descriptor names do not depend on the batch, so resolve them once.
    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )
    missing_row = (float('nan'),) * len(desc_names)

    all_rows = []
    found_valid = False
    # One pool serves every batch instead of being rebuilt on each iteration.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for i in range(0, total_smiles, batch_size):
            smiles_list = df[smiles_column].iloc[i:i + batch_size].tolist()

            # Executor.map yields results in input order.
            results = list(executor.map(compute_descriptors, smiles_list))

            # Keep a placeholder for invalid molecules so row positions are preserved.
            all_rows.extend(missing_row if res is None else res for res in results)

            if all(res is None for res in results):
                print(f"No valid molecules found in batch {i // batch_size + 1}.")
                continue

            found_valid = True
            print(f"Processed batch {i // batch_size + 1} ({min(i + batch_size, total_smiles)} out of {total_smiles} SMILES).")

    if not found_valid:
        # Previously this fell through to pd.concat([]) and raised ValueError.
        print("No valid molecules found.")
        return None

    # Reusing df.index makes the column-wise concat a pure side-by-side join.
    final_descriptors_df = pd.DataFrame(all_rows, columns=desc_names, index=df.index)

    # Concatenate the original DataFrame with the descriptors DataFrame
    df_with_descriptors = pd.concat([df, final_descriptors_df], axis=1)

    return df_with_descriptors

# Example usage
# Builds a 100,000-row sample frame by repeating five SMILES 20,000 times and runs the batched
# thread-pool version on it.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. the run was stopped
# before it finished; a much smaller sample is enough for a quick check.
df = pd.DataFrame({'smiles': ['CCO', 'CCCC', 'CCN', 'O=C=O', 'C#N'] * 20000})  # Sample DataFrame with 100,000 SMILES
result = RDkit_descriptors_parallel(df)
print(result)


Total SMILES in DataFrame: 100000


KeyboardInterrupt: 

In [8]:
# Sanity check: number of rows in the sample frame `df`.
print(len(df))

100000


In [9]:
import time
import multiprocessing

def compute_descriptors(row):
    """Compute RDKit descriptors for one record.

    ``row`` is read through the keys ``"SMILES"`` and ``"ID"``. When
    ``Chem.MolFromSmiles`` returns ``None`` the function returns ``None``; otherwise it
    returns ``(mol_id, smiles, descriptors)``, with the descriptors evaluated on the
    molecule after explicit hydrogens are added.
    """
    smiles, mol_id = row["SMILES"], row["ID"]

    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        hydrogenated = Chem.AddHs(parsed)
        names = [entry[0] for entry in Descriptors._descList]
        values = MoleculeDescriptors.MolecularDescriptorCalculator(names).CalcDescriptors(hydrogenated)
        return mol_id, smiles, values
    return None

def process_batch(batch):
    """Run ``compute_descriptors`` on every row of ``batch`` in a pool of worker processes.

    Parameters
    ----------
    batch : pandas.DataFrame
        Slice of the input table; its rows are passed to ``compute_descriptors``, which
        reads the keys ``"SMILES"`` and ``"ID"``.

    Returns
    -------
    list
        One result per row of ``batch``, in row order (``Pool.map`` preserves order).

    Notes
    -----
    Worker processes must be able to import ``compute_descriptors``. The saved output
    of this cell shows spawn-started workers failing with "Can't get attribute
    'compute_descriptors' on <module '__main__'>" because the function lives in the
    notebook; defining it in an importable ``.py`` module is the usual remedy.
    """
    # iterrows() yields (index, row) pairs, which made row["SMILES"] fail inside the worker.
    # Plain per-row dicts give compute_descriptors the mapping it expects and are cheap to pickle.
    rows = batch.to_dict("records")
    with multiprocessing.Pool() as pool:
        return pool.map(compute_descriptors, rows)

def main(df, batch_size=10000):
    """Compute RDKit descriptors for ``df`` batch by batch and append them as columns.

    Each batch is handed to ``process_batch``; timing and progress are printed per
    batch. The function relies on ``process_batch`` returning one entry per row, in
    row order, where an entry is either ``None`` or a tuple whose third item holds the
    descriptor values. ``None`` entries become NaN descriptor rows, so descriptors stay
    aligned with the molecule they belong to.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must provide whatever columns ``process_batch`` needs.
    batch_size : int
        Number of rows per batch.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the descriptor columns concatenated on the right.
    """
    df_len = len(df)
    print(f"There are {df_len} SMILES")
    total_batches = (df_len - 1) // batch_size + 1

    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )
    # Placeholder for rows whose molecule could not be built.
    missing_row = (float('nan'),) * len(desc_names)

    all_descriptors = []
    for batch_num, i in enumerate(range(0, df_len, batch_size), start=1):
        time_start = time.time()
        print(f"Processing batch {batch_num} of {total_batches}.")

        batch = df.iloc[i:i + batch_size]
        results = process_batch(batch)

        # Keep exactly one entry per input row; dropping the None results would shift
        # every later descriptor row onto the wrong molecule.
        all_descriptors.extend(
            missing_row if result is None else result[2] for result in results
        )

        time_end = time.time()
        print(f"Processing time: {time_end - time_start:.2f} s for batch {batch_num} of {total_batches}")

    # Reusing df.index makes the column-wise concat a pure side-by-side join.
    descriptors_df = pd.DataFrame(all_descriptors, columns=desc_names, index=df.index)
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

    return df_with_descriptors

# Example usage
# Builds a 100,000-row sample frame (five SMILES repeated 20,000 times, IDs 0..99999) and runs
# the multiprocessing pipeline on it.
# NOTE(review): the saved output of this cell shows the pool workers failing with
# "Can't get attribute 'compute_descriptors'" and the run ending in KeyboardInterrupt.
df = pd.DataFrame({'SMILES': ['CCO', 'CCCC', 'CCN', 'O=C=O', 'C#N'] * 20000, 'ID': range(100000)})
result = main(df)
print(result)


There are 100000 SMILES
Processing batch 1 of 10.


Process SpawnPoolWorker-3:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'compute_descriptors' on <module '__main__' (built-in)>
Process SpawnPoolWorker-2:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/opt/

Process SpawnPoolWorker-15:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'compute_descriptors' on <module '__main__' (built-in)>
Process SpawnPoolWorker-16:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/op

Process SpawnPoolWorker-26:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'compute_descriptors' on <module '__main__' (built-in)>
Process SpawnPoolWorker-27:
Traceback (most recent call last):
  File "/Users/lucaskaras/opt/miniconda3/envs/ullmann_project/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/lucaskaras/op

KeyboardInterrupt: 

In [None]:
import time
import multiprocessing

def compute_descriptors(row):
    """Compute RDKit descriptors for one record.

    ``row`` is read through the keys ``"SMILES"`` and ``"ID"``. Returns ``None`` when
    ``Chem.MolFromSmiles`` returns ``None``; otherwise ``(smiles, mol_id, descriptors)``,
    with the descriptors evaluated on the molecule after explicit hydrogens are added.
    """
    smiles, mol_id = row["SMILES"], row["ID"]

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        molecule_with_h = Chem.AddHs(molecule)
        descriptor_names = [item[0] for item in Descriptors._descList]
        calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
        return smiles, mol_id, calculator.CalcDescriptors(molecule_with_h)
    return None

def process_batch(batch):
    """Run ``compute_descriptors`` on every row of ``batch`` in a pool of worker processes.

    Parameters
    ----------
    batch : pandas.DataFrame
        Slice of the input table; its rows are passed to ``compute_descriptors``, which
        reads the keys ``"SMILES"`` and ``"ID"``.

    Returns
    -------
    list
        One result per row of ``batch``, in row order (``Pool.map`` preserves order).

    Notes
    -----
    Worker processes must be able to import ``compute_descriptors``; with the spawn
    start method a function defined only in the notebook cannot be found by the
    workers, so it should live in an importable ``.py`` module.
    """
    # Iterating a DataFrame yields its column labels, and starmap then unpacked each label
    # character by character, so compute_descriptors never received a row. Pass one plain
    # dict per row through map() instead.
    rows = batch.to_dict("records")
    with multiprocessing.Pool() as p:
        return p.map(compute_descriptors, rows)

# Module-level mirror of the descriptor rows produced by the most recent main() call.
Mol_descriptors = []

def main(df, batch_size=10000):
    """Compute RDKit descriptors for ``df`` batch by batch and append them as columns.

    Each batch is handed to ``process_batch``; timing and progress are printed per
    batch. The function relies on ``process_batch`` returning one entry per row, in
    row order, where an entry is either ``None`` or a ``(smiles, mol_id, descriptors)``
    tuple. ``None`` entries become NaN descriptor rows, so descriptors stay aligned
    with the molecule they belong to.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must provide whatever columns ``process_batch`` needs.
    batch_size : int
        Number of rows per batch.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the descriptor columns concatenated on the right.
    """
    df_len = len(df)
    print(f"There are {len(df)} SMILES")
    # Was computed from an undefined name (smiles_len), which raised NameError.
    total_batches = (df_len - 1) // batch_size + 1

    # desc_names was never defined in this scope; resolve it from the descriptor list.
    desc_names = list(
        MoleculeDescriptors.MolecularDescriptorCalculator(
            [desc[0] for desc in Descriptors._descList]
        ).GetDescriptorNames()
    )
    # Placeholder for rows whose molecule could not be built.
    missing_row = (float('nan'),) * len(desc_names)

    rows = []
    # Start at 0: starting at 1 silently dropped the first row of df.
    for batch_num, i in enumerate(range(0, df_len, batch_size), start=1):
        time_start = time.time()
        print(f"Processing batch {batch_num} of {total_batches}.")

        batch = df.iloc[i:i + batch_size]
        results = process_batch(batch)

        # A None result cannot be unpacked; keep a NaN row in its place so positions
        # still match df.
        rows.extend(missing_row if result is None else result[2] for result in results)

        time_end = time.time()
        print(f"Processing time: {time_end - time_start:.2f} s for {batch_num} of {total_batches}")

    # Replace (rather than append to) the module-level list so repeated runs do not accumulate.
    Mol_descriptors[:] = rows

    # Reusing df.index makes the column-wise concat a pure side-by-side join.
    descriptors_df = pd.DataFrame(rows, columns=desc_names, index=df.index)
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

    return df_with_descriptors

In [None]:
def RDkit_descriptors(df, smiles_column='smiles'):
    """Append one column per RDKit descriptor to ``df``.

    Every row of ``df`` keeps its own descriptors: the descriptor table is built with
    ``df``'s index and has one entry per input row, so a SMILES that cannot be parsed
    yields NaN in the descriptor columns of that row instead of shifting the
    descriptors of all later molecules up by one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the SMILES strings.
    smiles_column : str
        Name of the column in ``df`` that contains the SMILES strings.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` with the descriptor columns concatenated on the right, or ``None``
        (after printing a message) when no SMILES could be parsed.
    """
    # Parse each SMILES exactly once. Non-string cells (e.g. NaN from a blank CSV field)
    # are treated as unparseable rather than being handed to RDKit.
    mols = [
        Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        for smiles in df[smiles_column]
    ]

    if all(mol is None for mol in mols):
        print("No valid molecules found.")
        return None

    calc = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors._descList])
    desc_names = list(calc.GetDescriptorNames())

    # Placeholder row for molecules that failed to parse.
    missing_row = (float('nan'),) * len(desc_names)
    Mol_descriptors = [
        missing_row if mol is None else calc.CalcDescriptors(Chem.AddHs(mol))
        for mol in mols
    ]

    # Reusing df.index makes the column-wise concat a pure side-by-side join,
    # whatever index df carries.
    descriptors_df = pd.DataFrame(Mol_descriptors, columns=desc_names, index=df.index)

    # Concatenate the original DataFrame with the descriptors DataFrame
    df_with_descriptors = pd.concat([df, descriptors_df], axis=1)

    return df_with_descriptors