In [2]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# Load your dataset
df = pd.read_csv("merged_pubchem_properties.csv")
smiles_list = df["SMILES"].dropna().tolist()

# Set up descriptor list
descriptor_names = [desc[0] for desc in Descriptors._descList]
descriptor_funcs = [desc[1] for desc in Descriptors._descList]

# Function to compute descriptors safely
def compute_rdkit_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(descriptor_funcs)

    values = []
    for func in descriptor_funcs:
        try:
            val = func(mol)
        except Exception:
            val = None
        values.append(val)
    return values

# Compute descriptors with progress tracking
descriptor_data = []
for i, sm in enumerate(smiles_list):
    row = compute_rdkit_descriptors(sm)
    descriptor_data.append(row)

    if i % 100 == 0:
        print(f"...processed {i} SMILES")

# Convert to DataFrame
rdkit_df = pd.DataFrame(descriptor_data, columns=descriptor_names)
rdkit_df["SMILES"] = smiles_list

# Merge with original data
df_descriptors = pd.merge(df, rdkit_df, on="SMILES", how="inner")
df_descriptors.to_csv("phytochemicals_with_rdkit_descriptors.csv", index=False)
print("✅ RDKit descriptor matrix complete and saved.")


...processed 0 SMILES
...processed 100 SMILES
...processed 200 SMILES
...processed 300 SMILES
...processed 400 SMILES
...processed 500 SMILES
...processed 600 SMILES
...processed 700 SMILES
...processed 800 SMILES




...processed 900 SMILES
...processed 1000 SMILES
...processed 1100 SMILES
...processed 1200 SMILES
...processed 1300 SMILES
...processed 1400 SMILES
...processed 1500 SMILES
...processed 1600 SMILES
...processed 1700 SMILES
...processed 1800 SMILES
...processed 1900 SMILES
...processed 2000 SMILES
...processed 2100 SMILES
...processed 2200 SMILES
...processed 2300 SMILES
...processed 2400 SMILES
...processed 2500 SMILES
...processed 2600 SMILES
...processed 2700 SMILES
...processed 2800 SMILES
...processed 2900 SMILES
...processed 3000 SMILES
...processed 3100 SMILES
...processed 3200 SMILES
...processed 3300 SMILES
...processed 3400 SMILES
...processed 3500 SMILES
...processed 3600 SMILES
...processed 3700 SMILES
...processed 3800 SMILES
...processed 3900 SMILES
...processed 4000 SMILES
...processed 4100 SMILES
...processed 4200 SMILES
...processed 4300 SMILES
...processed 4400 SMILES
...processed 4500 SMILES
...processed 4600 SMILES
...processed 4700 SMILES
...processed 4800 SMILES
.



...processed 7700 SMILES
...processed 7800 SMILES
...processed 7900 SMILES
...processed 8000 SMILES
...processed 8100 SMILES
...processed 8200 SMILES
...processed 8300 SMILES
...processed 8400 SMILES
...processed 8500 SMILES
...processed 8600 SMILES
...processed 8700 SMILES
...processed 8800 SMILES
...processed 8900 SMILES
...processed 9000 SMILES
...processed 9100 SMILES
...processed 9200 SMILES
✅ RDKit descriptor matrix complete and saved.


In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import pandas as pd

# Load your descriptor-merged dataset
df_descriptors = pd.read_csv("phytochemicals_with_rdkit_descriptors.csv")
smiles_list = df_descriptors["SMILES"].dropna().tolist()

# Function to convert SMILES to Morgan fingerprint
def smiles_to_fingerprint(smiles, radius=2, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        return np.array(fp)
    except:
        return None

# Generate fingerprints with progress checks
fps = []
valid_smiles = []

for i, sm in enumerate(smiles_list):
    fp = smiles_to_fingerprint(sm)
    if fp is not None:
        fps.append(fp)
        valid_smiles.append(sm)
    
    if i % 100 == 0:
        print(f"...fingerprinted {i} SMILES")

# Convert to DataFrame
fp_matrix = np.vstack(fps)
fp_df = pd.DataFrame(fp_matrix, columns=[f"FP_{j}" for j in range(fp_matrix.shape[1])])
fp_df["SMILES"] = valid_smiles

# Merge fingerprints with descriptor data
df_full = pd.merge(df_descriptors, fp_df, on="SMILES", how="inner")

# Save full dataset
df_full.to_csv("phytochemicals_full_features.csv", index=False)
print("✅ Fingerprint + descriptor features saved to 'phytochemicals_full_features.csv'")




...fingerprinted 0 SMILES
...fingerprinted 100 SMILES
...fingerprinted 200 SMILES




...fingerprinted 300 SMILES
...fingerprinted 400 SMILES
...fingerprinted 500 SMILES
...fingerprinted 600 SMILES




...fingerprinted 700 SMILES
...fingerprinted 800 SMILES
...fingerprinted 900 SMILES
...fingerprinted 1000 SMILES




...fingerprinted 1100 SMILES
...fingerprinted 1200 SMILES
...fingerprinted 1300 SMILES
...fingerprinted 1400 SMILES




...fingerprinted 1500 SMILES
...fingerprinted 1600 SMILES
...fingerprinted 1700 SMILES
...fingerprinted 1800 SMILES
...fingerprinted 1900 SMILES




...fingerprinted 2000 SMILES
...fingerprinted 2100 SMILES
...fingerprinted 2200 SMILES
...fingerprinted 2300 SMILES




...fingerprinted 2400 SMILES
...fingerprinted 2500 SMILES
...fingerprinted 2600 SMILES
...fingerprinted 2700 SMILES
...fingerprinted 2800 SMILES




...fingerprinted 2900 SMILES
...fingerprinted 3000 SMILES
...fingerprinted 3100 SMILES
...fingerprinted 3200 SMILES




...fingerprinted 3300 SMILES
...fingerprinted 3400 SMILES
...fingerprinted 3500 SMILES
...fingerprinted 3600 SMILES




...fingerprinted 3700 SMILES
...fingerprinted 3800 SMILES
...fingerprinted 3900 SMILES




...fingerprinted 4000 SMILES
...fingerprinted 4100 SMILES
...fingerprinted 4200 SMILES
...fingerprinted 4300 SMILES
...fingerprinted 4400 SMILES




...fingerprinted 4500 SMILES
...fingerprinted 4600 SMILES
...fingerprinted 4700 SMILES




...fingerprinted 4800 SMILES
...fingerprinted 4900 SMILES
...fingerprinted 5000 SMILES




...fingerprinted 5100 SMILES
...fingerprinted 5200 SMILES
...fingerprinted 5300 SMILES
...fingerprinted 5400 SMILES




...fingerprinted 5500 SMILES
...fingerprinted 5600 SMILES
...fingerprinted 5700 SMILES
...fingerprinted 5800 SMILES




...fingerprinted 5900 SMILES
...fingerprinted 6000 SMILES
...fingerprinted 6100 SMILES
...fingerprinted 6200 SMILES
...fingerprinted 6300 SMILES




...fingerprinted 6400 SMILES
...fingerprinted 6500 SMILES
...fingerprinted 6600 SMILES
...fingerprinted 6700 SMILES




...fingerprinted 6800 SMILES
...fingerprinted 6900 SMILES
...fingerprinted 7000 SMILES




...fingerprinted 7100 SMILES
...fingerprinted 7200 SMILES
...fingerprinted 7300 SMILES




...fingerprinted 7400 SMILES
...fingerprinted 7500 SMILES
...fingerprinted 7600 SMILES
...fingerprinted 7700 SMILES




...fingerprinted 7800 SMILES
...fingerprinted 7900 SMILES
...fingerprinted 8000 SMILES
...fingerprinted 8100 SMILES




...fingerprinted 8200 SMILES
...fingerprinted 8300 SMILES
...fingerprinted 8400 SMILES
...fingerprinted 8500 SMILES




...fingerprinted 8600 SMILES
...fingerprinted 8700 SMILES
...fingerprinted 8800 SMILES
...fingerprinted 8900 SMILES




...fingerprinted 9000 SMILES
...fingerprinted 9100 SMILES
...fingerprinted 9200 SMILES
...fingerprinted 9300 SMILES




...fingerprinted 9400 SMILES
...fingerprinted 9500 SMILES
...fingerprinted 9600 SMILES
...fingerprinted 9700 SMILES




...fingerprinted 9800 SMILES
...fingerprinted 9900 SMILES
...fingerprinted 10000 SMILES
...fingerprinted 10100 SMILES
...fingerprinted 10200 SMILES




...fingerprinted 10300 SMILES
...fingerprinted 10400 SMILES
...fingerprinted 10500 SMILES




...fingerprinted 10600 SMILES
...fingerprinted 10700 SMILES
...fingerprinted 10800 SMILES
...fingerprinted 10900 SMILES
...fingerprinted 11000 SMILES




...fingerprinted 11100 SMILES
...fingerprinted 11200 SMILES
...fingerprinted 11300 SMILES




...fingerprinted 11400 SMILES
...fingerprinted 11500 SMILES
...fingerprinted 11600 SMILES
...fingerprinted 11700 SMILES
...fingerprinted 11800 SMILES




...fingerprinted 11900 SMILES
✅ Fingerprint + descriptor features saved to 'phytochemicals_full_features.csv'
