# Getting 3D descriptors

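This notebook computes 3D molecular descriptors with RDKit, using both the `rdMolDescriptors` module (https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html) and the `Descriptors3D` module (https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors3D.html). The descriptors are written to CSV twice: once as computed and once standardized.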

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdMolDescriptors

from sklearn.preprocessing import StandardScaler
import pandas as pd

## Create 3D Descriptors

In [2]:
# Read the preprocessed compound/activity table written by the preprocessing step
input_file = '../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(filepath_or_buffer=input_file)

#Function to calculate all required 3D descriptors
def calculate_3d_descriptors(smiles, random_seed=42):
    """Compute RDKit 3D descriptors for a single molecule.

    The SMILES string is parsed, explicit hydrogens are added, one 3D
    conformer is embedded with ETKDG and then passed to the MMFF optimizer.
    All descriptors are computed on that conformer.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    random_seed : int, optional
        Seed for the conformer embedding so that repeated runs yield the
        same geometry and therefore the same descriptor values.

    Returns
    -------
    dict
        Descriptor name -> value. 'AUTOCORR3D', 'GETAWAY', 'MORSE', 'RDF'
        and 'WHIM' hold the vector returned by the corresponding
        ``rdMolDescriptors.Calc*`` function; all other entries are scalars.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed or no 3D conformer can be generated.
    """
    mol = Chem.MolFromSmiles(smiles)  # transform smiles to molecular representation
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    # Add hydrogens and generate a 3D conformer
    mol = Chem.AddHs(mol)
    embed_params = AllChem.ETKDG()
    embed_params.randomSeed = random_seed
    conf_id = AllChem.EmbedMolecule(mol, embed_params)
    if conf_id < 0:
        # EmbedMolecule returns -1 on failure; retry from random starting coordinates
        embed_params.useRandomCoords = True
        conf_id = AllChem.EmbedMolecule(mol, embed_params)
    if conf_id < 0:
        raise ValueError(f'Could not generate a 3D conformer for SMILES: {smiles!r}')

    # NOTE(review): if MMFF lacks parameters for the molecule the embedded
    # geometry is presumably left unoptimized - confirm this is acceptable.
    AllChem.MMFFOptimizeMolecule(mol)

    descriptors = {}
    # 3D descriptors from Descriptors3D module
    descriptors['PMI1'] = Descriptors3D.PMI1(mol)
    descriptors['PMI2'] = Descriptors3D.PMI2(mol)
    descriptors['PMI3'] = Descriptors3D.PMI3(mol)
    descriptors['Asphericity'] = Descriptors3D.Asphericity(mol)
    descriptors['Eccentricity'] = Descriptors3D.Eccentricity(mol)
    descriptors['InertialShapeFactor'] = Descriptors3D.InertialShapeFactor(mol)
    descriptors['NPR1'] = Descriptors3D.NPR1(mol)
    descriptors['NPR2'] = Descriptors3D.NPR2(mol)
    descriptors['PBF'] = Descriptors3D.PBF(mol)
    descriptors['RadiusOfGyration'] = Descriptors3D.RadiusOfGyration(mol)
    descriptors['SpherocityIndex'] = Descriptors3D.SpherocityIndex(mol)

    # Additional 3D descriptors from rdMolDescriptors module
    descriptors['AUTOCORR3D'] = rdMolDescriptors.CalcAUTOCORR3D(mol)
    #descriptors['CoulombMat'] = rdMolDescriptors.CalcCoulombMat(mol)
    #descriptors['EEMcharges'] = rdMolDescriptors.CalcEEMcharges(mol)
    descriptors['GETAWAY'] = rdMolDescriptors.CalcGETAWAY(mol)
    descriptors['LabuteASA'] = rdMolDescriptors.CalcLabuteASA(mol)
    descriptors['MORSE'] = rdMolDescriptors.CalcMORSE(mol)
    descriptors['RDF'] = rdMolDescriptors.CalcRDF(mol)
    descriptors['WHIM'] = rdMolDescriptors.CalcWHIM(mol)

    # Build the lattice object once and read all four quantities from it
    lattice = rdMolDescriptors.DoubleCubicLatticeVolume(mol)
    descriptors['DoubleCubicLatticeVolume'] = lattice.GetVolume()
    descriptors['DoubleCubicLatticeSurfaceArea'] = lattice.GetSurfaceArea()
    descriptors['DoubleCubicLatticeVDWVolume'] = lattice.GetVDWVolume()
    descriptors['DoubleCubicLatticePackingDensity'] = lattice.GetPackingDensity()

    return descriptors

# Compute one descriptor dictionary per SMILES string
descriptor_list = df['Smiles'].apply(func=calculate_3d_descriptors)

# Turn the dictionaries into a table with one row per molecule
descriptor_records = descriptor_list.tolist()
descriptor_df = pd.DataFrame(descriptor_records)

# Put the ChEMBL identifier column next to the computed descriptors
chembl_ids = df[['Molecule ChEMBL ID']]
result_df = pd.concat([chembl_ids, descriptor_df], axis=1)


## Process descriptors in list or matrix format

In [3]:
### 'GETAWAY', 'WHIM', 'MORSE', 'RDF', and 'AUTOCORR3D' all return lists of descriptors. Need to split into separate columns

#Function to split descriptor lists into individual columns
def split_descriptor_columns(df, descriptor_name):
    """Expand a column of equal-length vectors into one column per component.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a list-like value in ``descriptor_name`` for every row.
    descriptor_name : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``df`` followed by
        ``{descriptor_name}_0 ... {descriptor_name}_{n-1}``, where ``n`` is
        the length of the vector in the first row. An empty ``df`` is
        returned as an unchanged copy because ``n`` cannot be determined.
    """
    if len(df) == 0:
        # No first row to read the vector length from
        return df.copy()

    vectors = df[descriptor_name].to_list()
    cols = [f'{descriptor_name}_{i}' for i in range(len(vectors[0]))]  # Create column names
    # Reuse the caller's index so concat pairs rows positionally instead of
    # aligning a fresh 0..n-1 index against df's labels (which yields NaN rows)
    descriptor_split = pd.DataFrame(vectors, columns=cols, index=df.index)
    return pd.concat([df, descriptor_split], axis=1)  # Concatenate with the original dataframe

# Vector-valued descriptors, in the order their component columns are appended:
# GETAWAY (273 values), WHIM (114), MORSE (224), RDF (210), AUTOCORR3D (80)
vector_descriptor_names = ['GETAWAY', 'WHIM', 'MORSE', 'RDF', 'AUTOCORR3D']

result_processed_df = result_df
for vector_name in vector_descriptor_names:
    result_processed_df = split_descriptor_columns(result_processed_df, vector_name)

# The list-valued source columns are redundant once expanded
result_processed_df = result_processed_df.drop(columns=['GETAWAY', 'WHIM', 'MORSE', 'RDF', 'AUTOCORR3D'])

# Write the unscaled descriptor table to disk
output_file = '3D-descriptors.csv'
result_processed_df.to_csv(output_file, index=False)

## Standardize the descriptors

In [4]:
# Work on a copy so the unscaled descriptors remain available
result_scaled_df = result_processed_df.copy()

# Columns to scale: everything except the identifier column and any column
# whose first value is still a list
scalar_columns = [
    col for col in result_scaled_df.columns
    if col != 'Molecule ChEMBL ID'
    and not isinstance(result_scaled_df[col].iloc[0], list)
]

# Apply StandardScaler to scalar descriptors
scaler = StandardScaler()
result_scaled_df[scalar_columns] = scaler.fit_transform(result_scaled_df[scalar_columns])

# Save the scaled dataframe
output_file = '3D-descriptors-standardized.csv'
result_scaled_df.to_csv(output_file, index=False)

# Preview the scaled data; this must be the cell's last expression to be displayed
result_scaled_df.head()