# Getting 3D descriptors

This notebook computes 3D molecular descriptors with RDKit, using both the `rdMolDescriptors` module (https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html) and the `Descriptors3D` module (https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors3D.html).

In [67]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdMolDescriptors

from sklearn.preprocessing import StandardScaler
import pandas as pd

## Create 3D Descriptors

In [26]:
# Read the preprocessed compound/activity table; later cells use its
# 'Molecule ChEMBL ID', 'Standard Value' and 'Smiles' columns
input_file = '../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(filepath_or_buffer=input_file)

#Function to calculate all required 3D descriptors
def calculate_3d_descriptors(smiles, random_seed=42):
    """Compute RDKit 3D descriptors for a single molecule.

    The SMILES string is parsed, explicit hydrogens are added, one 3D
    conformer is embedded with ETKDG, and the descriptors are calculated
    on that conformer.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    random_seed : int, default 42
        Seed handed to the ETKDG embedding so that the generated conformer
        (and therefore every descriptor) is reproducible between runs.
        NOTE(review): RDKit documents -1 as "use a random seed" - confirm
        against the installed RDKit version before relying on it.

    Returns
    -------
    dict
        Maps descriptor name to its value. Most values are scalars;
        'AUTOCORR3D', 'GETAWAY', 'MORSE', 'RDF', 'WHIM' and 'EEMcharges' are
        sequences, and 'CoulombMat' is a sequence of RDKit vectors.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed or no 3D conformer can be embedded.
    """
    mol = Chem.MolFromSmiles(smiles)  # transform smiles to molecular representation
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    # Add hydrogens and generate a 3D conformer
    mol = Chem.AddHs(mol)
    embed_params = AllChem.ETKDG()
    embed_params.randomSeed = random_seed
    if AllChem.EmbedMolecule(mol, embed_params) < 0:
        # EmbedMolecule returns -1 when it fails; without a conformer every
        # 3D descriptor below would fail with a far less helpful message
        raise ValueError(f'3D embedding failed for SMILES: {smiles!r}')

    # Build the lattice-volume object once and query it four times below
    dclv = rdMolDescriptors.DoubleCubicLatticeVolume(mol)

    descriptors = {
        # 3D descriptors from Descriptors3D module
        'PMI1': Descriptors3D.PMI1(mol),
        'PMI2': Descriptors3D.PMI2(mol),
        'PMI3': Descriptors3D.PMI3(mol),
        'Asphericity': Descriptors3D.Asphericity(mol),
        'Eccentricity': Descriptors3D.Eccentricity(mol),
        'InertialShapeFactor': Descriptors3D.InertialShapeFactor(mol),
        'NPR1': Descriptors3D.NPR1(mol),
        'NPR2': Descriptors3D.NPR2(mol),
        'PBF': Descriptors3D.PBF(mol),
        'RadiusOfGyration': Descriptors3D.RadiusOfGyration(mol),
        'SpherocityIndex': Descriptors3D.SpherocityIndex(mol),
        # Additional 3D descriptors from rdMolDescriptors module
        'AUTOCORR3D': rdMolDescriptors.CalcAUTOCORR3D(mol),
        'CoulombMat': rdMolDescriptors.CalcCoulombMat(mol),
        'EEMcharges': rdMolDescriptors.CalcEEMcharges(mol),
        'GETAWAY': rdMolDescriptors.CalcGETAWAY(mol),
        'LabuteASA': rdMolDescriptors.CalcLabuteASA(mol),
        'MORSE': rdMolDescriptors.CalcMORSE(mol),
        'RDF': rdMolDescriptors.CalcRDF(mol),
        'WHIM': rdMolDescriptors.CalcWHIM(mol),
        'DoubleCubicLatticeVolume': dclv.GetVolume(),
        'DoubleCubicLatticeSurfaceArea': dclv.GetSurfaceArea(),
        'DoubleCubicLatticeVDWVolume': dclv.GetVDWVolume(),
        'DoubleCubicLatticePackingDensity': dclv.GetPackingDensity(),
    }

    return descriptors

# Compute one descriptor dictionary per SMILES string
descriptor_list = df['Smiles'].apply(calculate_3d_descriptors)

# Turn the dictionaries into a table: one row per molecule, one column per descriptor
descriptor_df = pd.DataFrame(list(descriptor_list))

# Place the identifier, activity and SMILES columns next to the descriptors
result_df = pd.concat(
    [df.loc[:, ['Molecule ChEMBL ID', 'Standard Value', 'Smiles']], descriptor_df],
    axis=1,
)


## Process descriptors in list or matrix format

In [54]:
### 'GETAWAY', 'WHIM', 'MORSE', 'RDF', and 'AUTOCORR3D' all return lists of descriptors. Need to split into separate columns

#Function to split descriptor lists into individual columns
def split_descriptor_columns(df, descriptor_name):
    """Expand a list-valued column into one numbered column per element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``descriptor_name``, where each cell
        holds a sequence of values.
    descriptor_name : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns (including ``descriptor_name``)
        followed by ``<descriptor_name>_0 ... <descriptor_name>_{n-1}``, where
        ``n`` is the length of the sequence in the first row. An empty input
        is returned as an unchanged copy because ``n`` cannot be determined.
    """
    values = df[descriptor_name]
    if values.empty:
        # No first row to read the vector length from; nothing to expand
        return df.copy()

    cols = [f'{descriptor_name}_{i}' for i in range(len(values.iloc[0]))]  # Create column names
    # Reuse the input index so the axis=1 concat lines rows up one-to-one;
    # a fresh RangeIndex would misalign (and NaN-pad) any frame whose index
    # is not 0..n-1, e.g. after filtering rows
    descriptor_split = pd.DataFrame(values.to_list(), columns=cols, index=df.index)
    return pd.concat([df, descriptor_split], axis=1)  # Concatenate with the original dataframe

# Expand every list-valued descriptor family into numbered columns.
# Vector lengths noted for each family:
#   GETAWAY: 273, WHIM: 114, MORSE: 224, RDF: 210, AUTOCORR3D: 80
list_descriptor_names = ['GETAWAY', 'WHIM', 'MORSE', 'RDF', 'AUTOCORR3D']
result_processed_df = result_df
for descriptor_name in list_descriptor_names:
    result_processed_df = split_descriptor_columns(result_processed_df, descriptor_name)

# Remove the original list-valued columns now that they have been expanded
result_processed_df = result_processed_df.drop(columns=list_descriptor_names)

In [65]:
### Flatten CoulombMat into a single list

# Function to first convert and then flatten Coulomb Matrix
def convert_and_flatten_coulomb_matrix(coulomb_matrix):
    """Flatten a Coulomb matrix (a sequence of row vectors) into one list.

    Rows are visited in order and each row's elements are appended in
    order, so the result is the row-major concatenation of the matrix.
    """
    return [entry for row in coulomb_matrix for entry in row]

# Flatten each molecule's Coulomb matrix, store it as 'CoulombMat_Flat',
# then remove the original nested column
flattened_coulomb = result_processed_df['CoulombMat'].apply(convert_and_flatten_coulomb_matrix)
result_processed_df['CoulombMat_Flat'] = flattened_coulomb
result_processed_df = result_processed_df.drop(columns=['CoulombMat'])

In [66]:
# Save the processed descriptors to a new CSV.
# Write result_processed_df rather than result_df: result_df still holds the
# raw list-valued columns and the unconverted CoulombMat vectors, so the
# column splitting and flattening done above would be missing from the file.
# NOTE(review): 'CoulombMat_Flat' and 'EEMcharges' are still list-valued and
# will be written as their string representation - confirm this is acceptable.
output_file = 'TRPM8-homosapien-compounds-3D-descriptors.csv'
result_processed_df.to_csv(output_file, index=False)

## Standardize the descriptors

In [76]:
# Choose the columns to scale: skip the identifier, activity and SMILES columns
# (matched by substring) and any column whose first entry is a Python list
scalar_columns = [
    col
    for col in result_processed_df.columns
    if not any(marker in col for marker in ('Molecule ChEMBL ID', 'Standard Value', 'Smile'))
    and not isinstance(result_processed_df[col].iloc[0], list)
]

# Fit StandardScaler on the selected columns and overwrite them with the scaled values
scaler = StandardScaler()
result_processed_df[scalar_columns] = scaler.fit_transform(result_processed_df[scalar_columns])

# Preview the scaled dataframe
result_processed_df.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Value,Smiles,PMI1,PMI2,PMI3,Asphericity,Eccentricity,InertialShapeFactor,NPR1,...,AUTOCORR3D_71,AUTOCORR3D_72,AUTOCORR3D_73,AUTOCORR3D_74,AUTOCORR3D_75,AUTOCORR3D_76,AUTOCORR3D_77,AUTOCORR3D_78,AUTOCORR3D_79,CoulombMat_Flat
0,CHEMBL3235962,83.0,N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...,0.112705,-0.496536,-0.310904,-0.496557,-0.117515,-0.523687,0.359149,...,-0.02821,0.103538,0.188538,0.245796,0.268679,0.411837,0.597482,0.89252,0.254089,"[53.3587073998281, 35.758163453120474, 16.1174..."
1,CHEMBL3235983,10.0,C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...,0.143056,-0.752304,-0.576962,-0.750932,-0.691936,-0.549754,0.797698,...,0.034224,0.174319,0.335353,0.329849,0.491292,0.681388,0.262094,-0.008435,-1.020654,"[36.85810519942594, 24.11129171571103, 17.2603..."
2,CHEMBL1650511,0.413,FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...,-0.504488,0.348173,0.16773,0.593045,0.883502,-0.162446,-0.78633,...,-0.090644,-0.132399,-0.168013,-0.447638,-0.751632,-0.351891,-0.247694,-0.329324,0.417518,"[97.53309975386802, 38.268797199071656, 36.413..."
3,CHEMBL2443068,230.4,O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...,-1.049042,0.861965,0.489541,1.596557,1.168956,0.575993,-1.451432,...,0.190309,0.386663,0.125617,-0.468651,-1.029898,-1.100644,-1.21361,-0.860024,-0.040083,"[73.51669471981023, 38.65501109193421, 19.0095..."
4,CHEMBL3959823,870.0,Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...,-0.240023,-1.194536,-1.194286,-1.085958,-1.995113,-0.469889,1.560771,...,-0.496465,-0.509899,-0.503591,0.014651,0.658252,0.831139,0.718221,-0.057803,-1.129607,"[36.85810519942594, 24.219014903672278, 14.535..."
