Getting physicochemical descriptors
This notebook computes physicochemical descriptors for each compound using RDKit's `rdkit.Chem.Descriptors` module

In [4]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the preprocessed compound/activity table into a DataFrame
input_file = '../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
df = pd.read_csv(filepath_or_buffer=input_file)

# Output column name -> name of the rdkit.Chem.Descriptors function that fills it.
# The order of this table is the column order of the resulting descriptor row.
# NOTE: 'satureatedRings' is misspelled, but it is an output column name, so it
# is kept unchanged to avoid breaking anything that reads the generated CSV.
_DESCRIPTOR_FUNCTIONS = (
    ('mw', 'MolWt'),
    ('mw_H', 'HeavyAtomMolWt'),
    ('qed', 'qed'),
    ('max_charge', 'MaxPartialCharge'),
    ('min_charge', 'MinPartialCharge'),
    ('max_abs_charge', 'MaxAbsPartialCharge'),
    ('min_abs_charge', 'MinAbsPartialCharge'),
    ('aliphaticCarbocycles', 'NumAliphaticCarbocycles'),
    ('aliphaticHeterocycles', 'NumAliphaticHeterocycles'),
    ('aliphaticRings', 'NumAliphaticRings'),
    ('aromaticCarbocycles', 'NumAromaticCarbocycles'),
    ('aromaticHeterocycles', 'NumAromaticHeterocycles'),
    ('aromaticRings', 'NumAromaticRings'),
    ('HAcceptors', 'NumHAcceptors'),
    ('HDonors', 'NumHDonors'),
    ('heteroatoms', 'NumHeteroatoms'),
    ('rotatableBonds', 'NumRotatableBonds'),
    ('saturatedCarbocycles', 'NumSaturatedCarbocycles'),
    ('saturatedHeterocycles', 'NumSaturatedHeterocycles'),
    ('satureatedRings', 'NumSaturatedRings'),
    ('ringCount', 'RingCount'),
    ('molLogP', 'MolLogP'),
    ('molMR', 'MolMR'),
)


def calculate_physicochemical_descriptors(smiles):
    """Compute the physicochemical descriptor row for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Any non-string value (for
        example the NaN/None that pandas uses for a missing cell) is treated
        as an unparsable molecule.

    Returns
    -------
    dict
        One entry per descriptor, keyed by the column names listed in
        ``_DESCRIPTOR_FUNCTIONS`` and in that order. If the input cannot be
        turned into a molecule, every value is NaN so that the row still
        lines up with its source row instead of aborting the whole run.
    """
    # MolFromSmiles returns None when it cannot parse the string, and it does
    # not accept non-string input, so both cases are funnelled into `None`.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    if mol is None:
        return {key: float('nan') for key, _ in _DESCRIPTOR_FUNCTIONS}

    # Look each descriptor function up by name on the Descriptors module and
    # evaluate it on the parsed molecule.
    return {
        key: getattr(Descriptors, function_name)(mol)
        for key, function_name in _DESCRIPTOR_FUNCTIONS
    }



# Compute one descriptor dictionary per compound from its SMILES string
descriptor_list = df['Smiles'].apply(calculate_physicochemical_descriptors)

# Expand the dictionaries into a table with one column per descriptor
descriptor_df = pd.DataFrame(list(descriptor_list))

# Place the ChEMBL identifier column next to the descriptor columns
result_df = pd.concat(
    [df[['Molecule ChEMBL ID']], descriptor_df],
    axis='columns',
)
# Write the unscaled descriptor table to disk without the row index
output_file = 'physicochemical-descriptors.csv'
result_df.to_csv(output_file, index=False)

In [6]:
# Choose the columns to standardize: everything except the identifier column
# and any column whose first value is a list
scalar_columns = [
    col for col in result_df.columns
    if 'Molecule ChEMBL ID' not in col
    and not isinstance(result_df[col].iloc[0], list)
]

# Standardize the selected columns with StandardScaler, overwriting them in result_df
scaler = StandardScaler()
result_df[scalar_columns] = scaler.fit_transform(result_df[scalar_columns])

# Save the standardized table. index=False keeps the file layout consistent with
# 'physicochemical-descriptors.csv'; without it pandas prepends an unnamed
# row-index column that would be read back as an extra field.
result_df.to_csv('physicochemical-descriptors-standardized.csv', index=False)