In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from rdkit.DataStructs import ConvertToNumpyArray
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the input table. The 'fingerprints' column stores each fingerprint as a
# text string of '0'/'1' characters; pin it to str so pandas never tries to
# parse it as a number (which would drop leading zeros).
df = pd.read_csv(
    "/Users/u1536635/Downloads/notebooks/high_qed_coconut.csv",
    dtype={"fingerprints": str},
)
df

Unnamed: 0,COCONUT_ID,SMILES,NPLikeness,QED_Drug_Likeness,SyntheticAccessibilityScore,fingerprints
0,CNP0138776.1,C=C(C(=O)O)[C@@H]1CC=C2CC[C@H](O)[C@H](C)[C@@]...,2.959673,0.584990,4.393565,0000000000000000000000000000000001001000000100...
1,CNP0169714.2,C/C1=C\CC[C@]2(C)O[C@H]2/C=C(/C(C)C)C(=O)C/C(C...,2.588206,0.494854,4.633424,0100000100000000000000000100100001001000000000...
2,CNP0478801.0,CC(C)=CCCC(C)=CCC=CC(C)(O)CCC=C(C)C,2.394389,0.506873,3.689921,0000000000000000000000000000100001000010000000...
3,CNP0247812.3,CC1=CC(=O)[C@H](O)[C@]2(C)CC[C@@H]3[C@H](OC(=O...,3.089214,0.672931,4.537491,0000100000000000000000000000000001001000000000...
4,CNP0508026.1,CC(C)=CCC/C(C)=C/COC1=CC=C2C(=O)[C@H](O)[C@@H]...,2.048829,0.434478,3.583235,0000000000000000000000000010100001000010000000...
...,...,...,...,...,...,...
65718,CNP0178353.1,C=C[C@]1(C)C=C2C(=O)C[C@H]3[C@@](C)(CO)CCC[C@]...,3.372805,0.772491,4.607654,0000100000000000000000000000000001001001000000...
65719,CNP0112585.1,CC(C)=CC/C=C(\C)[C@H]1C/C=C(/C)CC/C=C(/C(=O)O)...,2.155840,0.518509,4.175737,0000100000000000000000000100000001000000000100...
65720,CNP0410022.3,C=CC[C@@H]1C[C@@]2(O)C[C@H](C(C)(C)O)OC2=CC1=O,2.892480,0.743535,4.740360,0000100000000000000000000000000001001000000000...
65721,CNP0115213.1,C=C1C(=O)O[C@H]2[C@H]1[C@@H](OC(C)=O)C[C@@](C)...,3.494544,0.547436,4.747529,0000000101010000000000000001001001011000000000...


In [3]:
def calculate_descriptors(smiles):
    """Compute a fixed set of RDKit descriptors for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    dict
        Descriptor name -> value, with the keys "MolecularWeight", "LogP",
        "TPSA", "NumRotatableBonds", "NumAromaticRings", "HBondDonors",
        "HBondAcceptors", "HeavyAtomCount", "MolFractionCSP3", "RingCount",
        "Chi0" and "Chi1".

    Raises
    ------
    ValueError
        If ``smiles`` is not a string (e.g. a missing value read from a CSV)
        or RDKit cannot parse it into a molecule.
    """
    if not isinstance(smiles, str):
        raise ValueError(f"Expected a SMILES string, got {smiles!r}")

    # Chem.MolFromSmiles signals a parse failure by returning None rather than
    # raising; fail here with the offending input instead of letting the first
    # descriptor call blow up with an opaque argument error.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # Calculate standard RDKit descriptors
    descriptors = {
        "MolecularWeight": Descriptors.MolWt(mol),
        "LogP": Crippen.MolLogP(mol),
        "TPSA": Descriptors.TPSA(mol),
        "NumRotatableBonds": Descriptors.NumRotatableBonds(mol),
        "NumAromaticRings": Descriptors.NumAromaticRings(mol),
        "HBondDonors": Descriptors.NumHDonors(mol),
        "HBondAcceptors": Descriptors.NumHAcceptors(mol),
        "HeavyAtomCount": Descriptors.HeavyAtomCount(mol),
        "MolFractionCSP3": Descriptors.FractionCSP3(mol),
        "RingCount": Descriptors.RingCount(mol),
        "Chi0": Descriptors.Chi0(mol),  # zeroth-order molecular connectivity (chi) index
        "Chi1": Descriptors.Chi1(mol),  # first-order molecular connectivity (chi) index
    }
    return descriptors

In [4]:
# Compute one descriptor dict per SMILES string, in the same row order as df.
descriptors_list = [calculate_descriptors(smi) for smi in df['SMILES']]

# Turn the list of dicts into a table: one row per molecule, one column per
# descriptor. Joining it to df happens in a separate step.
descriptors_df = pd.DataFrame(descriptors_list)

In [5]:
# Display the computed descriptor table (one column per descriptor).
descriptors_df

Unnamed: 0,MolecularWeight,LogP,TPSA,NumRotatableBonds,NumAromaticRings,HBondDonors,HBondAcceptors,HeavyAtomCount,MolFractionCSP3,RingCount,Chi0,Chi1
0,250.338,2.76070,57.53,2,0,2,2,18,0.666667,2,13.499636,8.358896
1,302.458,5.15210,29.60,1,0,0,2,22,0.650000,2,16.328063,10.315186
2,290.491,6.12280,20.23,9,0,1,1,21,0.600000,0,16.303119,9.726851
3,264.321,1.47030,63.60,0,0,1,4,19,0.733333,3,13.947229,8.858896
4,424.493,4.84660,96.22,7,2,3,6,31,0.320000,3,22.543241,14.756928
...,...,...,...,...,...,...,...,...,...,...,...,...
65718,302.458,4.29290,37.30,2,0,1,2,22,0.750000,3,16.173362,10.278569
65719,370.577,7.55310,37.30,4,0,1,1,27,0.560000,1,20.233840,12.790601
65720,252.310,1.32630,66.76,3,0,2,4,18,0.642857,2,13.552042,8.259985
65721,324.373,0.80370,93.06,1,0,2,6,23,0.764706,3,17.101930,10.630735


In [6]:
# Place the descriptor columns to the right of the original columns.
# Rows are matched on index label, so both frames must share the same index.
frames_to_join = [df, descriptors_df]
df_desc = pd.concat(frames_to_join, axis="columns")
df_desc

Unnamed: 0,COCONUT_ID,SMILES,NPLikeness,QED_Drug_Likeness,SyntheticAccessibilityScore,fingerprints,MolecularWeight,LogP,TPSA,NumRotatableBonds,NumAromaticRings,HBondDonors,HBondAcceptors,HeavyAtomCount,MolFractionCSP3,RingCount,Chi0,Chi1
0,CNP0138776.1,C=C(C(=O)O)[C@@H]1CC=C2CC[C@H](O)[C@H](C)[C@@]...,2.959673,0.584990,4.393565,0000000000000000000000000000000001001000000100...,250.338,2.76070,57.53,2,0,2,2,18,0.666667,2,13.499636,8.358896
1,CNP0169714.2,C/C1=C\CC[C@]2(C)O[C@H]2/C=C(/C(C)C)C(=O)C/C(C...,2.588206,0.494854,4.633424,0100000100000000000000000100100001001000000000...,302.458,5.15210,29.60,1,0,0,2,22,0.650000,2,16.328063,10.315186
2,CNP0478801.0,CC(C)=CCCC(C)=CCC=CC(C)(O)CCC=C(C)C,2.394389,0.506873,3.689921,0000000000000000000000000000100001000010000000...,290.491,6.12280,20.23,9,0,1,1,21,0.600000,0,16.303119,9.726851
3,CNP0247812.3,CC1=CC(=O)[C@H](O)[C@]2(C)CC[C@@H]3[C@H](OC(=O...,3.089214,0.672931,4.537491,0000100000000000000000000000000001001000000000...,264.321,1.47030,63.60,0,0,1,4,19,0.733333,3,13.947229,8.858896
4,CNP0508026.1,CC(C)=CCC/C(C)=C/COC1=CC=C2C(=O)[C@H](O)[C@@H]...,2.048829,0.434478,3.583235,0000000000000000000000000010100001000010000000...,424.493,4.84660,96.22,7,2,3,6,31,0.320000,3,22.543241,14.756928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65718,CNP0178353.1,C=C[C@]1(C)C=C2C(=O)C[C@H]3[C@@](C)(CO)CCC[C@]...,3.372805,0.772491,4.607654,0000100000000000000000000000000001001001000000...,302.458,4.29290,37.30,2,0,1,2,22,0.750000,3,16.173362,10.278569
65719,CNP0112585.1,CC(C)=CC/C=C(\C)[C@H]1C/C=C(/C)CC/C=C(/C(=O)O)...,2.155840,0.518509,4.175737,0000100000000000000000000100000001000000000100...,370.577,7.55310,37.30,4,0,1,1,27,0.560000,1,20.233840,12.790601
65720,CNP0410022.3,C=CC[C@@H]1C[C@@]2(O)C[C@H](C(C)(C)O)OC2=CC1=O,2.892480,0.743535,4.740360,0000100000000000000000000000000001001000000000...,252.310,1.32630,66.76,3,0,2,4,18,0.642857,2,13.552042,8.259985
65721,CNP0115213.1,C=C1C(=O)O[C@H]2[C@H]1[C@@H](OC(C)=O)C[C@@](C)...,3.494544,0.547436,4.747529,0000000101010000000000000001001001011000000000...,324.373,0.80370,93.06,1,0,2,6,23,0.764706,3,17.101930,10.630735


In [None]:
# The 'fingerprints' column holds each fingerprint as a text string of '0'/'1'
# characters (see the table displayed after loading), not RDKit bit-vector
# objects, so the bits are decoded directly with NumPy.
# Missing values become empty strings and are encoded as all-zero rows.
fingerprint_bits = df['fingerprints'].fillna("").astype(str)

# Every row of the matrix must have the same width; take it from the data and
# refuse to continue if the fingerprints disagree on their length.
bit_lengths = fingerprint_bits.str.len()
bit_lengths = bit_lengths[bit_lengths > 0]
if bit_lengths.nunique() != 1:
    raise ValueError(
        "Expected every fingerprint to have the same number of bits, "
        f"found lengths: {sorted(bit_lengths.unique().tolist())}"
    )
n_bits = int(bit_lengths.iloc[0])

# float32 is a dtype KMeans accepts as-is, so no extra float64 copy is made.
fingerprints_matrix = np.zeros((len(fingerprint_bits), n_bits), dtype=np.float32)
for row, bits in enumerate(fingerprint_bits):
    if not bits:
        continue  # missing fingerprint: keep the all-zero row
    codes = np.frombuffer(bits.encode("ascii"), dtype=np.uint8)
    is_one = codes == ord("1")
    if not (is_one | (codes == ord("0"))).all():
        raise ValueError(
            f"Fingerprint in row {row} contains characters other than '0'/'1'"
        )
    fingerprints_matrix[row] = is_one

# Perform K-Means clustering (you can adjust n_clusters)
kmeans = KMeans(n_clusters=5, random_state=42)
# NOTE(review): df_desc was built from df before this cell, so 'Cluster' is
# added to df only and is not part of df_desc.
df['Cluster'] = kmeans.fit_predict(fingerprints_matrix)

In [8]:
# Save the combined table; index=False leaves the row index out of the file.
df_desc.to_csv("/Users/u1536635/Downloads/notebooks/high_qed_coconut_desc.csv", index=False)