In [10]:
!pip install rdkit-pypi
!pip install mordred



In [11]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
from mordred import Calculator, descriptors as mordred_descriptors
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [12]:
# Read the molecule table from disk; later cells take their SMILES strings
# from its `smiles` column.
data = pd.read_csv(filepath_or_buffer='sample_qm9.csv')

In [13]:
def get_rdkit_descriptors(smiles):
    """Compute every descriptor in ``Descriptors.descList`` for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Maps each descriptor name in ``Descriptors.descList`` to its value.
        When ``smiles`` is not a string (e.g. a NaN from a blank CSV cell) or
        RDKit cannot parse it, every descriptor maps to NaN, so the key set
        is the same for every input and one bad row cannot abort a whole
        ``Series.apply`` run.
    """
    # MolFromSmiles returns None for unparsable input; the descriptor
    # functions cannot be called on None, so short-circuit in that case.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {desc_name: float('nan') for desc_name, _ in Descriptors.descList}
    return {desc_name: desc_func(mol) for desc_name, desc_func in Descriptors.descList}
# Lazily-built Mordred calculator shared by all calls (see helper below).
_MORDRED_CALCULATOR = None


def _get_mordred_calculator():
    """Return the shared 2D Mordred calculator, building it on first use."""
    global _MORDRED_CALCULATOR
    if _MORDRED_CALCULATOR is None:
        _MORDRED_CALCULATOR = Calculator(mordred_descriptors, ignore_3D=True)
    return _MORDRED_CALCULATOR


def get_mordred_descriptors(smiles):
    """Compute the Mordred descriptors (3D ones excluded) for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    dict
        Maps each descriptor name to its value. Descriptors that Mordred
        could not compute are reported as NaN instead of Mordred's
        placeholder objects. When ``smiles`` is not a string or RDKit cannot
        parse it, every descriptor of the calculator maps to NaN, so the key
        set is the same for every input.
    """
    # The calculator does not depend on the molecule, so it is built once
    # and reused instead of being re-created for every row.
    calc = _get_mordred_calculator()

    # MolFromSmiles returns None for unparsable input.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {str(descriptor): float('nan') for descriptor in calc.descriptors}

    # fill_missing() swaps Mordred's missing/error placeholders for NaN so
    # the resulting table holds only plain values.
    return calc(mol).fill_missing().asdict()

In [14]:
# Build the RDKit descriptor table in one step from the per-molecule dicts
# (expanding each dict with `.apply(pd.Series)` creates one Series per row and
# is much slower). Passing `index=data.index` keeps rows aligned with `data`.
rdkit_descriptors_df = pd.DataFrame(
    data['smiles'].map(get_rdkit_descriptors).tolist(),
    index=data.index,
)

In [15]:
# Build the Mordred descriptor table in one step from the per-molecule dicts
# (expanding each dict with `.apply(pd.Series)` creates one Series per row and
# is much slower). Passing `index=data.index` keeps rows aligned with `data`.
mordred_descriptors_df = pd.DataFrame(
    data['smiles'].map(get_mordred_descriptors).tolist(),
    index=data.index,
)

In [16]:
# Join the original columns with both descriptor tables side by side
# (axis=1: column-wise, rows aligned on the index).
# NOTE(review): RDKit and Mordred may emit descriptors under the same name
# (e.g. BalabanJ, BertzCT), which would leave duplicate column labels here —
# confirm and de-duplicate if downstream code selects columns by name.
data_with_descriptors = pd.concat([data, rdkit_descriptors_df, mordred_descriptors_df], axis=1)

# Bare last expression so the notebook renders the preview as a rich table
# instead of wrapped plain text.
data_with_descriptors.head()

       mol_id           smiles        A        B        C      mu  alpha  \
0  gdb_110298  CCC1C2C(O)C2C1C  2.08389  1.40096  1.20534  1.3323  85.23   
1    gdb_2851       OCC1CC1C=O  5.87951  1.42621  1.24696  2.6127  59.30   
2   gdb_18078    CC#CC1C2COC12  4.83398  1.21834  1.13454  1.4475  73.84   
3   gdb_36538  N#CCN=C1OC2CC12  4.57418  0.90022  0.87379  5.8752  73.04   
4   gdb_91667  OC1CC2OC1C21CN1  2.56991  1.73865  1.37865  2.1139  71.36   

     homo    lumo     gap  ...      SRW10     TSRW10          MW       AMW  \
0 -0.2424  0.0680  0.3103  ...  10.002020  62.170441  126.104465  5.482803   
1 -0.2483 -0.0195  0.2288  ...   8.673342  54.073722  100.052429  6.670162   
2 -0.2224  0.0542  0.2766  ...   9.585965  59.659094  108.057515  6.753595   
3 -0.2754 -0.0059  0.2695  ...   9.599066  60.357687  122.048013  8.136534   
4 -0.2450  0.0652  0.3102  ...  10.186521  63.046892  127.063329  7.059074   

   WPath  WPol  Zagreb1  Zagreb2  mZagreb1  mZagreb2  
0     84     9     

In [17]:
# Persist the combined table as CSV; the row index is not written.
data_with_descriptors.to_csv(path_or_buf='descriptors.csv', index=False)