## <b>Retrieve descriptors</b>
This document retrieves the descriptors of the SMILES strings using RDKit.

In [10]:
# Import all necessary modules
from rdkit import Chem
import pandas as pd
from rdkit.Chem import Descriptors

In [11]:
def get_descriptors_to_csv(csv, csv_descriptors):
    """
    Compute RDKit physicochemical descriptors for the SMILES in a csv file and store them as csv.

    Parameters:
    csv: path of a csv file with a column 'SMILES', which is filled with SMILES strings
    csv_descriptors: path of the new csv file in which the descriptors are stored
    return: the path "csv_descriptors" of the csv file that was written

    The written file has one row per entry of the 'SMILES' column (used as the index) and
    one column per computed descriptor.

    At the time of writing RDKit listed 208 descriptors (the exact count may differ between
    RDKit versions). There are two major categories: (1) physicochemical properties and
    (2) fraction of a substructure (e.g., 'fr_Al_COO'). For most molecules the 2nd category
    descriptors are mostly zeros, so only the 1st category descriptors are computed here.
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    data = pd.read_csv(csv)

    # Names of all descriptors known to RDKit; each entry of _descList is (name, function).
    desc_list = [n[0] for n in Descriptors._descList]
    # Keep category 1 only: drop the substructure-fraction descriptors, which start with 'fr_'.
    phc_desc = [name for name in desc_list if not name.startswith('fr_')]

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(phc_desc)

    # NOTE(review): Chem.MolFromSmiles returns None for a SMILES it cannot parse; such rows
    # are passed to the calculator unchanged -- confirm the resulting values are acceptable.
    mols = [Chem.MolFromSmiles(smi) for smi in data['SMILES']]
    rdkit_desc_sub = [calc.CalcDescriptors(m) for m in mols]

    df = pd.DataFrame(rdkit_desc_sub, columns=phc_desc, index=data['SMILES'])

    # DataFrame.to_csv returns None when it is given a path, so its result must not be
    # returned; hand back the output path instead so the caller knows where the file is.
    df.to_csv(csv_descriptors)

    return csv_descriptors

In [12]:
"""
Run this cell to retrieve the RDKit data in csv format with names as used in the project
"""
# Each pair is (input csv with a 'SMILES' column, output csv for the descriptors).
# The files are processed in the listed order: untested molecules first, then tested.
for smiles_csv, descriptors_csv in (
    ('Data/untested_molecules.csv', 'Data/untested_molecules_descriptors.csv'),
    ('Data/tested_molecules.csv', 'Data/tested_molecules_descriptors.csv'),
):
    get_descriptors_to_csv(smiles_csv, descriptors_csv)