#### - Converts SMILES strings to Chemical Formulas
#### - Calculates Molecular Weights from Chemical Formulas
#### - Appends Values to the Dataset
#### - Example with Toxicity Dataset

# Packages

In [1]:
# Importing packages
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem import AllChem as Chem
import re

In [2]:
# Load the toxicity dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data_set.csv')

## Convert SMILES to Chemical Formula

In [3]:
# Convert the "SMILES" column to pandas' dedicated string dtype
df['SMILES'] = df['SMILES'].astype(pd.StringDtype())

In [4]:
# Calculate the chemical formula for every SMILES string in the dataset.
# Each row gets one of: the formula, "Invalid SMILES" (RDKit could not parse
# the string), or NaN (the SMILES value itself is missing).

formulas = []

# Select the column by name rather than by position: once this cell has run,
# 'formula' is the last column, so df.iloc[:, -1] would read the wrong column
# on a re-run.
for smiles in df['SMILES']:

  if pd.isna(smiles):
    # Missing SMILES: keep the row and record a missing formula.
    # float("nan") is used because numpy is not imported in this notebook.
    formulas.append(float("nan"))
    continue

  mol = Chem.MolFromSmiles(smiles)
  if mol:
    formulas.append(CalcMolFormula(mol))
  else:
    # Sentinel value; rows carrying it are removed in the next cell
    formulas.append("Invalid SMILES")


df['formula'] = formulas

[15:03:19] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[15:03:19] Explicit valence for atom # 3 Si, 8, is greater than permitted
[15:03:19] Explicit valence for atom # 0 Cl, 2, is greater than permitted


In [5]:
# Keep only the rows whose 'formula' is not the "Invalid SMILES" sentinel
df = df.loc[df['formula'].ne("Invalid SMILES")]

## Calculate MW from Chemical Formula

In [6]:
# Function calculates MW from Chemical Formula

def get_mass(formula):
    """Return the molecular weight for a chemical formula string.

    The formula is read as a sequence of (element symbol, optional count)
    pairs, e.g. "C7H5N3O2". A symbol without a count contributes once.
    Characters that are not part of such a pair (for example a charge suffix
    such as "+2") are ignored, so the digits of that suffix are not mistaken
    for the count of the preceding element.

    Parameters
    ----------
    formula : str or missing value
        Chemical formula. None / NaN / pd.NA are treated as "no formula".

    Returns
    -------
    float
        Sum of atom mass times count over all elements; NaN when ``formula``
        is missing. An empty string yields 0.
    """
    # A missing formula cannot be parsed; propagate it as NaN instead of
    # letting re.findall raise a TypeError on a non-string.
    if pd.isna(formula):
        return float("nan")

    mass = 0

    # Capturing symbol and count together ties each count to its own element.
    for symbol, count in re.findall(r"([A-Z][a-z]?)([0-9]*)", formula):
        multiplier = int(count) if count else 1
        mass += Chem.Atom(symbol).GetMass() * multiplier

    return mass

In [7]:
# Compute the molecular weight of each formula and store it in a new 'MW' column
df['MW'] = df['formula'].map(get_mass)

## Final DataFrame

In [8]:
# Display the first five rows of the resulting DataFrame
df.head(n=5)

Unnamed: 0,Name,Toxicity,SMILES,formula,MW
0,NCGC00260230-01,0,F[P-](F)(F)(F)(F)F.CCCC[N+]1=CC=CC(C)=C1,C10H16F6NP,295.207
1,NCGC00184995-01,1,[H][C@@]12CC[C@H](OP(O)(O)=O)[C@@]1(C)CC[C@]3(...,C23H32Cl2NO6P,520.39
2,NCGC00260471-01,0,[O-][N+](=O)C1=CC=C2NN=CC2=C1,C7H5N3O2,163.136
3,NCGC00256746-01,0,CCC1=NC=CN=C1C,C7H10N2,122.171
4,NCGC00183024-01,1,CCCN(CCC)C(=O)C(CCC(=O)OCCCN1CCN(CCOC(=O)CC2=C...,C46H58ClN5O8,844.45


## Exporting the New Dataset to CSV

In [9]:
# Write df (including its index) to 'new_dataset.csv'; the path is relative,
# so the file is created in the current working directory
df.to_csv(path_or_buf='new_dataset.csv')