In [1]:
from rdkit import Chem
import numpy as np  
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the raw table: a "SMILES" column plus spectrum columns named "0" .. "2231".
raw_data = pd.read_csv("NIST Gaseous IR Dataset.csv")

smiles = np.array(raw_data["SMILES"])

# Build the spectra matrix straight from the frame so its row count always
# matches `smiles`. The earlier hard-coded 8361 rows was one short of the 8362
# SMILES the notebook reports downstream, which silently dropped the last
# spectrum and left `sequences` misaligned with `smiles` for later indexing.
spectrum_columns = [str(i) for i in range(2232)]
sequences = raw_data[spectrum_columns].to_numpy(dtype=np.float64)

In [16]:
def not_organic(sm):
    """Return True when the molecule has no carbon atom bonded to a hydrogen.

    The SMILES is parsed with RDKit and hydrogens are made explicit via
    ``Chem.AddHs`` so that they show up as neighbours of the atoms they are
    attached to.

    Args:
        sm: SMILES string of the molecule.

    Returns:
        bool: False as soon as any carbon (atomic number 6) has a hydrogen
        (atomic number 1) neighbour, True otherwise.
    """
    mol_with_hs = Chem.AddHs(Chem.MolFromSmiles(sm))
    # Walk every carbon's neighbours looking for at least one C-H contact.
    has_ch_bond = any(
        neighbor.GetAtomicNum() == 1
        for atom in mol_with_hs.GetAtoms()
        if atom.GetAtomicNum() == 6
        for neighbor in atom.GetNeighbors()
    )
    return not has_ch_bond
    
def too_big(sm, max_atoms=25):
    """Return True when the parsed molecule has at least ``max_atoms`` atoms.

    The size is whatever ``GetNumAtoms()`` reports for the molecule returned
    by ``Chem.MolFromSmiles``; no ``AddHs`` call is made here, so hydrogens
    that are only implicit in the SMILES are presumably not counted
    (NOTE(review): confirm against the RDKit docs if that matters).

    A previous version summed per-atom hydrogen counts into a local that was
    never used; that dead loop has been removed without changing the result.

    Args:
        sm: SMILES string of the molecule.
        max_atoms: Inclusive cutoff; molecules with this many atoms or more
            are considered too big. Defaults to 25, the value the notebook
            has always used.

    Returns:
        bool: True if the atom count is >= ``max_atoms``.
    """
    molecule = Chem.MolFromSmiles(sm)
    return molecule.GetNumAtoms() >= max_atoms

def has_charged_center(sm):
    """Return True if the SMILES string declares a formal charge on any atom.

    In SMILES, charges are written only inside bracket atoms (``[NH4+]``,
    ``[O-]``), while a ``-`` outside brackets is the explicit single-bond
    symbol. A plain substring test for ``+``/``-`` therefore misreports
    neutral strings such as ``C-C`` as charged, so only signs that occur
    between ``[`` and ``]`` are considered here.

    Args:
        sm: SMILES string of the molecule.

    Returns:
        bool: True when a ``+`` or ``-`` appears inside a bracket atom.
    """
    inside_bracket = False
    for ch in sm:
        if ch == "[":
            inside_bracket = True
        elif ch == "]":
            inside_bracket = False
        elif inside_bracket and ch in "+-":
            return True
    return False


# Smoke check: a single-carbon SMILES is far below the size cutoff, so this should display False.
too_big("C")

False

In [17]:
# Screen every molecule. The first filter that matches claims the rejection,
# so each tally only counts molecules not already removed by an earlier check.
good_indices = []
num_charged = num_too_big = num_not_organic = 0
for idx in tqdm(range(len(smiles))):
    candidate = smiles[idx]
    if not_organic(candidate):
        num_not_organic += 1
    elif too_big(candidate):
        num_too_big += 1
    elif has_charged_center(candidate):
        num_charged += 1
    else:
        # Passed every filter: remember the row position for later slicing.
        good_indices.append(idx)

100%|██████████| 8362/8362 [00:04<00:00, 1788.65it/s]


In [18]:
# Report how many molecules survived and how many each filter removed.
print(f"Started with: {len(smiles)} samples, ended with: {len(good_indices)} samples")
print(f"{num_not_organic} weren't organic.")
print(f"{num_too_big} were too large.")
print(f"{num_charged} had charged centers.")

Started with: 8362 samples, ended with: 7506 samples
96 weren't organic.
120 were too large.
640 had charged centers.


In [None]:
import pickle

# Keep only the rows whose positions passed every filter.
data = {
    "smiles": smiles[good_indices],
    "sequences": sequences[good_indices],
}
# Use a context manager so the file is flushed and closed even if pickling
# fails; the handle was previously opened and never closed. Plain "wb" is
# enough because the file is only written, never read back here.
with open("Cleaner NIST Dataset.pickle", "wb") as file_to_store:
    pickle.dump(data, file_to_store)