In [4]:
import pickle
from random import shuffle, seed
import numpy as np

In [2]:
seed(13)

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open("Cleaner NIST Dataset.pickle", "rb") as file_to_read:
    d = pickle.load(file_to_read)
smiles = d["smiles"]
sequences = d["sequences"]
# Pair each SMILES entry with its data sequence
dataset = list(zip(smiles, sequences))
shuffle(dataset)

# Separate compounds that occur more than once so that repeats aren't distributed across folds.
# Every SMILES entry is counted once up front instead of rescanning the full list per entry.
# (Requires the SMILES entries to be hashable - presumably strings; verify against the pickle.)
occurrence_counts = {}
for smiles_entry in smiles:
    occurrence_counts[smiles_entry] = occurrence_counts.get(smiles_entry, 0) + 1
single_occurence_molecules = [x for x in dataset if occurrence_counts[x[0]] <= 1]
multiple_occurence_molecules = [x for x in dataset if occurrence_counts[x[0]] > 1]

# Create folds: each fold starts with an equal-sized slice of the single-occurrence molecules
folds = {}
fold_size = len(single_occurence_molecules) // 5
for i in range(1, 6):
    folds[i] = single_occurence_molecules[((i - 1) * fold_size):(i * fold_size)]
# Add whatever wasn't added from single occurrences to the end of multiple occurrences
multiple_occurence_molecules += single_occurence_molecules[(5 * fold_size):]
# Not read by the code in this cell; kept in case a later cell uses it.
mult_fold_size = len(multiple_occurence_molecules) // 5

# Group the remaining molecules by SMILES, keeping groups in order of first appearance
# and preserving the order of the entries inside each group.
group_position = {}
molecule_groups = []
for molecule in multiple_occurence_molecules:
    if molecule[0] not in group_position:
        group_position[molecule[0]] = len(molecule_groups)
        molecule_groups.append([])
    molecule_groups[group_position[molecule[0]]].append(molecule)

# Hand the groups out round-robin (fold 1, 2, ..., 5, 1, ...) so that all repeat
# occurrences of a compound always land in the same fold.
current_fold = 0
for group_number, group in enumerate(molecule_groups):
    current_fold = group_number % 5 + 1
    folds[current_fold].extend(group)

# Every remaining molecule has now been placed in a fold; leave the work list empty.
multiple_occurence_molecules = []

In [5]:
def normalize(s):
    """Scale the input series so that its maximum value becomes 1.

    Args:
        s: Iterable of numbers.

    Returns:
        A new list in which every element has been multiplied by
        ``1 / max(s)``. When the maximum is 0 every element is multiplied
        by 0 instead, and an empty input yields an empty list.
    """
    values = list(s)
    if not values:
        # max() of an empty sequence would raise; there is nothing to scale.
        return []
    maxval = max(values)
    # Test for zero BEFORE dividing, otherwise an all-zero series of plain
    # Python numbers raises ZeroDivisionError.
    scale = 0 if maxval == 0 else 1 / maxval
    return [j * scale for j in values]

def normal_many(x):
    """Normalize each series in ``x`` and zero out its small values.

    Every series is passed through ``normalize`` and the result through
    ``floor_out``; the processed series are returned as a list in the
    same order as the input.
    """
    processed = []
    for series in x:
        scaled = normalize(series)
        processed.append(floor_out(scaled))
    return processed

def floor_out(x):
    """Return a list copy of ``x`` where values not above 0.01 become 0.

    Elements strictly greater than 0.01 are kept unchanged; every other
    element is replaced by the integer 0.
    """
    kept = []
    for value in x:
        if value > 0.01:
            kept.append(value)
        else:
            kept.append(0)
    return kept

# Build the five cross-validation splits: split i holds out fold i for testing
# and trains on the concatenation of all other folds.
dataset_splits = {}
for i in range(1, 6):
    test = folds[i]

    # Concatenate every fold except the held-out one, in fold order
    train = []
    for x in range(1, 6):
        if x == i:
            continue
        train.extend(folds[x])

    # Each entry is a pair: index 0 is the SMILES value, index 1 its sequence
    dataset_splits[i] = {
        "test_smiles": np.array([pair[0] for pair in test]),
        "test_y": np.array(normal_many([pair[1] for pair in test]), dtype=float),
        "train_smiles": np.array([pair[0] for pair in train]),
        "train_y": np.array(normal_many([pair[1] for pair in train]), dtype=float),
    }

