### Imports

In [1]:
from rdkit import Chem

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG=True

import gzip
import pickle

In [6]:
import random

Create iterators for all active and all inactive molecules.

Forward-only iterators are required because the files are gzip-compressed (`.sdf.gz`) and are therefore read as sequential streams.

In [2]:
# Open each gzip-compressed SD file as a stream and wrap it in a forward-only
# RDKit supplier. The stream objects are kept under their own names so they
# can be closed explicitly once the suppliers have been consumed.
# NOTE(review): nothing in the visible cells closes these gzip handles.
# NOTE(review): input is read from "./data/all_active_all_inactive/" while the
# pickles are later written to "./data/0_all_active_all_inactive/" -- confirm
# that both directories are intended.
active_sdf_stream = gzip.open("./data/all_active_all_inactive/active.sdf.gz")
active_molecules_iterator = Chem.ForwardSDMolSupplier(active_sdf_stream)

inactive_sdf_stream = gzip.open("./data/all_active_all_inactive/inactive.sdf.gz")
inactive_molecules_iterator = Chem.ForwardSDMolSupplier(inactive_sdf_stream)

Create Python lists of RDKit molecule objects for future processing, skipping any entries that come back as `None`.

In [3]:
# Drain each supplier into a list, dropping entries that are None
# (presumably records RDKit could not parse -- verify against RDKit docs).
# The suppliers are forward-only, so this cell can only be run once per
# supplier instance; re-running it without re-creating them yields empty lists.
active_mols = list(filter(lambda mol: mol is not None, active_molecules_iterator))
inactive_mols = list(filter(lambda mol: mol is not None, inactive_molecules_iterator))

In [4]:
# Report how many molecules ended up in each list (one line per list).
print(
    f"len of active_mols: {len(active_mols)}",
    f"len of inactive_mols: {len(inactive_mols)}",
    sep="\n",
)

len of active_mols: 366
len of inactive_mols: 59422


Save the lists as `.pickle` files in the corresponding folders for future use.

In [5]:
# Persist both molecule lists as pickle files for later notebooks.
# The target directory must already exist; open() does not create it.
# NOTE(review): depending on RDKit's default pickle settings, molecule
# properties (e.g. SD tags) may not be stored in the pickle -- confirm whether
# Chem.SetDefaultPickleProperties(...) needs to be called beforehand.
all_active_path = "./data/0_all_active_all_inactive/all_active.pickle"
all_inactive_path = "./data/0_all_active_all_inactive/all_inactive.pickle"

with open(all_active_path, "wb") as active_out:
    pickle.dump(active_mols, active_out)

with open(all_inactive_path, "wb") as inactive_out:
    pickle.dump(inactive_mols, inactive_out)

Select 366 random inactive molecules (one per active molecule), keeping track of which indices were chosen, just in case.

In [9]:
# Draw as many inactive molecules as there are active ones, without
# replacement, so that the two classes are balanced.
#
# A dedicated, seeded generator is used so that "Restart & Run All" always
# selects the same subset; the chosen indices are therefore reproducible
# from the seed alone and the global `random` state is left untouched.
RANDOM_SEED = 42
_sampling_rng = random.Random(RANDOM_SEED)

# Index -> molecule mapping; kept because later cells may rely on this name.
inactive_mols_dict = dict(enumerate(inactive_mols))

# Indices into `inactive_mols` of the selected molecules. Sampling from a
# range avoids building a throwaway list of keys. random.sample raises
# ValueError if more items are requested than the population holds.
inactive_mols_random_keys = _sampling_rng.sample(
    range(len(inactive_mols)), len(active_mols)
)

# The selected molecules, in the order their indices were drawn.
random_inactive_molecules = [inactive_mols[k] for k in inactive_mols_random_keys]

In [12]:
# Persist the randomly selected inactive molecules next to the active set.
# The target directory must already exist; open() does not create it.
random_inactive_path = "./data/1_all_active_random_inactive/random_inactive.pickle"

with open(random_inactive_path, "wb") as random_inactive_out:
    pickle.dump(random_inactive_molecules, random_inactive_out)