In [None]:
import os
import csv
import networkx as nx

from random import shuffle

from IPython.display import display, SVG

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Draw

from dataset import get_dataset, preprocess

In [None]:
# Root directories for input data and produced artifacts are taken from the
# environment so the notebook carries no machine-specific absolute paths.
DATA = os.getenv('DATA')
ARTIFACTS = os.getenv('ARTIFACTS')

# os.getenv returns None for an unset variable, and os.path.join(None, ...)
# would then fail with an opaque TypeError; fail here with a clear message.
_missing_env = [name
                for name, value in (('DATA', DATA), ('ARTIFACTS', ARTIFACTS))
                if value is None]
if _missing_env:
    raise RuntimeError(
        "Required environment variable(s) not set: {}".format(
            ", ".join(_missing_env)))

# Every path used by this notebook lives in a sub-directory named after it.
this_name = 'exaLearnMol'
data_path = os.path.join(DATA, this_name)
artifact_path = os.path.join(ARTIFACTS, this_name)

In [None]:
# `get_dataset.main` is given the data directory and hands back the path of the
# dataset file (presumably fetching it first if needed -- confirm in `dataset`).
data_filepath = get_dataset.main(data_path)

# `preprocess.main` yields two collections; they are indexed with the same
# positions further down, so they are assumed to be aligned element-wise.
logp, smiles = preprocess.main(data_filepath)

num_points = len(logp)
print("{} data points".format(num_points))

In [None]:
def mol_to_nx(mol):
    """Convert an RDKit molecule into a NetworkX graph.

    Each atom becomes a node keyed by its atom index and annotated with
    ``symbol``, ``formal_charge``, ``implicit_valence``, ``ring_atom``,
    ``degree`` and ``hybridization``. Each bond becomes an edge between its
    begin and end atom indices, annotated with ``bond_type``.

    Args:
        mol: Molecule object exposing ``GetAtoms()`` and ``GetBonds()``
            (e.g. the result of ``Chem.MolFromSmiles``).

    Returns:
        nx.Graph: Graph with one node per atom and one edge per bond.

    Raises:
        ValueError: If ``mol`` is ``None``, which is what
            ``Chem.MolFromSmiles`` returns when a SMILES string cannot be
            parsed.
    """
    # Guard first so that an unparsable SMILES surfaces as a clear error
    # instead of an AttributeError on NoneType deep inside the loop.
    if mol is None:
        raise ValueError(
            "mol is None; the SMILES string could not be parsed into a molecule")

    G = nx.Graph()

    for atom in mol.GetAtoms():
        G.add_node(atom.GetIdx(),
                   symbol=atom.GetSymbol(),
                   formal_charge=atom.GetFormalCharge(),
                   implicit_valence=atom.GetImplicitValence(),
                   ring_atom=atom.IsInRing(),
                   degree=atom.GetDegree(),
                   hybridization=atom.GetHybridization())
    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(),
                   bond.GetEndAtomIdx(),
                   bond_type=bond.GetBondType())
    return G

In [None]:
# Inspect one hard-coded dataset entry: show its logP value and SMILES string,
# then parse the SMILES into an RDKit molecule.
idx = 9433
lp, sm = logp[idx], smiles[idx]
print(lp, sm)

# NOTE: MolFromSmiles returns None (rather than raising) if `sm` cannot be parsed.
mol = Chem.MolFromSmiles(sm)

In [None]:
# Randomly ordered list of every dataset position. It is only consumed by the
# optional random-sample selection shown in the comment below.
sample_idx = list(range(len(logp)))
shuffle(sample_idx)

# Hand-picked positions to display. For a random sample instead, use:
# indices = sample_idx[:20]
indices = [0, 100000, 200000, 240000, 249000, 249452]

# Parse the chosen SMILES strings and build matching legend labels from logP.
my_molecules = list(map(Chem.MolFromSmiles, (smiles[i] for i in indices)))
my_logp = list(map(str, (logp[i] for i in indices)))

# Last expression of the cell, so the notebook renders the returned image.
Draw.MolsToGridImage(
    my_molecules,
    molsPerRow=3,
    subImgSize=(300, 300),
    legends=my_logp,
    useSVG=False,
)
