# In [None]:
from rdkit.Chem import rdmolops
import os
from rdkit import Chem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG

def moltosvg(mol, molSize = (300,300), kekulize = True):
    """Render a molecule as SVG text.

    The input molecule is never modified: all work is done on a copy
    rebuilt from ``mol.ToBinary()``.

    Args:
        mol: Molecule to draw; must provide ``ToBinary()``.
        molSize: ``(width, height)`` passed to the SVG drawer.
        kekulize: When true, try to kekulize the copy before drawing. If
            kekulization fails, the molecule is drawn un-kekulized instead.

    Returns:
        The drawing text with every ``'svg:'`` substring removed.
    """
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Best-effort: fall back to a fresh copy so a failed attempt
            # cannot leave a partially modified molecule behind. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            mc = Chem.Mol(mol.ToBinary())
    # Only generate 2D coordinates when the molecule carries no conformer.
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # Strip the 'svg:' prefix from the generated markup.
    return svg.replace('svg:','')

def mol_with_atom_index(mol):
    """Set each atom's map number to its own index and return ``mol``.

    The molecule is modified in place; the same object is returned.
    """
    atoms = mol.GetAtoms()
    for current_atom in atoms:
        index = current_atom.GetIdx()
        current_atom.SetAtomMapNum(index)
    return mol

def get_components(mol):
    """Count the fragments of ``mol`` and measure the largest one.

    Returns:
        A ``(fragment_count, largest_atom_count)`` tuple. The largest
        fragment is the one with the highest ``GetNumAtoms()``; if no
        fragments come back, ``mol`` itself is used in its place.
    """
    fragments = rdmolops.GetMolFrags(mol, asMols=True)
    biggest = mol
    if fragments:
        biggest = max(fragments, key=lambda frag: frag.GetNumAtoms())
    fragment_count = len(fragments)
    return fragment_count, len(biggest.GetAtoms())

#mol = Chem.MolFromPDBFile("/Users/padr/repos/linking/datasets/raw/refined-set/1a1e/1a1e_pocket.pdb")
#SVG(moltosvg(mol))

#graph = mol_to_complete_graph(mol, explicit_hydrogens=False, node_featurizer=CanonicalAtomFeaturizer, edge_featurizer=CanonicalBondFeaturizer)

# Collect the full path of every file under `dir` whose name ends in
# 'protein.pdb', skipping files whose prefix (the part of the name before the
# first "_") is listed in `bad_data`.
bad_data = ["1g7v", "1r1h", "2a5b", "2zjw", "1cps", "4abd"]
files_to_process = []
# NOTE(review): `dir` is not assigned anywhere in the visible code, so it
# resolves to the builtin `dir` function and os.walk() will raise TypeError.
# Presumably it should be the dataset root directory -- confirm and define it
# (ideally under a name that does not shadow the builtin).
for path, dirs, files in os.walk(dir):
    for file in files:
        if file.endswith('protein.pdb') and file.split("_")[0] not in bad_data:
            # os.path.join avoids a doubled separator when `path` already
            # ends with one.
            full_path = os.path.join(path, file)
            files_to_process.append(full_path)

# Parse each collected PDB file and report its fragment statistics.
# NOTE(review): `graphs` is initialised here but not filled by the visible code.
graphs = []
total = len(files_to_process)
print("Starting to process " + str(total) + " files...")
i = 0  # stays 0 (as before) when there are no files to process
# Sorted so the progress output comes out in a stable order.
for i, path in enumerate(sorted(files_to_process), start=1):
    mol = Chem.MolFromPDBFile(path)
    progress = "({}%) File {}".format(int(100 * i / total), os.path.basename(path))
    if mol is None:
        # MolFromPDBFile returns None when the file cannot be turned into a
        # molecule; report it and move on instead of crashing.
        print(progress + " could not be parsed, skipping.")
        continue
    # get_components returns a (fragment_count, largest_size) tuple, which
    # cannot be concatenated to a str directly -- unpack and format it.
    num_fragments, largest_size = get_components(mol)
    print(
        progress + " has {} fragments and the largest is of size {}".format(num_fragments, largest_size)
    )