In [15]:
def read_smiles_from_file(pathfile: str):
    with open(pathfile) as file:
        return [line.rstrip() for line in file]

In [16]:
guacm_smiles = "/home/nobilm@usi.ch/master_thesis/guacamol/guacamol_v1_train.smiles"
smiles = read_smiles_from_file(guacm_smiles)

In [17]:
from collections import Counter
from rdkit import Chem

c = Counter()
cc = Counter()


def get_atoms_info(mols):
    atoms = set()
    max_num = 0
    for num_mol, m in enumerate(mols):
        if m.GetNumAtoms() > max_num: max_num = m.GetNumAtoms()

        atom_types = [atom.GetSymbol() for atom in m.GetAtoms()]
        bond_types = [bond.GetBondType() for bond in m.GetBonds()]

        c.update(atom_types)
        cc.update(bond_types)
        
        for atom in atom_types:
            atoms.add(atom)
        
    atom2num = {}
    for i, atomType in enumerate(atoms):
        atom2num[str(atomType)] = i

    num2atom = {v:k for k,v in atom2num.items()}
    print("TOTAL NUM OF MOLS: ", num_mol)
    return atom2num, num2atom, max_num

In [18]:
mols = [Chem.MolFromSmiles(smi) for smi in smiles]

In [19]:
atom2num, num2atom, max_num = get_atoms_info(mols)

TOTAL NUM OF MOLS:  1273103


In [20]:
for k,v in c.items(): print(k, 1/v)

C 3.805769363678407e-08
Br 1.6347081228646626e-05
N 2.521717662942678e-07
O 2.571374939894111e-07
S 1.996274950941543e-06
Cl 3.4266994716029414e-06
F 2.0046106043900974e-06
P 4.694835680751174e-05
I 0.0001226391954868776
B 0.00046948356807511736
Si 0.0005192107995846313
Se 0.0007132667617689016


In [21]:
for k,v in cc.items(): print(k, 10000000/v)

SINGLE 0.5474413794296493
AROMATIC 0.561455769468956
DOUBLE 4.079857498737284
TRIPLE 117.53505483010308
