# Data analysis of MNX dataset

The goal is to better understand the reactions contained in the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

In [None]:
from ast import literal_eval

# Load the reaction table. The two SMILES columns are stored in the CSV as the
# text form of Python literals (presumably lists of SMILES strings -- confirm
# against the file), so they are parsed back into Python objects here.
reac = pd.read_csv('mnx_reac_bioreachable.csv')
for col in ['substrates_SMILES', 'products_SMILES']:
    # literal_eval accepts only Python literals, so nothing read from the file
    # can be executed as code (unlike the built-in eval).
    reac[col] = reac[col].apply(literal_eval)
reac

## Measure the difference between substrates and products

1. Take the largest molecule from both substrates and products
2. Compute the maximum common substructure (MCS) of the two molecules
3. Take the larger of the two differences in number of atoms between the MCS and the largest substrate, and between the MCS and the largest product

In [None]:
from rdkit.Chem import MolFromSmiles, MolToSmiles, rdFMCS

def numAtoms(mol):
    """Return the number of atoms in the molecule graph of *mol*.

    Args:
        mol: An RDKit molecule (any object exposing ``GetNumAtoms()``).

    Returns:
        The atom count as an int. Hydrogens are counted only if they are
        present as explicit atoms in the graph.
    """
    # Ask RDKit for the count directly rather than materializing a Python
    # list of every atom just to take its length.
    return mol.GetNumAtoms()

In [None]:
# For each reaction, measure how many atoms of the largest substrate / largest
# product fall outside their maximum common substructure (MCS).
diff_list = []
n_skipped = 0

# Only the first 80 reactions are analysed.
subset = reac.iloc[:80]
smiles_pairs = zip(subset['substrates_SMILES'], subset['products_SMILES'])
for substrates_smi, products_smi in tqdm(smiles_pairs, total=len(subset)):

    # MolFromSmiles returns None for SMILES it cannot parse; drop those so
    # numAtoms is never called on None.
    substrates_mol = [m for m in map(MolFromSmiles, substrates_smi) if m is not None]
    products_mol = [m for m in map(MolFromSmiles, products_smi) if m is not None]

    # max() raises on an empty sequence, so a reaction with no usable molecule
    # on either side cannot be scored and is skipped.
    if not substrates_mol or not products_mol:
        n_skipped += 1
        continue

    substrate_max = max(substrates_mol, key=numAtoms)
    product_max = max(products_mol, key=numAtoms)

    mcs = rdFMCS.FindMCS([substrate_max, product_max])
    # Both sides are compared against the same MCS size, so the larger
    # difference is simply the larger molecule minus the MCS.
    diff = max(numAtoms(substrate_max), numAtoms(product_max)) - mcs.numAtoms

    diff_list.append(diff)

if n_skipped:
    print(f'Skipped {n_skipped} reaction(s) without a parsable substrate or product')

In [None]:
# Histogram of the per-reaction atom differences collected in diff_list.
# Labels are set before the hist call so the cell's last expression (and thus
# its displayed output) is still the value returned by plt.hist.
plt.xlabel('Atoms outside the MCS (larger of substrate / product)')
plt.ylabel('Number of reactions')
plt.hist(diff_list)