In [3]:
from defl import * 

# change the configsetting, to see the full tables
pl.Config.set_tbl_rows(7)
pl.Config(fmt_str_lengths=550)

df = pl.read_csv("../data/LOTUS/230106_frozen_metadata_inchy_smile.csv")

In [2]:
df

id,smiles
str,str
"""XJOOMMHNYOJWCZ-UKRRQHHQSA-N""","""CC1=C[C@@H]2c3cccc4[nH]cc(c34)C[C@H]2N(C)C1"""
"""SFCYVTIQMNZUCZ-UHFFFAOYSA-N""","""C=C(C)C#Cc1cc(C=O)ccc1O"""
"""OYZXDVPSGCKVOQ-UQBPGWFLSA-N""","""COC(=O)[C@]12CCCC(C)(C)[C@@H]1CCc1cc(C(C)C)c(OC(C)=O)c(OC(C)=O)c12"""
"""MSSOSOXUURLBHN-UHFFFAOYSA-N""","""CCC(=O)OC1C2C(OC(=O)c3ccccc3)C34OC2(C)COC(=O)c2cccnc2C(C)C(C)C(=O)OC(C(O)C(OC(=O)c2ccccc2)C3(COC(C)=O)C1OC(C)=O)C4(C)O"""
…,…
"""QJFRZMZUUGXTJG-WECLKIJSSA-N""","""C[C@@H]1NC(=O)[C@@H]2[C@H](O)[C@H](O)C=NN2C(=O)COC(=O)[C@](C)(CO)NC(=O)[C@@H]2CCC=NN2C(=O)[C@H]2C[C@H](Cl)CNN2C1=O"""
"""YUSYSJSHVJULID-AATRIKPKSA-N""","""COc1ccc(/C=C/c2cc(OC)c(OC)c(OC)c2)c(O)c1O"""
"""AGJPNPDYGCAODE-JLNKQSITSA-N""","""CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCCCCCCCC(=O)c1ccc(CO)[nH]1"""


In [3]:
print(f"""
Unique over all: {df.unique().shape[0]}
Unique ID: {df["id"].unique().shape[0]}
Unique smiles: {df["smiles"].unique().shape[0]}
""")


Unique over all: 220823
Unique ID: 220823
Unique smiles: 220820



## rulesets

### metacyc generalized rules
path: "../../MINE-Database/mine_database/data/metacyc_rules/metacyc_generalized_rules.tsv"

### Enyzmatic rules
path: "../../MINE-Database/mine_database/data/original_rules/EnzymaticReactionRules.tsv"

In [2]:
df_rules_enzymatic = pl.read_csv("../../MINE-Database/mine_database/data/original_rules/EnzymaticReactionRules.tsv", separator="\t")

df_rules_enzymatic

Name,Reactants,SMARTS,Products,Comments
str,str,str,str,str
"""1.1.1_01""","""cpd00003;Any""","""[#6;H1D3:4]=[#6;H0D3:3][#6;H1D…","""cpd00067;Any;cpd00004""",
"""1.1.1_02""","""Any;cpd00003""","""[#1;D1R0:2][#8;H1D2R0:3][#6:4]…","""cpd00067;cpd00011;Any;cpd00004""",
"""1.1.1_02.rev""","""Any;cpd00004;cpd00011;cpd00067""","""[#1;D1R0:1][#6:4][#6:3]=[#8;H0…","""cpd00003;Any""",
"""1.1.1_03""","""Any;cpd00001;cpd00003""","""[#1;D1R0:11][#7:3]1[#6:2]=[#7:…","""cpd00067;cpd00004;Any""",
"""1.1.1_04""","""Any;cpd00003""","""[#1;D1R0:9][#8;H1D2:10][#6:6]1…","""cpd00004;cpd00067;Any""",
…,…,…,…,…
"""5.3.99_01""","""Any""","""[#1;D1R0:12][#8;H1D2R0:9][#6;H…","""Any""",
"""5.3.99_02""","""Any""","""[#1;D1R0:12][#8;H1D2R0:9][#6;H…","""Any""",
"""5.4.2_01""","""Any""","""[#1;D1R0:11][#8;H1D2R0:10][#6;…","""Any""",
"""5.4.2_01.rev""","""Any""","""[#1;D1R0:11][#8;H1D2R0:4][#6:3…","""Any""",


## Atomcount for ruleset

In [3]:
import polars as pl
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import Counter



# Function to get atom counts and add to an existing Counter
def get_atom_counts(mol, existing_counts=None):
    if existing_counts is None:
        existing_counts = Counter()
    
    for atom in mol.GetAtoms():
        atom_symbol = atom.GetSymbol()
        existing_counts[atom_symbol] += 1
    
    return existing_counts

# Function to analyze a single SMARTS reaction
def analyze_reaction(smarts, reactant_atom_counts=None, product_atom_counts=None):
    reaction = AllChem.ReactionFromSmarts(smarts)
    
    if reactant_atom_counts is None:
        reactant_atom_counts = Counter()
    
    for reactant in reaction.GetReactants():
        mol = reactant
        reactant_atom_counts += get_atom_counts(mol, existing_counts=reactant_atom_counts)

    if product_atom_counts is None:
        product_atom_counts = Counter()
        
    for product in reaction.GetProducts():
        mol = product
        product_atom_counts += get_atom_counts(mol, existing_counts=product_atom_counts)
    
    return reactant_atom_counts, product_atom_counts


counter_reactants = Counter()
counter_products = Counter()

for smart in df_rules_enzymatic.select(pl.col("SMARTS")).iter_rows():
    smart = smart[0]
    counter_reactants, counter_products = analyze_reaction(smart, reactant_atom_counts=counter_reactants, product_atom_counts=counter_products)




In [4]:
counter_reactants, counter_products

(Counter({'C': 2198077409439858973192230400497770840897956004375209484262698314434307685984364336960469200266731418044804859220117005832597154124724817869991056443777993408779722,
          'H': 839045691602948208465233301240667858507757429570710891629314537699328401001677553465126169411597028185598380445460321577576631576295751663331829089854417944087598,
          'O': 606695509455423791419988145775326641945585742417027464217582006868080203827815682242836581040892617711860299383529401745220877968006428054375443854852168695477458,
          'N': 527824547548127435443650765614557380943524550288679082484066651870375459769610842225807129790097631299445776831702514530430803949285201969481700059392354438414336,
          '*': 449891379628218579882281256069771053250893523824235088359912943468343715836912830873329271734851243346469645150914588626006576656855900998215778582465576341676544,
          'S': 35961444702682067667160027190009988157295932130127512930082476006086676038684142271915632

In [6]:
all_counter_reactants = sum(list(counter_reactants.values()))
all_counter_products = sum(list(counter_products.values()))

In [10]:
print("--reactants--")
for key, value in counter_reactants.items():
    percentage = value/all_counter_reactants*100
    print(f'{key} - {percentage}')

--reactants--
C - 47.561635459417516
N - 11.420980266301624
H - 18.155131910474303
O - 13.127577096084961
* - 9.734675267721595
S - 7.781277930420234e-36
P - 1.4154068453457778e-47
Cl - 1.3451247425385068e-75


In [11]:
print("--products--")
for key, value in counter_reactants.items():
    percentage = value/all_counter_products*100
    print(f'{key} - {percentage}')

--products--
C - 25.20202693981058
N - 6.051765242513038
H - 9.620067078937906
O - 6.956059194227408
* - 5.158223555133231
S - 4.1231546000122755e-36
P - 7.49997789239888e-48
Cl - 7.127566087963198e-76
