# MINES - Results

## MongoDB (Queries)

### Starting Compounds --> Predicted

To analyse the results, it is more practical to build tables of interest. The aggregation pipelines can be prototyped in MongoDB Compass (the GUI for MongoDB) and then adapted for use here.

In [1]:
# Query 1: for every reaction, collect the _id of its starting compounds
# (among the reactants) and of its predicted compounds (among the products).

from pymongo import MongoClient
import polars as pl

# Display settings: show up to 100 rows and do not truncate the long hash ids
pl.Config.set_tbl_rows(100)
pl.Config.set_fmt_str_lengths(550)

# Stage 1 - keep the reaction _id and reduce every entry of Reactants / Products
# to its element at index 1, the value that is joined against compounds._id below.
pick_compound_ids = {
    '$project': {
        '_id': 1,
        'reactants': {
            '$map': {
                'input': '$Reactants',
                'as': 'reactant',
                'in': {'$arrayElemAt': ['$$reactant', 1]},
            }
        },
        'products': {
            '$map': {
                'input': '$Products',
                'as': 'product',
                'in': {'$arrayElemAt': ['$$product', 1]},
            }
        },
    }
}

# Stages 2 and 3 - fetch the matching documents from the compounds collection
join_reactants = {
    '$lookup': {
        'from': 'compounds',
        'localField': 'reactants',
        'foreignField': '_id',
        'as': 'reactant_details',
    }
}
join_products = {
    '$lookup': {
        'from': 'compounds',
        'localField': 'products',
        'foreignField': '_id',
        'as': 'product_details',
    }
}

# Stage 4 - overwrite reactants / products with slim {id, type} records
slim_compound = {
    'id': '$$detail._id',
    'type': '$$detail.Type',
}
summarise_compounds = {
    '$addFields': {
        'reactants': {
            '$map': {
                'input': '$reactant_details',
                'as': 'detail',
                'in': dict(slim_compound),
            }
        },
        'products': {
            '$map': {
                'input': '$product_details',
                'as': 'detail',
                'in': dict(slim_compound),
            }
        },
    }
}

# Stage 5 - the full joined compound documents are no longer needed
drop_join_results = {
    '$project': {
        'reactant_details': 0,
        'product_details': 0,
    }
}

# Stage 6 - select reactants of type 'Starting Compound' and products of type 'Predicted'
select_by_type = {
    '$addFields': {
        'starting_compounds': {
            '$filter': {
                'input': '$reactants',
                'as': 'reactant',
                'cond': {'$eq': ['$$reactant.type', 'Starting Compound']},
            }
        },
        'predicted_compounds': {
            '$filter': {
                'input': '$products',
                'as': 'product',
                'cond': {'$eq': ['$$product.type', 'Predicted']},
            }
        },
    }
}

# Stage 7 - output: the reaction _id plus the two lists of compound ids
ids_only = {
    '$project': {
        '_id': 1,
        'starting_compounds': '$starting_compounds.id',
        'predicted_compounds': '$predicted_compounds.id',
    }
}

pipeline = [
    pick_compound_ids,
    join_reactants,
    join_products,
    summarise_compounds,
    drop_join_results,
    select_by_type,
    ids_only,
]

client = MongoClient('mongodb://localhost:27017/test-mines')
reactions = client['lotus_mines_enzymatic']['reactions']

# aggregate() hands back a cursor; it is consumed in the next cell
result = reactions.aggregate(pipeline)

In [2]:
%%time

# Collect all results into a list
result_list = list(result)

# Transform the result into a Polars DataFrame
df = pl.DataFrame({
    'reaction_id': [doc['_id'] for doc in result_list],
    'starting_compounds': [doc.get('starting_compounds', []) for doc in result_list],
    'predicted_compounds': [doc.get('predicted_compounds', []) for doc in result_list],
})

# Print the DataFrame
df

CPU times: user 50.8 s, sys: 18 s, total: 1min 8s
Wall time: 7min 15s


reaction_id,starting_compounds,predicted_compounds
str,list[str],list[str]
"""R5d8539f1d9a5e857189956bad8eb48e7b11137a25b12c2a7293fa1b61562e629""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C878f017efe6de2805a953d0ca9b8491274a29290""]"
"""R6a89aaf90529aa474f537c081d71ff584fa0179bdde8269c9ae4b80b2e5c96f7""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C54130f1c76aaa5380fa631a6a659121284978c5d""]"
"""Rf0c39549766c89963dbb1a98f8f1d4b89431cf4ee60f962c7703ef9df5bcb327""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""Cfa5e885b86c8c37a465cad5238ed62672498a45d""]"
"""Rad931d485dae8cbc9aa07c1301163f00258690adddbc7279de2b5c92ebca656f""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C57a73b796ef9de341670ad4f895779c4ce0d4623""]"
"""R02b3ad62ed7e42f819c6931a7d392b90a5478a1791f2d02145fb0d73fc31b151""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C8e1b680b68eec30be34c6b4857d630e2c245759d""]"
"""R74fde49e6e163ead723cc76900847801f1ddf2a6272bd405ab1cba8fa060de97""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""Cc65bdd68f5ca4038b80b1eb2be0d5434f75156dc""]"
"""R03894cee5bc4149f878b0ef38cbf20535406285eaed4fffdf8825fd2da205801""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""Cb969d7dca7f60bdb827dc3efb4fc22789592ef6e""]"
"""Rd7278d127735d1801a3017dba5ae870010a33b029c9c1991907db9edebb67f6c""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C037f60c0477191d54119504e8976d0d64d8c8ce6""]"
"""R57fbe72efaa8f7f21c5a7449fb0719490e751f955ad08d793b1628b65bd5f263""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""Cceca72da2195f029e035132a30c3ae5b2f5b68b0""]"
"""Rfdba4f6c149488605e09370cb664efbd7f4aa514de07bc4808bf5b07b18ad02b""","[""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""]","[""C00eb38722ea74e3671d815c444f48e0db6315491""]"


In [3]:
# Unnest both list columns so that every row holds a single
# (reaction, starting compound, predicted compound) combination
df_exploded = (
    df
    .explode("predicted_compounds")
    .explode("starting_compounds")
)

# Show every reaction row that belongs to one particular starting compound
starting_compound_of_interest = "C9389b0caead11dff9e979f17a67a7e715d61fdcf"
df_exploded.filter(pl.col("starting_compounds") == starting_compound_of_interest)

reaction_id,starting_compounds,predicted_compounds
str,str,str
"""R83c597e7ad3315ddff7b885d95fc564b5532a058da1e1eb7b19ce6c3de894c42""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""C88a4b1c7c5bbd79ad363234a3d4280d241ef901f"""
"""Rd432c21a3dfc629900433a1bb19228cc1a1587b3c889ef006565d3101e8c1e98""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""Cb14dbb55aeecabfddf02b50ddfcf4209b6bc8b89"""
"""R5e91cacb3f110dfae471cc8db54c7767e858cbcbda42ba95a618a3999874034a""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""Cb2aa27c3f54d04c5db9a75ba81989dd7986e8343"""
"""R1776e7ef92a6e7b1d243f3ecb4cf3a385594ef106e218a6e4919bc52a2bb88f8""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""Cacadd5515bba34678cd87a4e49756cf547f66fb3"""
"""R00ce0ad711aa993eb65321fb890a2ddec7d7c30d01475df2bd89a4754a81b4e7""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""Ce686e45a8fce4313f5abd6c881773f82de6f9634"""
"""Rbc5041dd1afc6d1b990a719b95e66847727323688be00411c228d330f0c0f1de""","""C9389b0caead11dff9e979f17a67a7e715d61fdcf""","""C85c4ef956affdf5c9a17c4d3d8c935fd50bec77d"""


In [4]:
# Number of starting compounds per reaction, and the distinct values of that number
df_starting_compounds = df.with_columns(
    pl.col("starting_compounds").list.len().alias("counts_starting_compounds")
)
df_starting_compounds_unique = df_starting_compounds.select(
    pl.col("counts_starting_compounds").unique()
)

# Same for the predicted compounds
df_predicted_compounds = df.with_columns(
    pl.col("predicted_compounds").list.len().alias("counts_predicted_compounds")
)
df_predicted_compounds_unique = df_predicted_compounds.select(
    pl.col("counts_predicted_compounds").unique()
)

# Largest list length found on each side of a reaction
(df_starting_compounds_unique.max(), df_predicted_compounds_unique.max())

(shape: (1, 1)
 ┌───────────────────────────┐
 │ counts_starting_compounds │
 │ ---                       │
 │ u32                       │
 ╞═══════════════════════════╡
 │ 1                         │
 └───────────────────────────┘,
 shape: (1, 1)
 ┌────────────────────────────┐
 │ counts_predicted_compounds │
 │ ---                        │
 │ u32                        │
 ╞════════════════════════════╡
 │ 2                          │
 └────────────────────────────┘)

In [4]:
# Query 2: same reaction table as the first query, but the compounds are reported
# by their `ID` field from the compounds collection instead of by their `_id`.

from pymongo import MongoClient
import polars as pl

# Display settings: show up to 100 rows and do not truncate long strings
pl.Config.set_tbl_rows(100)
pl.Config.set_fmt_str_lengths(550)


client = MongoClient('mongodb://localhost:27017/test-mines')
result = client['lotus_mines_enzymatic']['reactions'].aggregate([
    {
        # Keep the reaction _id and reduce every entry of Reactants / Products to its
        # element at index 1, the value that is joined against compounds._id below.
        '$project': {
            '_id': 1,
            'reactants': {
                '$map': {
                    'input': '$Reactants',
                    'as': 'reactant',
                    'in': {
                        '$arrayElemAt': [
                            '$$reactant', 1
                        ]
                    }
                }
            },
            'products': {
                '$map': {
                    'input': '$Products',
                    'as': 'product',
                    'in': {
                        '$arrayElemAt': [
                            '$$product', 1
                        ]
                    }
                }
            }
        }
    }, {
        # Fetch the compound documents of the reactants
        '$lookup': {
            'from': 'compounds',
            'localField': 'reactants',
            'foreignField': '_id',
            'as': 'reactant_details'
        }
    }, {
        # Fetch the compound documents of the products
        '$lookup': {
            'from': 'compounds',
            'localField': 'products',
            'foreignField': '_id',
            'as': 'product_details'
        }
    }, {
        # Overwrite reactants / products with slim {id, ID, type} records
        '$addFields': {
            'reactants': {
                '$map': {
                    'input': '$reactant_details',
                    'as': 'detail',
                    'in': {
                        'id': '$$detail._id',
                        'ID': '$$detail.ID',
                        'type': '$$detail.Type'
                    }
                }
            },
            'products': {
                '$map': {
                    'input': '$product_details',
                    'as': 'detail',
                    'in': {
                        'id': '$$detail._id',
                        'ID': '$$detail.ID',
                        'type': '$$detail.Type'
                    }
                }
            }
        }
    }, {
        # The full joined compound documents are no longer needed
        '$project': {
            'reactant_details': 0,
            'product_details': 0
        }
    }, {
        # Select reactants of type 'Starting Compound' and products of type 'Predicted'
        '$addFields': {
            'starting_compounds': {
                '$filter': {
                    'input': '$reactants',
                    'as': 'reactant',
                    'cond': {
                        '$eq': [
                            '$$reactant.type', 'Starting Compound'
                        ]
                    }
                }
            },
            'predicted_compounds': {
                '$filter': {
                    'input': '$products',
                    'as': 'product',
                    'cond': {
                        '$eq': [
                            '$$product.type', 'Predicted'
                        ]
                    }
                }
            }
        }
    }, {
        # Output: the reaction _id plus the `ID` values of the selected compounds.
        # `_id` is projected explicitly because the DataFrame cell reads doc['_id'];
        # the former `'ID': 1` entry named a field that the first $project had
        # already removed from the reaction documents, so it selected nothing.
        '$project': {
            '_id': 1,
            'starting_compounds_ID': '$starting_compounds.ID',
            'predicted_compounds_ID': '$predicted_compounds.ID'
        }
    }
])

In [5]:
%%time

# Collect all results into a list
result_list = list(result)

# Transform the result into a Polars DataFrame
df = pl.DataFrame({
    'reaction_id': [doc['_id'] for doc in result_list],
    'starting_compounds_ID': [doc.get('starting_compounds_ID', []) for doc in result_list],
    'predicted_compounds_ID': [doc.get('predicted_compounds_ID', []) for doc in result_list],
})

# Print the DataFrame
df

CPU times: user 45.8 s, sys: 12.3 s, total: 58.1 s
Wall time: 7min 2s


reaction_id,starting_compounds_ID,predicted_compounds_ID
str,list[str],list[str]
"""R5d8539f1d9a5e857189956bad8eb48e7b11137a25b12c2a7293fa1b61562e629""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc1739756""]"
"""R6a89aaf90529aa474f537c081d71ff584fa0179bdde8269c9ae4b80b2e5c96f7""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc1078391""]"
"""Rf0c39549766c89963dbb1a98f8f1d4b89431cf4ee60f962c7703ef9df5bcb327""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc3211736""]"
"""Rad931d485dae8cbc9aa07c1301163f00258690adddbc7279de2b5c92ebca656f""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc1124283""]"
"""R02b3ad62ed7e42f819c6931a7d392b90a5478a1791f2d02145fb0d73fc31b151""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc1823532""]"
"""R74fde49e6e163ead723cc76900847801f1ddf2a6272bd405ab1cba8fa060de97""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc2544386""]"
"""R03894cee5bc4149f878b0ef38cbf20535406285eaed4fffdf8825fd2da205801""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc2378693""]"
"""Rd7278d127735d1801a3017dba5ae870010a33b029c9c1991907db9edebb67f6c""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc0044980""]"
"""R57fbe72efaa8f7f21c5a7449fb0719490e751f955ad08d793b1628b65bd5f263""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc2652334""]"
"""Rfdba4f6c149488605e09370cb664efbd7f4aa514de07bc4808bf5b07b18ad02b""","[""MWERDWMFVGCBLL-JYJNAYRXSA-N""]","[""pkc0011818""]"


In [6]:
# Unnest the starting-compound list first, then the predicted-compound list,
# so that every row pairs one starting compound ID with one predicted compound ID
df_starting_flat = df.explode("starting_compounds_ID")
df_exploded = df_starting_flat.explode("predicted_compounds_ID")

# Show every reaction row that belongs to one particular starting compound
starting_ID_of_interest = "MWERDWMFVGCBLL-JYJNAYRXSA-N"
df_exploded.filter(pl.col("starting_compounds_ID") == starting_ID_of_interest)

reaction_id,starting_compounds_ID,predicted_compounds_ID
str,str,str
"""R5d8539f1d9a5e857189956bad8eb48e7b11137a25b12c2a7293fa1b61562e629""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1739756"""
"""R6a89aaf90529aa474f537c081d71ff584fa0179bdde8269c9ae4b80b2e5c96f7""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1078391"""
"""Rf0c39549766c89963dbb1a98f8f1d4b89431cf4ee60f962c7703ef9df5bcb327""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc3211736"""
"""Rad931d485dae8cbc9aa07c1301163f00258690adddbc7279de2b5c92ebca656f""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1124283"""
"""R02b3ad62ed7e42f819c6931a7d392b90a5478a1791f2d02145fb0d73fc31b151""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1823532"""
"""R74fde49e6e163ead723cc76900847801f1ddf2a6272bd405ab1cba8fa060de97""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2544386"""
"""R03894cee5bc4149f878b0ef38cbf20535406285eaed4fffdf8825fd2da205801""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2378693"""
"""Rd7278d127735d1801a3017dba5ae870010a33b029c9c1991907db9edebb67f6c""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc0044980"""
"""R57fbe72efaa8f7f21c5a7449fb0719490e751f955ad08d793b1628b65bd5f263""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2652334"""
"""Rfdba4f6c149488605e09370cb664efbd7f4aa514de07bc4808bf5b07b18ad02b""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc0011818"""


In [8]:
# One-off export of the exploded reaction / compound-ID table. The target path is the
# same file that is loaded with pl.read_parquet in the test-field cell further down.
# Left commented out - presumably so that re-running the notebook does not overwrite
# the stored file (TODO confirm).
#df_exploded.write_parquet("../data/MINES/reactions_compounds_list_ID.parquet")

In [25]:
### Test field for taxom_finder: unique starting compounds that are identified by an InChIKey

df = pl.read_parquet("../data/MINES/reactions_compounds_list_ID.parquet")

# Keep only rows whose starting compound ID has the InChIKey layout:
# 14 characters, hyphen, 10 characters, hyphen, 1 character
# (e.g. "MWERDWMFVGCBLL-JYJNAYRXSA-N"). The middle block was previously given as
# 11 characters, which no InChIKey satisfies, so the filter could not match any row.
pattern = r'^\w{14}-\w{10}-\w{1}$'
filtered_df = df.filter(pl.col('starting_compounds_ID').str.contains(pattern))

# Build the list from the filtered frame (it was previously taken from the unfiltered
# `df`, which left `filtered_df` unused). Rows with a null ID are dropped by the
# filter as well. NOTE(review): despite its name, this list holds *starting*
# compound IDs; the name is kept so that code relying on it keeps working.
predicted_elements_list = filtered_df["starting_compounds_ID"].unique().to_list()

predicted_elements_list


reaction_id,starting_compounds_ID,predicted_compounds_ID
str,str,str
"""R5d8539f1d9a5e857189956bad8eb48e7b11137a25b12c2a7293fa1b61562e629""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1739756"""
"""R6a89aaf90529aa474f537c081d71ff584fa0179bdde8269c9ae4b80b2e5c96f7""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1078391"""
"""Rf0c39549766c89963dbb1a98f8f1d4b89431cf4ee60f962c7703ef9df5bcb327""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc3211736"""
"""Rad931d485dae8cbc9aa07c1301163f00258690adddbc7279de2b5c92ebca656f""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1124283"""
"""R02b3ad62ed7e42f819c6931a7d392b90a5478a1791f2d02145fb0d73fc31b151""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc1823532"""
"""R74fde49e6e163ead723cc76900847801f1ddf2a6272bd405ab1cba8fa060de97""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2544386"""
"""R03894cee5bc4149f878b0ef38cbf20535406285eaed4fffdf8825fd2da205801""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2378693"""
"""Rd7278d127735d1801a3017dba5ae870010a33b029c9c1991907db9edebb67f6c""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc0044980"""
"""R57fbe72efaa8f7f21c5a7449fb0719490e751f955ad08d793b1628b65bd5f263""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc2652334"""
"""Rfdba4f6c149488605e09370cb664efbd7f4aa514de07bc4808bf5b07b18ad02b""","""MWERDWMFVGCBLL-JYJNAYRXSA-N""","""pkc0011818"""


## Analysis

In [5]:
import matplotlib.pyplot as plt
import polars as pl
import numpy as np

# Compact display for the analysis part: 7 rows per table, long strings left untruncated
pl.Config.set_tbl_rows(7)
pl.Config.set_fmt_str_lengths(550)

# Load the stored reaction table (list columns: starting_compounds, predicted_compounds)
reactions_parquet = "../data/MINES/reactions_compounds_list.parquet"
df = pl.read_parquet(reactions_parquet)

In [6]:
# Keep only reactions that have at least one starting compound
# and at least one predicted compound
has_starting = pl.col("starting_compounds").list.len() > 0
has_predicted = pl.col("predicted_compounds").list.len() > 0
df_filtered = df.filter(has_starting & has_predicted)

# Unnest one list column after the other; a reaction ends up with
# one row per (starting compound, predicted compound) combination
df_exploded = (
    df_filtered
    .explode("starting_compounds")
    .explode("predicted_compounds")
)

df_exploded

reaction_id,starting_compounds,predicted_compounds
str,str,str
"""R5d8539f1d9a5e857189956bad8eb48e7b11137a25b12c2a7293fa1b61562e629""","""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""","""C878f017efe6de2805a953d0ca9b8491274a29290"""
"""R6a89aaf90529aa474f537c081d71ff584fa0179bdde8269c9ae4b80b2e5c96f7""","""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""","""C54130f1c76aaa5380fa631a6a659121284978c5d"""
"""Rf0c39549766c89963dbb1a98f8f1d4b89431cf4ee60f962c7703ef9df5bcb327""","""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""","""Cfa5e885b86c8c37a465cad5238ed62672498a45d"""
"""Rad931d485dae8cbc9aa07c1301163f00258690adddbc7279de2b5c92ebca656f""","""Ce8bc5cd3aa30776ab6d35fdc2bcc4707f4ac2919""","""C57a73b796ef9de341670ad4f895779c4ce0d4623"""
…,…,…
"""Rc52c2d326695e43c104ecd4a11b20ead83987b49c39bbc0c165c7d25b8fef8b5""","""C92af25813ba38f4e1f385dc92b0e253dc7a01f6c""","""C5e8022bf3b0f1ee82b5ec45f5c89f69ea2ab5a29"""
"""Rb993c3707a42c88a143334e22d4af7eacf0dde11657106b272262f033e17d424""","""C326b3a90bc4bff0d2cf249e40ce517af77a62d0d""","""Ca4ef9345ff26e4fec21cc7fb72fa9dca2f86fcc5"""
"""Rc1fdf76ef8f34b7c3a7538fcb9924dd3a44615fd990b2da338cc61b791734e6d""","""C62f5460835a65fcb75c6c8da0f7f6f90c060d900""","""Ce9a6758cc6c58ca618292cde869df568a1c34cb8"""


In [7]:
# Summary statistics of the exploded table
summary = df_exploded.describe()
print(summary)

# Number of distinct (non-null) values in each of the three columns
labelled_columns = (
    ("reaction_id:", "reaction_id"),
    ("starting compounds: ", "starting_compounds"),
    ("predicted_compounds:", "predicted_compounds"),
)
for label, column in labelled_columns:
    print(label, df_exploded.get_column(column).unique().count())


shape: (9, 4)
┌────────────┬────────────────────────────┬────────────────────────────┬───────────────────────────┐
│ statistic  ┆ reaction_id                ┆ starting_compounds         ┆ predicted_compounds       │
│ ---        ┆ ---                        ┆ ---                        ┆ ---                       │
│ str        ┆ str                        ┆ str                        ┆ str                       │
╞════════════╪════════════════════════════╪════════════════════════════╪═══════════════════════════╡
│ count      ┆ 3455876                    ┆ 3455876                    ┆ 3455876                   │
│ null_count ┆ 0                          ┆ 0                          ┆ 0                         │
│ mean       ┆ null                       ┆ null                       ┆ null                      │
│ std        ┆ null                       ┆ null                       ┆ null                      │
│ …          ┆ …                          ┆ …                          ┆ …   