In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import polars as pl
import numpy as np



# Raw reactions table with one row per reaction.
# NOTE(review): the file name ends in ".csv" but it is read with the Parquet
# reader; the later cells use `.list` expressions on its columns, so the
# content is presumably Parquet and only the extension is misleading — confirm
# and rename the file if so.
reactions_path = "../data/statistic/reactions_compounds_list.csv"
df = pl.read_parquet(reactions_path)

In [2]:
# Keep only reactions whose two compound lists each hold at least one entry.
has_starting = pl.col("starting_compounds").list.len() > 0
has_predicted = pl.col("predicted_compounds").list.len() > 0
df_filtered = df.filter(has_starting & has_predicted)

# Explode the list columns one after the other. The two explodes are
# sequential, so each reaction expands into one row for every
# (starting compound, predicted compound) combination rather than pairing
# the lists element by element.
df_exploded = (
    df_filtered
    .explode(['starting_compounds'])
    .explode(['predicted_compounds'])
)

df_exploded

reaction_id,starting_compounds,predicted_compounds
str,str,str
"""R5d8539f1d9a5e857189956bad8eb4…","""Ce8bc5cd3aa30776ab6d35fdc2bcc4…","""C878f017efe6de2805a953d0ca9b84…"
"""R6a89aaf90529aa474f537c081d71f…","""Ce8bc5cd3aa30776ab6d35fdc2bcc4…","""C54130f1c76aaa5380fa631a6a6591…"
"""Rf0c39549766c89963dbb1a98f8f1d…","""Ce8bc5cd3aa30776ab6d35fdc2bcc4…","""Cfa5e885b86c8c37a465cad5238ed6…"
"""Rad931d485dae8cbc9aa07c1301163…","""Ce8bc5cd3aa30776ab6d35fdc2bcc4…","""C57a73b796ef9de341670ad4f89577…"
"""R02b3ad62ed7e42f819c6931a7d392…","""Ce8bc5cd3aa30776ab6d35fdc2bcc4…","""C8e1b680b68eec30be34c6b4857d63…"
…,…,…
"""Rc723ce445a5fba620734ccd14de89…","""C9e207ca3de0a74a8b2c0df254e4df…","""C482f27d0b29b0db769b25f0125ead…"
"""Rb8ddccb3d03c58c136ea291ba8e86…","""Ca9555f122d3c1dbb9dcc330c98bae…","""C70c44600418e9ee2471c9052a4f48…"
"""Rc52c2d326695e43c104ecd4a11b20…","""C92af25813ba38f4e1f385dc92b0e2…","""C5e8022bf3b0f1ee82b5ec45f5c89f…"
"""Rb993c3707a42c88a143334e22d4af…","""C326b3a90bc4bff0d2cf249e40ce51…","""Ca4ef9345ff26e4fec21cc7fb72fa9…"


In [3]:
# Per-column summary statistics of the exploded table.
summary_stats = df_exploded.describe()
print(summary_stats)

# Number of distinct values in each column, printed as "<label> <count>".
for label, column in (
    ("reaction_id", "reaction_id"),
    ("starting compounds: ", "starting_compounds"),
    ("predicted_compounds", "predicted_compounds"),
):
    print(label, df_exploded[column].unique().count())


shape: (9, 4)
┌────────────┬────────────────────────────┬────────────────────────────┬───────────────────────────┐
│ statistic  ┆ reaction_id                ┆ starting_compounds         ┆ predicted_compounds       │
│ ---        ┆ ---                        ┆ ---                        ┆ ---                       │
│ str        ┆ str                        ┆ str                        ┆ str                       │
╞════════════╪════════════════════════════╪════════════════════════════╪═══════════════════════════╡
│ count      ┆ 3455876                    ┆ 3455876                    ┆ 3455876                   │
│ null_count ┆ 0                          ┆ 0                          ┆ 0                         │
│ mean       ┆ null                       ┆ null                       ┆ null                      │
│ std        ┆ null                       ┆ null                       ┆ null                      │
│ min        ┆ R0000002c9105808f4b4eff516 ┆ C0000ef293ecef3b3c5c31093a ┆ C000

In [None]:
# Build a directed graph with one edge per distinct
# (starting compound -> predicted compound) pair.
#
# The exploded table has millions of rows, so the edge list is deduplicated
# inside polars and streamed into networkx as plain tuples instead of
# materialising one Python dict per row. `maintain_order=True` keeps the node
# insertion order (and therefore the seeded layout below) stable across runs.
edge_table = (
    df_exploded
    .select(["starting_compounds", "predicted_compounds"])
    .unique(maintain_order=True)
)

G = nx.DiGraph()
G.add_edges_from(edge_table.iter_rows())

# A force-directed layout and a labelled drawing of the full graph are not
# practical at this size, so only the highest-degree nodes are drawn.
# `G` itself still holds the complete graph for any later analysis.
MAX_DRAWN_NODES = 50   # upper bound on the number of nodes in the figure
LAYOUT_SEED = 42       # fixed seed so the figure is reproducible

nodes_by_degree = sorted(G.degree, key=lambda item: item[1], reverse=True)
G_drawn = G.subgraph([node for node, _ in nodes_by_degree[:MAX_DRAWN_NODES]])

# Compound identifiers are long hash-like strings; show only a short prefix
# so the labels stay legible inside the node markers.
short_labels = {node: str(node)[:8] for node in G_drawn}

# Draw the (sub)graph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G_drawn, seed=LAYOUT_SEED)
nx.draw(
    G_drawn,
    pos,
    labels=short_labels,
    with_labels=True,
    node_size=3000,
    node_color="skyblue",
    font_size=10,
    font_color="black",
    font_weight="bold",
    edge_color="gray",
)
plt.title(
    f"Compound Prediction Network "
    f"({G_drawn.number_of_nodes()} highest-degree of {G.number_of_nodes()} nodes)"
)
plt.show()