# Generate table values

In [None]:
import pandas as pd
from plotly import express as px
from tqdm import tqdm
import numpy as np

## Load data

In [None]:
# Load each source table and tag every row with the name it is reported under,
# so the frames can be stacked and later grouped by "method".
df_gcdm = pd.read_csv("predictions/unconditional/gcdm_100000_predictions.csv")
df_gcdm["method"] = "GCDM-SBDD"
df_semla = pd.read_csv("predictions/unconditional/semlaflow_100000_predictions.csv")
df_semla["method"] = "SemlaFlow"
df_flowmol = pd.read_csv("predictions/unconditional/molflow_100000_predictions.csv")
df_flowmol["method"] = "FlowMol"
df_train = pd.read_csv("data/unconditional/geom-drugs/train.csv")
df_train["method"] = "GEOM Drugs Training"
# df_val = pd.read_csv("evaluation/truth/val.csv")
# df_val["method"] = "GEOM Drugs Validation"
# df_test = pd.read_csv("evaluation/truth/test.csv")
# df_test["method"] = "GEOM Drugs Testing"
df = pd.concat([df_flowmol, df_gcdm, df_semla, df_train])

# Missing flag values are counted as 0 (i.e. False). Fill with 0.0 rather than
# False: a bool fill value cannot be stored in a float64 column, so depending
# on the pandas version the column is upcast to object dtype, which would undo
# the astype(float) cast.
for flag in ("connected", "chemical", "physical", "fail"):
    df[flag] = df[flag].astype(float).fillna(0.0)

In [None]:
# Columns passed as value_vars when the wide table is melted into long form.
cols = [
    # validity flags
    "fail", "connected", "chemical", "physical",
    # energies
    "ensemble_avg_energy", "mol_pred_energy", "energy_ratio",
    # scores and properties
    "sa", "sa_normalized", "spacial", "qed", "logp", "lipinski",
    # size descriptors
    "num_heavy", "weight", "num_rings",
]


In [None]:
# Wide -> long: one row per (input row, column in `cols`), keyed by "method".
# "smiles" is carried through the melt as an id column and then discarded.
df_long = df.melt(id_vars=["smiles", "method"], value_vars=cols)
df_long = df_long.drop(columns=["smiles"])

In [None]:
# melt() stacks all columns listed in `cols` into the single "value" column;
# cast it to float so the aggregations below operate on numeric data.
df_long["value"] = df_long["value"].astype(float)

In [None]:
# Allow up to 150 rows to be shown when a DataFrame is displayed.
pd.set_option("display.max_rows", 150)

In [None]:
# Summary statistics (describe()) of "value" for every (method, variable) pair.
df_desc = df_long.groupby(["method", "variable"]).describe()

In [None]:
# Swap axes: rows become (statistic, variable) pairs and columns become
# methods. The "count" statistic is dropped before reshaping.
df_tab = (
    df_desc.drop(columns=["count"], level=1)
    .unstack()
    .T.reset_index()
    .rename(columns={"level_1": "agg"})
    .drop(columns=["level_0"])
    .set_index(["agg", "variable"])
)
# Format every cell to 2 decimal places. DataFrame.map replaces the deprecated
# DataFrame.applymap (pandas >= 2.1) and is what the sub-table cells of this
# notebook already use.
df_tab.map("{:.2f}".format)

# Break into subtables? Shown here for MEAN

In [None]:
# Per-method mean of every variable: one row per method, one column per
# variable. This previously called .std() although the table is named and
# presented as the MEAN. Note that the groupby mean skips NaN values.
df_mean = (
    df_long.groupby(["method", "variable"])
    .mean()  # alternative considered by the author: sum() / 1e5
    .unstack()
    .droplevel(0, axis=1)  # drop the constant "value" column level
)
# Remove the "method" label from the row index for cleaner display.
df_mean.index.name = None


In [None]:
# Row labels used to select the generated-molecule rows of the summary tables.
methods = [
    "GCDM-SBDD",
    "SemlaFlow",
    "FlowMol",
]
# Row labels of the reference dataset splits.
data = [
    "GEOM Drugs Training",
    "GEOM Drugs Validation",
    "GEOM Drugs Testing",
]

In [None]:
# Sanity sub-table: the four flag columns for each entry of `methods`,
# rendered with 2 decimals. NOTE: this rebinds the notebook-level `cols`.
cols = [
    "fail",
    "connected",
    "chemical",
    "physical",
]
df_mean.loc[methods, cols].map("{:.2f}".format)

NOTE:
- Check that the mean here is correct and does not silently drop NaN values (pandas skips NaN by default when averaging).
- Every method should have generated 100,000 molecules, so failure rates should be computed relative to this number.


In [None]:
# Descriptor sub-table for each entry of `methods`, rendered with 2 decimals.
# NOTE: this rebinds the notebook-level `cols`.
cols = [
    "num_heavy",
    "weight",
    "num_rings",
    "logp",
]
df_mean.loc[methods, cols].map("{:.2f}".format)

In [None]:
# Molecule-level score sub-table for each entry of `methods`, rendered with
# 2 decimals. NOTE: this rebinds the notebook-level `cols`.
cols = [
    "sa",
    "spacial",
    "qed",
    "lipinski",
]
df_mean.loc[methods, cols].map("{:.2f}".format)

In [None]:
# Conformation-level (pose) score sub-table for each entry of `methods`,
# rendered with 2 decimals. NOTE: this rebinds the notebook-level `cols`.
cols = [
    "ensemble_avg_energy",
    "energy_ratio",
]
df_mean.loc[methods, cols].map("{:.2f}".format)

# Uniqueness and novelty

In [None]:
# training_smiles = set(l.strip() for l in open("evaluation/truth/train.smiles").readlines())
# Distinct SMILES of the rows labelled as the training set, with missing and
# empty entries removed.
training_smiles = set(
    df.loc[df["method"] == "GEOM Drugs Training", "smiles"].dropna()
).difference({None, "", pd.NA, np.nan})

In [None]:
def compute_uniqueness(smiles: list[str]) -> float:
    """Return the fraction of valid SMILES strings that are distinct.

    Entries that are not non-empty strings (``None``, ``""``, ``pd.NA``, any
    NaN float) are treated as missing and ignored.

    Args:
        smiles: SMILES strings, possibly interleaved with missing values. Any
            iterable works, including a pandas Series.

    Returns:
        ``len(set(valid)) / len(valid)``, or ``nan`` when there are no valid
        entries (instead of raising ``ZeroDivisionError``).
    """
    # A type check is used instead of set membership because NaN never compares
    # equal to itself: a NaN that is not the very same object as ``np.nan``
    # would slip past ``s not in {..., np.nan}`` and be counted as valid.
    valid_smiles = [s for s in smiles if isinstance(s, str) and s]
    if not valid_smiles:
        return float("nan")
    return len(set(valid_smiles)) / len(valid_smiles)


def compute_novelty(
    smiles: list[str], training_smiles: set[str] = training_smiles
) -> float:
    """Return the fraction of valid SMILES strings absent from the training set.

    Entries that are not non-empty strings (``None``, ``""``, ``pd.NA``, any
    NaN float) are treated as missing and ignored.

    Args:
        smiles: SMILES strings, possibly interleaved with missing values. Any
            iterable works, including a pandas Series.
        training_smiles: Reference set of SMILES. The default is bound to the
            module-level ``training_smiles`` at definition time.

    Returns:
        ``len(novel) / len(valid)``, or ``nan`` when there are no valid
        entries (instead of raising ``ZeroDivisionError``).
    """
    # A type check is used instead of set membership because NaN never compares
    # equal to itself: a NaN that is not the very same object as ``np.nan``
    # would slip past ``s not in {..., np.nan}`` and be counted as valid.
    valid_smiles = [s for s in smiles if isinstance(s, str) and s]
    if not valid_smiles:
        return float("nan")
    novel_count = sum(1 for s in valid_smiles if s not in training_smiles)
    return novel_count / len(valid_smiles)


In [None]:
# uniqueness - out of the valid predictions, how many unique ones are there?
# Computed separately for every "method" group in df, which also includes the
# training-set rows.
df.groupby("method")["smiles"].apply(compute_uniqueness)

In [None]:
# novelty - out of the valid smiles, how many are not in the training set?
# Computed separately for every "method" group in df, which also includes the
# training-set rows themselves.
df.groupby("method")["smiles"].apply(compute_novelty)