## Evaluate fragment linkings

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from plotly import express as px
from tqdm import tqdm

## Load data

In [None]:
# Read every prediction CSV in the directory, tag each row with the method and
# comparison type encoded in the file name, and stack everything into one frame.
pred_dir = Path("/homes/buttensc/Projects/semla-flow/predictions/fragment")
files = list(pred_dir.glob("*.csv"))
dfs = []
for file in tqdm(files):
    # File stem tokens (split on "_"): the first two tokens, plus the fourth
    # token when there are more than four, form the method label; the last
    # token is the comparison type.
    parts = Path(file).stem.split("_")
    suffix = parts[3] if len(parts) > 4 else ""
    df = (
        pd.read_csv(file)
        .assign(method=" ".join([*parts[:2], suffix]), comparison=parts[-1])
        .dropna(subset=["Reference molecule"])  # keep rows with a reference
    )
    dfs.append(df)
df = pd.concat(dfs).sort_values(["method", "Reference molecule"])


In [None]:
# Bare expression: shows the combined predictions frame as the cell output.
df

## Metrics

In [None]:
# Maps metric column names to the display names used for them.
metrics = dict(sucos="SuCOS", tanimoto="ECFP4 Bit Tanimoto")

In [None]:
# Density histograms of the "sucos" column: one facet per comparison type,
# one step outline per method (each normalised independently).
metric = "sucos"
name = metrics[metric]

hist_opts = {
    "bins": 50,
    "common_norm": False,
    "stat": "density",
    "element": "step",
    "fill": False,
}
g = sns.FacetGrid(df, col="comparison", hue="method", height=5, aspect=1.3)
g.map(sns.histplot, metric, **hist_opts)
g.add_legend()

In [None]:
# Density histograms of the "tanimoto" column: one facet per comparison type,
# one step outline per method (each normalised independently).
metric = "tanimoto"
name = metrics[metric]

hist_opts = {
    "bins": 50,
    "common_norm": False,
    "stat": "density",
    "element": "step",
    "fill": False,
}
g = sns.FacetGrid(df, col="comparison", hue="method", height=5, aspect=1.3)
g.map(sns.histplot, metric, **hist_opts)
g.add_legend()

In [None]:
# Bivariate KDE of "num_atoms_cond" against "num_atoms_pred", laid out as a
# grid with one row per comparison type and one column per method.
x_col, y_col = "num_atoms_cond", "num_atoms_pred"
g = sns.FacetGrid(df, row="comparison", col="method", height=3, aspect=1.3)
g.map(sns.kdeplot, x_col, y_col)

In [None]:
## Try pairing the data
# Load the prediction CSVs again, but keep the "linker" and "fragment"
# comparisons in separate frames that share the same
# (smiles_pred, Reference molecule) index so they can be matched up.
pred_dir = Path("/homes/buttensc/Projects/semla-flow/predictions/fragment")
files = list(pred_dir.glob("*.csv"))
dfs_linker, dfs_fragment = [], []
dfs_by_comparison = {"linker": dfs_linker, "fragment": dfs_fragment}
for file in tqdm(files):
    # Parse the current file's name *before* deciding where it goes.
    # Previously `comparison` was taken from a stale `parts` (left over from
    # the previous iteration or an earlier cell), which mis-routed files and
    # raised NameError when this cell was run in a fresh kernel.
    parts = Path(file).stem.split("_")
    comparison = parts[-1]
    target = dfs_by_comparison.get(comparison)
    if target is None:
        # Not a linker/fragment comparison file: skip it without reading.
        continue
    # NOTE: this rebinds the notebook-level name `df` to a per-file frame.
    df = pd.read_csv(file)
    # Method label: first two stem tokens, plus the fourth token when the
    # stem has more than four tokens.
    df["method"] = " ".join(parts[0:2]) + " " + (parts[3] if len(parts) > 4 else "")
    df = df.dropna(subset=["Reference molecule"])
    target.append(df)
df_linker = (
    pd.concat(dfs_linker)
    .sort_values(["method", "Reference molecule"])
    .set_index(["smiles_pred", "Reference molecule"])
)
df_fragment = (
    pd.concat(dfs_fragment)
    .sort_values(["method", "Reference molecule"])
    .set_index(["smiles_pred", "Reference molecule"])
)
