# Plot distributions

In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from plotly import express as px
from tqdm import tqdm

## Load data

In [None]:
# Read one CSV per source and tag every row with a "method" label, so the
# stacked frame can later be split by that column when plotting.
df_gcdm = pd.read_csv(
    "predictions/unconditional/gcdm_100000_predictions.csv"
).assign(method="GCDM-SBDD")
df_semla = pd.read_csv(
    "predictions/unconditional/semlaflow_100000_predictions.csv"
).assign(method="SemlaFlow")
df_flowmol = pd.read_csv(
    "predictions/unconditional/molflow_100000_predictions.csv"
).assign(method="FlowMol")
df_train = pd.read_csv(
    "data/unconditional/geom-drugs/train.csv"
).assign(method="GEOM Drugs Training")

# The validation and test splits are currently left out of the comparison:
# df_val = pd.read_csv("predictions_data/val.csv").assign(method="GEOM Drugs Validation")
# df_test = pd.read_csv("predictions_data/test.csv").assign(method="GEOM Drugs Testing")

# Stack the frames row-wise. Each frame keeps its own index here, so index
# labels repeat across methods in the combined frame.
df = pd.concat([df_flowmol, df_gcdm, df_semla, df_train])

## Name metrics

In [None]:
# Maps each metric column name used for plotting to the human-readable label
# shown as the plot title and x-axis label.
metrics = {
    "ensemble_avg_energy": "Ensemble Average Energy",
    "mol_pred_energy": "Molecular Prediction Energy",
    "energy_ratio": "Energy Ratio",
    "sa": "Synthetic Accessibility Score",
    "sa_normalized": "Synthetic Accessibility Score (normalized)",
    # NOTE(review): "Spacial" is left unchanged — presumably it follows the
    # published name of the metric rather than being a typo; confirm.
    "spacial": "Spacial Score",
    "qed": "Quantitative Estimation of Drug-likeness",
    "logp": "LogP",
    "lipinski": "Lipinski Rule of 5",
    "num_heavy": "Number of Heavy Atoms",
    "weight": "Molecular Weight",
    "num_rings": "Number of Rings",
}

## Plot one metric at a time

In [None]:
# seaborn: per-method density histogram of the "sa" metric.

metric = "sa"
name = metrics[metric]

fig, ax = plt.subplots()
sns.histplot(
    # The concatenated frame repeats index labels across methods, so hand
    # seaborn a copy with a fresh 0..n-1 index.
    df[["method", metric]].reset_index(drop=True),
    x=metric,
    hue="method",
    bins=100,
    cumulative=False,
    common_norm=False,  # normalise each method on its own, not jointly
    stat="density",
    element="step",
    ax=ax,
)
# The trailing semicolon keeps the return value out of the cell output.
ax.set(title=name, xlabel=name, xlim=(0, 10));

In [None]:
# seaborn: per-method density histogram of the "spacial" metric.

metric = "spacial"
name = metrics[metric]

fig, ax = plt.subplots()
sns.histplot(
    # The concatenated frame repeats index labels across methods, so hand
    # seaborn a copy with a fresh 0..n-1 index.
    df[["method", metric]].reset_index(drop=True),
    x=metric,
    hue="method",
    bins=100,
    cumulative=False,
    common_norm=False,  # normalise each method on its own, not jointly
    stat="density",
    element="step",
    ax=ax,
)
# The trailing semicolon keeps the return value out of the cell output.
ax.set(title=name, xlabel=name, xlim=(0, 120));

In [None]:
# seaborn: per-method density histogram of the "qed" metric.

metric = "qed"
name = metrics[metric]

fig, ax = plt.subplots()
sns.histplot(
    # The concatenated frame repeats index labels across methods, so hand
    # seaborn a copy with a fresh 0..n-1 index.
    df[["method", metric]].reset_index(drop=True),
    x=metric,
    hue="method",
    bins=100,
    cumulative=False,
    common_norm=False,  # normalise each method on its own, not jointly
    stat="density",
    element="step",
    ax=ax,
)
# The trailing semicolon keeps the return value out of the cell output.
ax.set(title=name, xlabel=name, xlim=(0, 1));

In [None]:
# seaborn: per-method density of the energy ratio on a logarithmic x-axis.

metric = "energy_ratio"
name = "Energy Ratio"

# Two-column view with a fresh index for seaborn.
plot_frame = df.loc[:, ["method", metric]].reset_index(drop=True)
histogram_options = dict(
    x=metric,
    hue="method",
    cumulative=False,
    common_norm=False,
    stat="density",
    element="step",
    log_scale=(metric == "energy_ratio"),
)
sns.histplot(plot_frame, **histogram_options)

plt.title(name)
plt.xlabel(name)
plt.xlim(0.5, 20)
# To write the figure to disk instead of showing it inline:
# plt.savefig(f"plots/plot_{metric}.png")
# plt.close()

## Plot and save all

In [None]:
# Render one per-method density histogram for every entry in `metrics` and
# write each figure to its own PNG file.
#
# The folder name was previously spelled "unconditonal"; it now matches the
# "unconditional" spelling used by the input paths.
plot_dir = Path("plots/unconditional")
# savefig does not create missing folders, so make sure the target exists.
plot_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): unlike the single-metric energy-ratio cell, this loop plots
# "energy_ratio" on a linear x-axis — confirm that is intended.
for metric, name in tqdm(metrics.items(), desc="Saving plots"):
    fig, ax = plt.subplots()
    sns.histplot(
        df[["method", metric]].reset_index(drop=True),
        x=metric,
        hue="method",
        cumulative=False,
        common_norm=False,
        stat="density",
        element="step",
        ax=ax,
    )
    ax.set(title=name, xlabel=name)
    fig.savefig(plot_dir / f"plot_{metric}.png")
    # Close explicitly so figures do not pile up in memory across iterations.
    plt.close(fig)

# OLD: legacy overlay plot (reuses `metric` and `name` from the cells above)

In [None]:
# NOTE: `metric` and `name` are not assigned in this cell; it uses whatever
# values the previously executed cells left behind.
is_geom = df.method.str.startswith("GEOM")
log_x = metric == "energy_ratio"

# First pass: rows whose method starts with "GEOM", drawn as unfilled outlines.
# Second pass: all remaining rows, drawn filled, on the same axes.
for row_mask, filled in ((is_geom, False), (~is_geom, True)):
    sns.histplot(
        df[row_mask][["method", metric]].reset_index(drop=True),
        x=metric,
        hue="method",
        cumulative=False,
        fill=filled,
        common_norm=False,
        stat="density",
        element="step",
        log_scale=log_x,
    )

plt.title(name)
plt.xlabel(name)
plt.xlim(0.5, 20)
plt.show()