## Preparation


In [None]:
import mordred as md
import numpy as np
import pandas as pd
from mordred import Autocorrelation, BaryszMatrix
from rdkit import Chem
from rdkit.Chem import AllChem, MolFromSmiles, MolToSmiles
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

from yellow_cards_workflow import BASE_DIR

In [None]:
# Read the CSV property table (it shares its base name with the SDF file that
# is parsed below); later cells index its rows by position.
gdb9_csv_path = BASE_DIR / "data/input/gdb9.sdf.csv"
properties_df = pd.read_csv(gdb9_csv_path)
properties_df

In [None]:
# Open the SDF file for molecule-by-molecule reading.
# The path is converted to `str` explicitly: `BASE_DIR / ...` yields a path
# object, and RDKit builds whose SDMolSupplier binding only accepts a string
# file name reject it with an argument-type error.
supplier = Chem.SDMolSupplier(str(BASE_DIR / "data/input/gdb9.sdf"))

In [4]:
# Materialise every record of the supplier into one array (with a progress
# bar). Entries can be None -- the next cell filters on exactly that.
molecules = np.array(list(tqdm(supplier)))
# Number of truthy entries in the array.
print(np.count_nonzero(molecules))

  0%|          | 0/133885 [00:00<?, ?it/s]

[14:44:22] Explicit valence for atom # 1 C, 5, is greater than permitted
[14:44:22] ERROR: Could not sanitize molecule ending on line 9097
[14:44:22] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[14:44:22] Explicit valence for atom # 1 C, 5, is greater than permitted
[14:44:22] ERROR: Could not sanitize molecule ending on line 35785
[14:44:22] ERROR: Explicit valence for atom # 1 C, 5, is greater than permitted
[14:44:22] Explicit valence for atom # 4 C, 5, is greater than permitted
[14:44:22] ERROR: Could not sanitize molecule ending on line 62866
[14:44:22] ERROR: Explicit valence for atom # 4 C, 5, is greater than permitted
[14:44:22] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:22] ERROR: Could not sanitize molecule ending on line 66832
[14:44:22] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:22] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:22] ERROR: Could not sanitize molecule en

132737


[14:44:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] ERROR: Could not sanitize molecule ending on line 664156
[14:44:23] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] ERROR: Could not sanitize molecule ending on line 664543
[14:44:23] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] ERROR: Could not sanitize molecule ending on line 664576
[14:44:23] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] ERROR: Could not sanitize molecule ending on line 664869
[14:44:23] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[14:44:23] ERROR: Could not sanitize molecu

In [None]:
# Positions (into `molecules`) of the entries that are not None. Despite the
# name, this is an integer index array rather than a boolean mask.
valid_mask = np.array(
    [position for position, mol in enumerate(molecules) if mol is not None]
)
valid_molecules = molecules[valid_mask]

# One canonical SMILES string per valid molecule, generated with
# isomericSmiles=False.
valid_noniso_smiles = np.array(
    [
        Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
        for mol in tqdm(valid_molecules)
    ]
)

  0%|          | 0/132737 [00:00<?, ?it/s]

In [6]:
# Distinct non-isomeric SMILES strings; with return_index=True np.unique also
# reports, for each distinct string, the position of its first occurrence in
# `valid_noniso_smiles`.
unique_noniso_smiles, unique_idx = np.unique(
    valid_noniso_smiles,
    return_index=True,
)
unique_noniso_smiles.size

132398

In [7]:
# Map the first-occurrence positions back to positions in the full molecule
# array, then put them in ascending order. Fancy indexing returns a new array,
# so sorting it in place does not touch `valid_mask`.
clean_mask = valid_mask[unique_idx]
clean_mask.sort()
clean_mask.size

132398

In [8]:
# Target column with its sign flipped and scaled down by a factor of 1000.
values = properties_df["g298_atom"].mul(-1).div(1000)
# Summary statistics restricted to the rows kept by `clean_mask`.
values.iloc[clean_mask].describe()

count    132398.000000
mean          1.630663
std           0.220153
min           0.201407
25%           1.489943
50%           1.633793
75%           1.772368
max           2.417122
Name: g298_atom, dtype: float64

In [9]:
# Restrict `molecules` and `values` to the rows selected by `clean_mask`.
#
# Both names are overwritten with their filtered versions, so an unguarded
# re-run of this cell would index the already-filtered arrays again (and
# `values` would no longer have `.to_numpy()`). The length checks make the cell
# idempotent: `clean_mask` holds unique, ascending positions into the
# unfiltered data, so equal lengths mean either the filter was already applied
# or it would be the identity selection anyway.
if len(molecules) != len(clean_mask):
    molecules = molecules[clean_mask]

# np.asarray accepts both the pandas Series (first run) and the ndarray left
# behind by a previous run of this cell.
values = np.asarray(values)
if len(values) != len(clean_mask):
    values = values[clean_mask]

## Clean DataFrame


In [None]:
# Canonical SMILES with isomericSmiles=True for every entry of `molecules`
# (shown with a progress bar).
smiles = np.array(
    [
        Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
        for mol in tqdm(molecules)
    ]
)

  0%|          | 0/132398 [00:00<?, ?it/s]

In [None]:
# Reload the raw property table from disk for this section.
gdb9_csv_path = BASE_DIR / "data/input/gdb9.sdf.csv"
properties_df = pd.read_csv(gdb9_csv_path)
properties_df

In [None]:
# Keep only the rows selected by `clean_mask` and add the SMILES strings as
# the second column.
clean_df = properties_df.iloc[clean_mask].copy()
clean_df.insert(loc=1, column="smiles", value=smiles)

# Order the rows by SMILES and export them without the index column.
clean_df = clean_df.sort_values(by="smiles")
clean_csv_path = BASE_DIR / "data/processed/qm9-clean.csv"
clean_df.to_csv(clean_csv_path, sep=";", index=False)
clean_df

In [None]:
# Same selection and SMILES column as the sorted export, but the rows stay in
# `clean_mask` order (no sort by SMILES) and the index is written to the file
# as well (to_csv is called without index=False).
clean_unsorted_df = properties_df.iloc[clean_mask].copy()
clean_unsorted_df.insert(loc=1, column="smiles", value=smiles)
unsorted_csv_path = BASE_DIR / "data/processed/qm9-clean-unsorted.csv"
clean_unsorted_df.to_csv(unsorted_csv_path, sep=";")
clean_unsorted_df

In [None]:
# Persist the row positions kept by the cleaning step.
mask_npy_path = BASE_DIR / "data/processed/qm9-mask.npy"
np.save(file=mask_npy_path, arr=clean_mask)

## Shuffled Dataset


In [None]:
# Reload the raw property table from disk for the shuffled variants.
gdb9_csv_path = BASE_DIR / "data/input/gdb9.sdf.csv"
properties_df = pd.read_csv(gdb9_csv_path)
properties_df

In [None]:
# Write one CSV per shuffle percentage. In each file, the selected percentage
# of rows have everything from the second column onward permuted among
# themselves; the first column stays in place.
clean_properties_df = properties_df.iloc[clean_mask, :].copy(deep=True)
n_clean_rows = len(clean_properties_df)

for i in [0.5, 1, 2.5, 5, 10]:
    # Fresh generator with the same seed for every percentage.
    rng = np.random.default_rng(seed=42)
    n_rows_to_shuffle = int((i / 100) * n_clean_rows)
    # Row positions drawn without replacement, returned in random order.
    shuffle_idx = rng.choice(
        n_clean_rows,
        size=n_rows_to_shuffle,
        replace=False,
        shuffle=True,
    )

    new_properties_df = clean_properties_df.copy(deep=True)
    # The same positions in ascending order receive the rows taken in random
    # order, which permutes the values within the selected subset.
    shuffle_targets = sorted(shuffle_idx)
    shuffled_rows = clean_properties_df.iloc[shuffle_idx, 1:]
    new_properties_df.iloc[shuffle_targets, 1:] = shuffled_rows

    shuffle_csv_path = BASE_DIR / f"data/processed/qm9-shuffle-{i}.csv"
    new_properties_df.to_csv(shuffle_csv_path, sep=";")

## Variable Constant Value Addition


In [None]:
# Reload the raw property table from disk for the constant-offset variants.
gdb9_csv_path = BASE_DIR / "data/input/gdb9.sdf.csv"
properties_df = pd.read_csv(gdb9_csv_path)
properties_df

In [13]:
# Summary statistics of the raw (unscaled, unfiltered) target column.
properties_df.loc[:, "g298_atom"].describe()

count    133885.000000
mean      -1629.388196
std         220.207088
min       -2417.121997
25%       -1771.350603
50%       -1632.224955
75%       -1488.291333
max        -201.407171
Name: g298_atom, dtype: float64

In [None]:
# Write one CSV per (percentage, delta) pair. In each file, the selected
# percentage of rows have `1000 * delta` subtracted from every column from the
# second one onward.
# NOTE(review): the offset is applied to all of those columns, not only to
# "g298_atom" -- confirm that this is intended.
clean_properties_df = properties_df.iloc[clean_mask, :].copy(deep=True)
n_clean_rows = len(clean_properties_df)

offset_deltas = [
    0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175,
    0.0200, 0.0225, 0.025, 0.05, 0.075, 0.08,
    0.1, 0.125, 0.15, 0.175, 0.2, 0.25,
]

for i in [0.5, 1, 2.5, 5, 10]:
    for delta in offset_deltas:
        # Fresh generator with the same seed for every combination, so the
        # draw below depends only on the percentage `i`.
        rng = np.random.default_rng(seed=42)
        n_rows_to_modify = int((i / 100) * n_clean_rows)
        mod_idx = rng.choice(
            n_clean_rows,
            size=n_rows_to_modify,
            replace=False,
            shuffle=False,
        )

        new_mix_df = clean_properties_df.copy(deep=True)
        offset_rows = new_mix_df.iloc[mod_idx, 1:] - 1000 * delta
        new_mix_df.iloc[mod_idx, 1:] = offset_rows

        offset_csv_path = (
            BASE_DIR / f"data/processed/qm9-variable-mu-{delta}-{i}.csv"
        )
        new_mix_df.to_csv(offset_csv_path, sep=";")