##### Imports

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import urllib.request

sys.path.append(str(Path("..") / "datasets" / "general-descriptors-datasets"))

from chem_data import MolBoil, MolHenry, MolLogP, MolMelt, NPLogP, NPZetaP, ProtSol

# Root directory that holds all datasets, relative to the notebook location.
DATA_PATH = Path("..").joinpath("datasets")

# Directory of the SARCOS dataset inside the dataset root.
SARCOS_PATH = DATA_PATH.joinpath("sarcos")

# Number of decimals that `r` keeps.
round_digits = 3


def r(x):
    """Round ``x`` to ``round_digits`` decimals."""
    return round(x, round_digits)

##### Metadata

In [2]:
# Fractions of the full dataset that go to each split; they have to sum to one.
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# Round before comparing so floating-point error does not trip the check.
sum_frac = round(train_frac + val_frac + test_frac, 10)
assert sum_frac == 1.0, sum_frac

# Note that this single random state is shared by all splits, so its internal
# state advances with every split that is drawn from it.
random_state = np.random.RandomState(0)

shared_params = {"shuffle": True}

# First split: (train + val) versus test, as fractions of the full dataset.
train_test_params = dict(
    shared_params,
    train_size=r(train_frac + val_frac),
    test_size=r(test_frac),
)

# Second split: train versus val, as fractions of what remains once the test
# part has been removed.
train_val_test_params = dict(
    shared_params,
    train_size=r(train_frac / (1 - test_frac)),
    test_size=r(val_frac / (1 - test_frac)),
)

for params in (train_test_params, train_val_test_params):
    print(params)

{'shuffle': True, 'train_size': 0.9, 'test_size': 0.1}
{'shuffle': True, 'train_size': 0.778, 'test_size': 0.222}


##### Function definitions

In [3]:
def get_state(random_state: np.random.RandomState) -> list:
    """Return the generator state as a list of plain Python values.

    The five entries of ``random_state.get_state()`` are converted, in order,
    to ``str``, ``list[int]``, ``int``, ``int`` and ``float`` so that the
    result is yaml serializable.
    """
    kind, keys, pos, has_gauss, cached_gaussian = random_state.get_state()

    return [
        str(kind),
        [int(key) for key in keys],
        int(pos),
        int(has_gauss),
        float(cached_gaussian),
    ]


def preprocess_shared(
    x: np.ndarray,
    y: np.ndarray,
    path: Path,
    stratify: bool,
) -> None:
    """Split ``x``/``y`` into train, val and test parts and write them to ``path``.

    The data is first split into (train + val) and test using
    ``train_test_params``, then the (train + val) part is split into train and
    val using ``train_val_test_params``. Both splits draw from the module-level
    ``random_state``; its state right before each split is stored in the
    metadata file.

    Args:
        x: Feature array; ``x.shape[0]`` and ``x.shape[1]`` are recorded as
            the total sample count and the feature count.
        y: Targets belonging to the rows of ``x``.
        path: Output directory, created if it does not exist. Receives six csv
            files (``{x,y}_{train,val,test}.csv``) and ``metadata.yaml``.
        stratify: If true, both splits are stratified on the targets.
    """
    # Snapshot the generator before it is advanced by the first split.
    tts_random_state = get_state(random_state)
    x_trainval, x_test, y_trainval, y_test = train_test_split(
        x,
        y,
        stratify=y if stratify else None,
        random_state=random_state,
        **train_test_params,
    )

    # Snapshot again: the first split has changed the generator state.
    tvs_random_state = get_state(random_state)
    x_train, x_val, y_train, y_val = train_test_split(
        x_trainval,
        y_trainval,
        stratify=y_trainval if stratify else None,
        random_state=random_state,
        **train_val_test_params,
    )

    path.mkdir(parents=True, exist_ok=True)

    arrays = {
        "x_train": x_train,
        "y_train": y_train,
        "x_val": x_val,
        "y_val": y_val,
        "x_test": x_test,
        "y_test": y_test,
    }
    for stem, values in arrays.items():
        np.savetxt(path / f"{stem}.csv", values, delimiter=",")

    with open(path / "metadata.yaml", "w") as f:
        metadata = {
            "train_split": train_test_params,
            "val_split": train_val_test_params,
            "scaler": None,
            "stratify": stratify,
            "n_train": int(y_train.shape[0]),
            "n_val": int(y_val.shape[0]),
            "n_test": int(y_test.shape[0]),
            "n_total": x.shape[0],
            "n_features": x.shape[1],
            "tts_random_state": tts_random_state,
            "tvs_random_state": tvs_random_state,
        }
        yaml.dump(metadata, f)


def preprocess_independent(
    data_dict: dict[str | int, dict[str, np.ndarray]],
    path: Path,
    stratify: bool,
) -> None:
    """Run ``preprocess_shared`` once per task.

    Args:
        data_dict: Maps a task id to a dict holding that task's ``"x"`` and
            ``"y"`` arrays.
        path: Parent directory; each task is written to ``path / str(task_id)``.
        stratify: Passed on unchanged to ``preprocess_shared``.
    """
    for task_id, task in tqdm(data_dict.items()):
        features, targets = task["x"], task["y"]
        preprocess_shared(features, targets, path / str(task_id), stratify)

##### Import datasets

In [4]:
# load sarcos
#
# The two raw .mat files are downloaded into SARCOS_PATH / "raw" only when they
# are not there yet, so re-running this cell does not hit the network again.
# Delete the files to force a fresh download.

root_url = "https://gaussianprocess.org/gpml/data/{}.mat"

train_path = SARCOS_PATH / "raw" / "sarcos_inv.mat"
test_path = SARCOS_PATH / "raw" / "sarcos_inv_test.mat"

train_path.parent.mkdir(parents=True, exist_ok=True)

for mat_name, mat_path in (
    ("sarcos_inv", train_path),
    ("sarcos_inv_test", test_path),
):
    if mat_path.exists():
        continue

    # Download to a temporary name first and rename afterwards, so that an
    # interrupted download is never mistaken for a complete cached file.
    partial_path = mat_path.with_name(mat_path.name + ".part")
    urllib.request.urlretrieve(root_url.format(mat_name), partial_path)
    partial_path.replace(mat_path)


sarcos_train = loadmat(train_path)["sarcos_inv"]
sarcos_test = loadmat(test_path)["sarcos_inv_test"]

# Stack both files row-wise; the combined data is re-split below.
sarcos = np.concatenate([sarcos_train, sarcos_test], axis=0)
del sarcos_train, sarcos_test

# The last 7 columns are used as targets, all remaining columns as features.
x = sarcos[:, :-7]
y = sarcos[:, -7:]

print(sarcos.shape)

del sarcos

preprocess_shared(x, y, SARCOS_PATH / "processed", stratify=False)

(48933, 28)


In [5]:
from ml_utils.utils import comb_iterator

lst_datasets = [
    NPLogP(drop_corr=False),
    NPZetaP(drop_corr=False),
    MolBoil(drop_corr=False),
    MolHenry(drop_corr=False),
    MolLogP(drop_corr=False),
    MolMelt(drop_corr=False),
    ProtSol(drop_corr=False),
]

x_vals, y_vals, indices, names = [], [], [], []
shared_columns = set(lst_datasets[0].features)
for dataset in lst_datasets[1:]:
    shared_columns &= set(dataset.features)

# Fix one canonical column order for the shared features (the order in which
# they appear in the first dataset). Selecting columns by name in this order
# guarantees that column j means the same feature in every dataset, even if the
# datasets list their features in different orders.
shared_order = list(
    dict.fromkeys(
        feature for feature in lst_datasets[0].features if feature in shared_columns
    )
)

# Use only the shared features
for dataset in lst_datasets:
    feature_pos = {feature: pos for pos, feature in enumerate(dataset.features)}
    column_idx = np.array(
        [feature_pos[feature] for feature in shared_order], dtype=int
    )
    x, y, index = dataset.get_data()
    x_vals.append(x[:, column_idx])
    y_vals.append(y)
    indices.append(index)
    names.append(dataset.name)

    # Shape of the full feature matrix, before restricting to shared features
    print(dataset.name, x.shape)

# Since there are some samples that are shared between datasets, we need to make
# sure that we properly stratify the splits
lst_subsets = list(comb_iterator(list(map(set, indices))))

NP LogP (147, 3517)
NP ZetaP (206, 3478)
Mol Boil (1185, 1554)
Mol Henry (777, 1711)
Mol LogP (11079, 1884)
Mol Melt (2143, 1795)
Protein Sol (3071, 3896)


In [6]:
keys = ["x_train", "y_train", "x_val", "y_val", "x_test", "y_test"]

split_datasets = {k: {n: [] for n in names} for k in keys}

# Split every subset of samples separately, dataset by dataset, and collect the
# resulting pieces per dataset.
# NOTE(review): each dataset is split with its own draw from the shared
# random_state, so a sample that occurs in two datasets is not guaranteed to
# end up in the same split in both - confirm that this is intended.
for subset in lst_subsets:
    for name, idx, x, y in zip(names, indices, x_vals, y_vals):
        # Rows of this dataset whose sample index belongs to the current subset
        in_subset = np.fromiter(
            (val in subset for val in idx), dtype=bool, count=len(idx)
        )

        if not in_subset.any():
            continue

        x_train, x_test, y_train, y_test = train_test_split(
            x[in_subset],
            y[in_subset],
            random_state=random_state,
            **train_test_params,
        )

        x_train, x_val, y_train, y_val = train_test_split(
            x_train,
            y_train,
            random_state=random_state,
            **train_val_test_params,
        )

        pieces = (x_train, y_train, x_val, y_val, x_test, y_test)
        for key, piece in zip(keys, pieces):
            split_datasets[key][name].append(piece)

# Merge the per-subset pieces of every dataset into one array per split
for per_dataset in split_datasets.values():
    for name in names:
        per_dataset[name] = np.concatenate(per_dataset[name], axis=0)

In [7]:
# Write the data to disk: one directory per dataset, one csv file per split key
for name in names:
    # Directory name is the dataset name in lower case with underscores
    tmp_name = name.replace(" ", "_").lower()
    path = DATA_PATH / "nanoparticle" / "processed" / tmp_name
    path.mkdir(parents=True, exist_ok=True)

    for key in keys:
        np.savetxt(path / f"{key}.csv", split_datasets[key][name], delimiter=",")

In [8]:
# Inspect the shape (rows, columns) of the merged "NP LogP" training features
split_datasets["x_train"]["NP LogP"].shape

(102, 1205)