In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import yaml
from sklearn.datasets import load_diabetes
from scipy.io import loadmat
import sys

sys.path.append(str(Path("..") / "datasets" / "general-descriptors-datasets"))

from chem_data import NPLogP, NPZetaP, ProtSol, MolLogP, MolHenry, MolBoil, MolMelt

# Root directory that holds every dataset handled by this notebook.
DATA_PATH = Path("..") / "datasets"

SARCOS_PATH = DATA_PATH / "sarcos"
PARKINSONS_PATH = DATA_PATH / "parkinsons"

# Number of decimal places kept by ``r``.
round_digits = 3


def r(x):
    """Round ``x`` to ``round_digits`` decimal places.

    ``round_digits`` is looked up at call time, so changing the module-level
    value affects subsequent calls.
    """
    return round(x, round_digits)

In [2]:
# Fraction of every dataset assigned to the train / validation / test split.
train_frac, val_frac, test_frac = 0.7, 0.2, 0.1

# Round before comparing so floating-point noise cannot trip the check.
sum_frac = round(train_frac + val_frac + test_frac, 10)
assert sum_frac == 1.0, sum_frac

# Note that this single random state is used for all splits, so its internal
# state changes every time a split is drawn.
random_state = np.random.RandomState(0)

shared_params = dict(shuffle=True)

# First split: (train + val) versus test, as fractions of the full data.
train_test_params = dict(
    shared_params,
    train_size=r(train_frac + val_frac),
    test_size=r(test_frac),
)

# Second split: train versus val, as fractions of what remains once the test
# part has been removed.
train_val_test_params = dict(
    shared_params,
    train_size=r(train_frac / (1 - test_frac)),
    test_size=r(val_frac / (1 - test_frac)),
)

print(train_test_params)
print(train_val_test_params)

{'shuffle': True, 'train_size': 0.9, 'test_size': 0.1}
{'shuffle': True, 'train_size': 0.778, 'test_size': 0.222}


In [3]:
# TODO: These should be combined with process_pmlb.ipynb


def get_state(random_state: np.random.RandomState) -> list:
    """Return the state of ``random_state`` as plain Python objects.

    The five entries returned by ``RandomState.get_state()`` are converted to
    built-in ``str`` / ``int`` / ``float`` values (and a list of ints) so that
    the result is yaml serializable.
    """
    name, key, pos, has_gauss, cached_gaussian = random_state.get_state()

    return [
        str(name),
        [int(word) for word in key],
        int(pos),
        int(has_gauss),
        float(cached_gaussian),
    ]


def preprocess_shared(
    x: np.ndarray,
    y: np.ndarray,
    path: Path,
    stratify: bool,
) -> None:
    """Split ``x``/``y`` into train, validation and test parts and write them to ``path``.

    The data is split twice with ``train_test_split``: first into (train + val)
    versus test using ``train_test_params``, then the remainder into train
    versus val using ``train_val_test_params``. Both calls consume the
    module-level ``random_state``; its state is captured right before each call
    and stored in the metadata file.

    Args:
        x: Feature array; ``x.shape[0]`` and ``x.shape[1]`` are recorded as the
            total number of samples and the number of features.
        y: Targets, aligned with the rows of ``x``.
        path: Output directory (created if missing). Receives the six
            ``x_*.csv`` / ``y_*.csv`` files and ``metadata.yaml``.
        stratify: If true, the targets are passed as ``stratify`` to both splits.
    """
    # Snapshot the generator before it is consumed by the first split.
    tts_random_state = get_state(random_state)
    x_rest, x_test, y_rest, y_test = train_test_split(
        x,
        y,
        stratify=y if stratify else None,
        random_state=random_state,
        **train_test_params,
    )

    # Same again for the train/val split of the remaining rows.
    tvs_random_state = get_state(random_state)
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest,
        y_rest,
        stratify=y_rest if stratify else None,
        random_state=random_state,
        **train_val_test_params,
    )

    path.mkdir(parents=True, exist_ok=True)

    outputs = (
        ("x_train.csv", x_train),
        ("y_train.csv", y_train),
        ("x_val.csv", x_val),
        ("y_val.csv", y_val),
        ("x_test.csv", x_test),
        ("y_test.csv", y_test),
    )
    for file_name, values in outputs:
        np.savetxt(path / file_name, values, delimiter=",")

    with open(path / "metadata.yaml", "w") as f:
        metadata = {
            "train_split": train_test_params,
            "val_split": train_val_test_params,
            "scaler": None,
            "stratify": stratify,
            "n_train": int(y_train.shape[0]),
            "n_val": int(y_val.shape[0]),
            "n_test": int(y_test.shape[0]),
            "n_total": x.shape[0],
            "n_features": x.shape[1],
            "tts_random_state": tts_random_state,
            "tvs_random_state": tvs_random_state,
        }
        yaml.dump(metadata, f)


def preprocess_independent(
    data_dict: dict[str | int, dict[str, np.ndarray]],
    path: Path,
    stratify: bool,
) -> None:
    """Run ``preprocess_shared`` separately for every task in ``data_dict``.

    Args:
        data_dict: Maps a task id to a dict holding that task's arrays under
            the keys ``"x"`` and ``"y"``.
        path: Parent output directory; each task is written to the
            sub-directory ``path / str(task_id)``.
        stratify: Forwarded unchanged to ``preprocess_shared``.
    """
    for task_id, arrays in tqdm(data_dict.items()):
        preprocess_shared(arrays["x"], arrays["y"], path / str(task_id), stratify)

In [11]:
# Load SARCOS: the raw train and test .mat files are stacked row-wise into a
# single array.
sarcos = np.concatenate(
    [
        loadmat(SARCOS_PATH / "raw" / "sarcos_inv.mat")["sarcos_inv"],
        loadmat(SARCOS_PATH / "raw" / "sarcos_inv_test.mat")["sarcos_inv_test"],
    ],
    axis=0,
)

# Currently disabled: treat the last 7 columns as targets, the rest as
# features, and write the processed splits.
# x = sarcos[:, :-7]
# y = sarcos[:, -7:]

# del sarcos

# preprocess_shared(x, y, SARCOS_PATH / "processed", stratify=False)

In [12]:
# Sanity check: (rows, columns) of the combined SARCOS array.
sarcos.shape

(48933, 28)

In [14]:
# Load the Parkinsons UPDRS data and build one task per subject.

parkinsons = pd.read_csv(PARKINSONS_PATH / "raw" / "parkinsons_updrs.data")

pred_column = "total_UPDRS"
drop_columns = [
    "subject#",  # unique identifier, common for each subject
    "motor_UPDRS",  # probably highly correlated with total_UPDRS
    "total_UPDRS",  # target
    "test_time",  # highly correlated with UPDRS because it's time series data
]

# tasks[subject id] -> {"x": feature array, "y": target array} for that subject.
tasks = {}
for subject_id, subject_data in parkinsons.groupby("subject#"):
    x = subject_data.drop(columns=drop_columns).values
    y = subject_data[pred_column].values

    tasks[subject_id] = {"x": x, "y": y}

# preprocess_independent(tasks, PARKINSONS_PATH / "processed", stratify=False)

In [19]:
# Smallest and largest number of rows per subject, plus the number of feature
# columns (read from subject 1).
sizes = [task["x"].shape[0] for task in tasks.values()]
np.min(sizes), np.max(sizes), tasks[1]["x"].shape[1]

(101, 168, 18)

In [6]:
# Preview the raw Parkinsons dataframe.
parkinsons

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,0.000034,0.00401,0.00317,...,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,1,72,0,12.6660,28.447,34.894,0.00300,0.000017,0.00132,0.00150,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,1,72,0,19.6810,28.695,35.389,0.00481,0.000025,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.6470,28.905,35.810,0.00528,0.000027,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,1,72,0,33.6420,29.187,36.375,0.00335,0.000020,0.00093,0.00130,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,42,61,0,142.7900,22.485,33.485,0.00406,0.000031,0.00167,0.00168,...,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,42,61,0,149.8400,21.988,32.988,0.00297,0.000025,0.00119,0.00147,...,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,42,61,0,156.8200,21.495,32.495,0.00349,0.000025,0.00152,0.00187,...,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,42,61,0,163.7300,21.007,32.007,0.00281,0.000020,0.00128,0.00151,...,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


In [5]:
from ml_utils.utils import comb_iterator

# One instance per descriptor dataset, all created with drop_corr=False
# (presumably this keeps correlated descriptors; confirm in chem_data).
lst_datasets = [
    dataset_cls(drop_corr=False)
    for dataset_cls in (NPLogP, NPZetaP, MolBoil, MolHenry, MolLogP, MolMelt, ProtSol)
]

# Only features that occur in every dataset are kept.
shared_columns = set.intersection(*(set(ds.features) for ds in lst_datasets))

x_vals, y_vals, indices, names = [], [], [], []
for dataset in lst_datasets:
    column_mask = np.array([feature in shared_columns for feature in dataset.features])
    x, y, index = dataset.get_data()

    x_vals.append(x[:, column_mask])
    y_vals.append(y)
    indices.append(index)
    names.append(dataset.name)

    # The printed shape is the one before the column mask is applied.
    print(dataset.name, x.shape)

# comb_iterator is a project helper; its result is consumed by the splitting
# cell, which tests index values for membership in each element.
lst_subsets = list(comb_iterator([set(idx) for idx in indices]))

NP LogP (147, 3517)
NP ZetaP (206, 3478)
Mol Boil (1185, 1554)
Mol Henry (777, 1711)
Mol LogP (11079, 1884)
Mol Melt (2143, 1795)
Protein Sol (3071, 3896)


In [6]:
keys = ["x_train", "y_train", "x_val", "y_val", "x_test", "y_test"]

# split_datasets[key][dataset name] -> list of per-subset arrays, concatenated
# into a single array per dataset at the end of this cell.
split_datasets = {k: {n: [] for n in names} for k in keys}

# NOTE(review): every dataset is split on its own with the shared random state,
# so an index value that occurs in several datasets is not guaranteed to land
# in the same split for each of them. Confirm that this is intended.
for subset in lst_subsets:
    for name, idx, x, y in zip(names, indices, x_vals, y_vals):
        # Rows of this dataset whose index value belongs to the current subset.
        in_subset = np.array([val in subset for val in idx], dtype=bool)
        if not in_subset.any():
            continue

        x_train, x_test, y_train, y_test = train_test_split(
            x[in_subset], y[in_subset], **train_test_params, random_state=random_state
        )

        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, **train_val_test_params, random_state=random_state
        )

        # The tuple below follows the order of ``keys``.
        parts = (x_train, y_train, x_val, y_val, x_test, y_test)
        for key, part in zip(keys, parts):
            split_datasets[key][name].append(part)

# Stack the per-subset pieces of every dataset into one array per split.
for key in keys:
    for name, vals in split_datasets[key].items():
        split_datasets[key][name] = np.concatenate(vals, axis=0)

In [9]:
for name in names:
    # The output directory depends only on the dataset, so it is derived and
    # created once per dataset instead of once per saved file.
    tmp_name = name.replace(" ", "_").lower()
    path = DATA_PATH / "nanoparticle" / "processed" / tmp_name
    path.mkdir(parents=True, exist_ok=True)

    for key in keys:
        np.savetxt(
            path / f"{key}.csv",
            split_datasets[key][name],
            delimiter=",",
        )

    # TODO: the split metadata has to be saved as well

In [10]:
# Sanity check: shape of the "NP LogP" training features after splitting
# (columns are restricted to the features shared by all datasets).
split_datasets["x_train"]["NP LogP"].shape

(102, 1205)