In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from pycox.datasets import support

We load the Study to Understand Prognoses and Preferences for Outcomes and Risks of Treatments (SUPPORT) dataset via the `pycox` package. More information on this dataset can be found [here](https://doi.org/10.1186/s12874-018-0482-1).

In [None]:
# Read the SUPPORT dataset into a DataFrame using the loader shipped with pycox.
df = support.read_df()

We transform the age column (`x0`) into a date of birth (`dob`), so that this dataset contains categoricals, dates, floats and integers. This lets it cover as much of our codebase as possible whilst remaining interesting.

In [None]:
# Reference date from which each age is subtracted to obtain a date of birth.
ref_date = datetime(2023, 1, 1)

# Ages are in years; 365.2425 is the mean length of a Gregorian year in days.
age_as_timedelta = pd.to_timedelta(df["x0"] * 365.2425, unit="D")

# Replace the age values with dates in place of `x0`, then give the column a
# name that reflects its new meaning.
df = df.assign(x0=ref_date - age_as_timedelta).rename(columns={"x0": "dob"})

We also want to create a string column from one of the categoricals. We arbitrarily choose `x3`.

In [None]:
# Map every distinct `x3` code to an uppercase letter (0 -> "A", 1 -> "B", ...)
# so that the column holds strings instead of numbers.
x3_codes = df["x3"].unique().tolist()
_map = dict(zip(x3_codes, (chr(ord("A") + int(code)) for code in x3_codes)))
df["x3"] = df["x3"].map(_map)

We also want to create a new column whose values are sampled from a multi-modal Gaussian distribution.

In [None]:
# Preview only the first few rows instead of rendering the whole frame.
df.head()

In [None]:
# Create a column sampled from an equally weighted three-component Gaussian
# mixture with means 1.0, 50.0 and 100.0 and stds 1.0, 2.0 and 3.0.
means = [1.0, 50.0, 100.0]
stds = [1.0, 2.0, 3.0]

# Use a dedicated, seeded generator so this column can be regenerated exactly.
# It is independent of NumPy's global random state, so the draws made for the
# missingness mask further down are unaffected.
rng = np.random.default_rng(123)

# Pick a mixture component for every row, then draw all rows in one vectorised
# call instead of issuing one `normal` call per row.
component = rng.integers(0, len(means), size=df.shape[0])
df["x14"] = rng.normal(np.asarray(means)[component], np.asarray(stds)[component])

Next we randomly introduce missingness into the data, setting the seed such that other users can recreate the dataset we provide if desired.

In [None]:
# Seed the global NumPy state so the missingness pattern is reproducible.
np.random.seed(123)

# Each cell is independently flagged as missing with probability 0.1.
mask = np.random.choice([True, False], size=df.shape, p=[0.1, 0.9])

# Exclude the `event` column from missingness. It is selected by name because
# `x14` was appended after it, so `event` is no longer the last column and a
# positional `-1` would protect `x14` instead.
mask[:, df.columns.get_loc("event")] = False

df[mask] = np.nan

Next we inspect the data:

In [None]:
# Print the per-column dtypes and non-null counts, then show the first rows.
df.info()
df.head()

Finally, we write the data to a csv file.

In [None]:
# Persist the prepared dataset; the row index is omitted from the file.
output_path = "../data/support.csv"
df.to_csv(output_path, index=False)