In [2]:
import numpy as np
import pandas as pd

# Uses `Distribution`, `run_de`, and `fitness` in the file
from method import *

# Clean up the output
np.set_printoptions(precision=2, suppress=True)

# Random seed for reproducibility
SEED = 20241225

## Real data experiments in the paper
This file illustrates how to replicate the real data experiments, which is very similar to the instruction given in `Readme.md`

Assume that the CGM data from Shah et al. (2019) and Brown et al. (2019) are accessed and processed as guided in the [Awesome-CGM](https://github.com/IrinaStatsLab/Awesome-CGM) repository, saved in `./data/shah2019_filtered.csv` and `./data/brown2019_filtered.csv`. 

These datasets are measured by Dexcom G6, having glucose ranges from 39 mg/dL to 401 mg/dL. This range should be specified as `ran=(39, 401)` when creating a `Distribution` class in a data-dependent manner.

As the differential evolution (DE) is a stochastic algorithm, we fix `SEED=20241225` to reproduce the experimental results illustrated in the paper. 

In [5]:
# Load data
data_shah = pd.read_csv("./data/shah2019_filtered.csv")
grouped_data = data_shah.groupby('id').agg({'gl': list}).reset_index()
data_class_shah = Distribution(grouped_data["gl"], ran=(39., 401.), M=200)

# data_brown = pd.read_csv("./data/brown2019_filtered.csv")
data_brown = pd.read_csv("./data/o_malley2021_filtered.csv")
grouped_data = data_brown.groupby('id').agg({'gl': list}).reset_index()
data_class_brown = Distribution(grouped_data["gl"], ran=(39., 401.), M=200)

After making data into `Distribution` classes, we can run `run_de` with specified target number of thresholds `K` and the threshold-optimality criteria: `loss="Loss1"` or `loss="Loss2"`. 

Below are example usages with $K=4$.

##### Shah dataset with K=4

In [6]:
# L_1 loss
best_cutoffs, min_loss = run_de(data_class_shah, K=4, loss="Loss1", seed=SEED)
print(best_cutoffs)
print("Obtained loss:", min_loss)

[ 39.    75.83 100.69 123.7  154.96 401.  ]
Obtained loss: 16.619439163631053


In [None]:
# L_2 loss (takes about 4 minutes)
best_cutoffs2, min_loss2 = run_de(data_class_shah, K=4, loss="Loss2", seed=SEED)
print(best_cutoffs2)
print("Obtained loss (x 10^{-3}):", min_loss2 / 1000)

[ 39.   168.89 249.13 294.49 351.83 401.  ]
Obtained loss (x 10^{-3}): 30.808015190380164


##### Brown dataset with K=4

In [None]:
# L_1 loss
best_cutoffs, min_loss = run_de(data_class_brown, K=4, loss="Loss1", seed=SEED)
print(best_cutoffs)
print("Obtained loss:", min_loss)

[ 39.    84.88 171.2  232.62 301.58 401.  ]
Obtained loss: 40.657398455283754


In [None]:
# L_2 loss (takes about 5 minutes)
best_cutoffs2, min_loss2 = run_de(data_class_brown, K=4, loss="Loss2", seed=SEED)
print(best_cutoffs2)
print("Obtained loss (x 10^{-3}):", min_loss2 / 1000)

[ 39.   168.89 249.13 294.49 351.83 401.  ]
Obtained loss (x 10^{-3}): 30.808015190380164


### Optimality measures at the traditional thresholds

In [10]:
print("Shah dataset")
print("    L1 at two traditional:", fitness([70, 181], data_class_shah, loss="Loss1"))
print("    L1 at four traditional:", fitness([54, 70, 181, 251], data_class_shah, loss="Loss1"))

# Precompute the Wasserstein distance matrix for L2 loss calculation
data_class_shah.Wdist_matrix()
print("    L2 at two traditional:", fitness([70, 181], data_class_shah, loss="Loss2") / 1000, "(multiplied by 10^{-3})")
print("    L2 at four traditional:", fitness([54, 70, 181, 251], data_class_shah, loss="Loss2") / 1000, "(multiplied by 10^{-3})")

print("\nBrown dataset")
print("    L1 at two traditional:", fitness([70, 181], data_class_brown, loss="Loss1"))
print("    L1 at four traditional:", fitness([54, 70, 181, 251], data_class_brown, loss="Loss1"))

# Precompute the Wasserstein distance matrix for L2 loss calculation
data_class_brown.Wdist_matrix()
print("    L2 at two traditional:", fitness([70, 181], data_class_brown, loss="Loss2") / 1000, "(multiplied by 10^{-3})")
print("    L2 at four traditional:", fitness([54, 70, 181, 251], data_class_brown, loss="Loss2") / 1000, "(multiplied by 10^{-3})")

Shah dataset
    L1 at two traditional: 656.8785795451477
    L1 at four traditional: 655.939314472095
    L2 at two traditional: 28.32808440253849 (multiplied by 10^{-3})
    L2 at four traditional: 28.801864648312193 (multiplied by 10^{-3})

Brown dataset
    L1 at two traditional: 1235.9789988634705
    L1 at four traditional: 159.90153344550367
    L2 at two traditional: 1450.9149694569626 (multiplied by 10^{-3})
    L2 at four traditional: 92.74154648603994 (multiplied by 10^{-3})
