# Vpop Generation with deep learning

## Introduction

The goal of this notebook is to show the potential of Invertible Neural Networks for population-based calibration.

It is divided into three steps:
- building the material needed to train the INN,
- training the INN,
- population-based calibration using the inverse model.

Linked resources: 
- [Analyzing Inverse Problems with Invertible Neural Networks](https://arxiv.org/pdf/1808.04730)
- [Gitlab project hosting internship work](https://git.novadiscovery.net/jinko/population-based-calibration)
- [Folder on jinko](https://jinko.ai/project/e0fbb5bb-8929-439a-bad6-9e12d19d9ae4?labels=738604cf-1246-4a75-80aa-907370b22273)
- [Computational model](https://jinko.ai/cm-VcQQ-CPjb) already on Jinko.ai (Cholesterol model in this example).

In [None]:
# Jinko specifics imports & initialization
# Please fold this section and do not change
import jinko_helpers as jinko

# Connect to Jinko (see README.md for more options)
# NOTE(review): every later cell goes through `jinko`, so this cell presumably
# has to be run first on a fresh kernel — confirm.
jinko.initialize()

In [None]:
# Cookbook specifics imports
from jinko_stats.calibration import INNCalibrator, INN, Subloss
import os
import json
import zipfile
import io

# Cookbook specifics constants:
# put here the constants that are specific to your cookbook like
# the reference to the Jinko items, the name of the model, etc.

folder_id = "3011250d-9fbd-4b55-a765-11dbb89e2113"
computational_model_sid = "cm-VcQQ-CPjb"
vpop_design_sid = "vd-0aaQ-VWCM"
measure_design_sid = "md-Mftn-UmB9"  # optional

# Local working directories, relative to the current working directory:
# - training_material_dir: files downloaded or built to train the INN
# - output_dir: files produced by the calibration step
# - inn_models_dir: resource directory handed to the INN
training_material_dir = os.path.normpath(
    "resources/vpop_generation_with_deep_learning/training_material"
)
output_dir = os.path.normpath("resources/vpop_generation_with_deep_learning/outputs")
inn_models_dir = os.path.normpath(
    "resources/vpop_generation_with_deep_learning/inn_models"
)

# Create the directories if they are missing. `exist_ok=True` keeps the cell
# idempotent on re-run and avoids the check-then-create race of
# `os.path.exists` followed by `os.makedirs`.
for resource_dir in (training_material_dir, output_dir, inn_models_dir):
    os.makedirs(resource_dir, exist_ok=True)

# Step 1: Create the training material

To train the invertible neural network, we first need a set of patients and their associated scalar results of interest.

To do so we will:
 - post a vpop design and generate a vpop,
 - post measures: the scalars of interest will come from those measures,
 - post the trial with the computational model, the vpop and the measures, then run it,
 - download the scalar results.

In [None]:
# Look up the computational model on Jinko; keep its name and its identifiers

model_info = jinko.get_project_item(computational_model_sid)
model_core_id = model_info["coreId"]
model_core_item_id = model_core_id["id"]
model_snapshot_id = model_core_id["snapshotId"]
model_name = model_info["name"]

Note on the vpop design:

The design will be used to create the training vpop.
The intervals should be large enough to allow for exploration of the space.  
However, if you have a good understanding of the prior value of a parameter, you can make it narrower.

In [None]:
# Fetch the vpop design from Jinko and keep a local JSON copy of it

vpop_design_id = jinko.get_core_item_id(vpop_design_sid)
vpop_design_core_item_id = vpop_design_id["id"]
vpop_design_snapshot_id = vpop_design_id["snapshotId"]

vpop_design_response = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop_generator/{vpop_design_core_item_id}/snapshots/{vpop_design_snapshot_id}",
)
vpop_design = vpop_design_response.json()

vpop_design_url = jinko.get_project_item_url_from_sid(vpop_design_sid)
print(f"Resource was correctly pulled from: {vpop_design_url}")

# The JSON copy is stored alongside the rest of the training material
vpop_design_file = os.path.join(training_material_dir, "vpop_design.json")

with open(vpop_design_file, "w") as f:
    json.dump(vpop_design, f)
print(f"Vpop Design saved as {vpop_design_file}")

In [None]:
# Create the training vpop from the vpop design, then download it as CSV

# Set here the size of the training vpop.
# The more inputs you want to calibrate, the more patients are needed to explore the space
VPOP_SIZE = 10000

# Ask Jinko to generate a vpop of VPOP_SIZE patients from the design snapshot,
# for the computational model retrieved earlier
response = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop_generator/{vpop_design_core_item_id}/snapshots/{vpop_design_snapshot_id}/vpop",
    method="POST",
    json={
        "contents": {
            "computationalModelId": {
                "coreItemId": model_core_item_id,
                "snapshotId": model_snapshot_id,
            },
            "size": VPOP_SIZE,
        },
        "tag": "VpopGeneratorOptionsForVpopDesign",
    },
    options={
        "name": f"vpop for {model_name}",
        "folder_id": folder_id,
    },
)
vpop_train_info = jinko.get_project_item_info_from_response(response)
vpop_train_core_item_id = vpop_train_info["coreItemId"]["id"]
vpop_train_snapshot_id = vpop_train_info["coreItemId"]["snapshotId"]

print(f"Resource link: {jinko.get_project_item_url_from_response(response)}")

# Download the created vpop

vpop_train_file = os.path.join(training_material_dir, "vpop_train.csv")

response = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop/{vpop_train_core_item_id}",
    options={"output_format": "text/csv"},
)

if response.status_code == 200:
    with open(vpop_train_file, "wb") as f:
        f.write(response.content)
    print(f"CSV file saved as {vpop_train_file}")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
    # Fail fast: the following cells need `vpop_train_file`, so continuing
    # silently would only surface later as a confusing missing-file error.
    response.raise_for_status()
    # `raise_for_status` only raises on 4xx/5xx; any other non-200 status
    # still means that no CSV was written.
    raise RuntimeError(
        f"Unexpected status code {response.status_code} while downloading the training vpop"
    )

In [None]:
# Optional: fetch the measure design and keep a local JSON copy of it
# Use a measure design if you want custom measures in the trial to be used as targets

measure_design_id = jinko.get_core_item_id(measure_design_sid)
measure_design_core_item_id = measure_design_id["id"]
measure_design_snapshot_id = measure_design_id["snapshotId"]

measure_design_response = jinko.make_request(
    path=f"/core/v2/scorings_manager/measure_design/{measure_design_core_item_id}/snapshots/{measure_design_snapshot_id}",
)
measure_design = measure_design_response.json()

measure_design_url = jinko.get_project_item_url_from_sid(measure_design_sid)
print(f"Resource was correctly pulled from: {measure_design_url}")

# The JSON copy is stored alongside the rest of the training material
measure_design_file = os.path.join(training_material_dir, "measure_design.json")

with open(measure_design_file, "w") as f:
    json.dump(measure_design, f)
print(f"Measure design saved as {measure_design_file}")

In [None]:
# Post the training trial: computational model + training vpop + measure design

# Each item is referenced by its (core item id, snapshot id) pair
model_ref = {
    "coreItemId": model_core_item_id,
    "snapshotId": model_snapshot_id,
}
vpop_train_ref = {
    "coreItemId": vpop_train_core_item_id,
    "snapshotId": vpop_train_snapshot_id,
}
measure_design_ref = {
    "coreItemId": measure_design_core_item_id,
    "snapshotId": measure_design_snapshot_id,
}

# Define the data payload
trial_data_payload = {
    "computationalModelId": model_ref,
    "vpopId": vpop_train_ref,
    "measureDesignId": measure_design_ref,
}

response = jinko.make_request(
    path="/core/v2/trial_manager/trial",
    method="POST",
    json=trial_data_payload,
    options={"name": f"trial for {model_name}", "folder_id": folder_id},
)

# Keep the identifiers of the created trial: they are needed to run it and to fetch its results
project_item_info = jinko.get_project_item_info_from_response(response)
trial_ref = project_item_info["coreItemId"]
trial_core_item_id = trial_ref["id"]
trial_snapshot_id = trial_ref["snapshotId"]

print(f"Resource link: {jinko.get_project_item_url_from_response(response)}")

In [None]:
# Start the training trial, then monitor it until completion
trial_run_path = f"/core/v2/trial_manager/trial/{trial_core_item_id}/snapshots/{trial_snapshot_id}/run"
response = jinko.make_request(path=trial_run_path, method="POST")

jinko.monitor_trial_until_completion(trial_core_item_id, trial_snapshot_id)

In [None]:
# Read the results summary of the trial to build the training database

results_summary_path = f"/core/v2/trial_manager/trial/{trial_core_item_id}/snapshots/{trial_snapshot_id}/results_summary"
response = jinko.make_request(results_summary_path)
response_summary = json.loads(response.content)


def _is_custom_scalar(scalar_descriptor):
    """Tell whether a scalar of the results summary carries the "Custom" label."""
    return "Custom" in scalar_descriptor["type"]["labels"]


# Only the scalars labelled "Custom" (the ones coming from measures) are kept as targets:
# they are the outputs of the model.
# This can be changed if you want to use other outputs
result_scalars = [
    scalar_descriptor["id"]
    for scalar_descriptor in filter(_is_custom_scalar, response_summary["scalars"])
]
print("List of result scalars used for targets:\n", result_scalars, "\n")

# file to store the concatenation of the vpop and their scalar results
scalar_results_train_file = os.path.join(
    training_material_dir, "scalar_results_train.csv"
)

# Request the selected scalars for the training trial. The response body is a
# ZIP archive whose first member is written to `scalar_results_train_file`.
try:
    print("Retrieving scalar results...")
    response = jinko.make_request(
        "/core/v2/result_manager/scalars_summary",
        method="POST",
        json={
            "select": result_scalars,
            "trialId": {
                "coreItemId": trial_core_item_id,
                "snapshotId": trial_snapshot_id,
            },
        },
    )
    if response.status_code == 200:
        print("Scalar results retrieved successfully.")
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive_members = archive.infolist()
            # An empty archive used to be skipped silently, leaving no file on
            # disk for the training step: report it explicitly instead.
            if not archive_members:
                raise ValueError(
                    "The scalar results archive is empty: no file to extract"
                )
            # Only the first file of the archive is extracted
            with archive.open(archive_members[0]) as extracted_file:
                with open(scalar_results_train_file, "wb") as output_file:
                    output_file.write(extracted_file.read())
            print(f"Saved scalar results as: {scalar_results_train_file}")
    else:
        print(
            f"Failed to retrieve scalar results: {response.status_code} - {response.reason}"
        )
        response.raise_for_status()
        # `raise_for_status` only raises on 4xx/5xx; any other non-200 status
        # still means that no result file was written.
        raise RuntimeError(
            f"Unexpected status code {response.status_code} while retrieving scalar results"
        )

except Exception as e:
    # Log the failure, then re-raise so that the notebook stops here
    print(f"Error during scalar results retrieval or processing: {e}")
    raise

# Step 2: Train the Invertible Neural Network

In [None]:
# Build the neural network: N_DIM is the number of marginal distributions of the vpop design

marginal_distributions = vpop_design["contents"]["marginalDistributions"]
inputs_to_calibrate = len(marginal_distributions)

inn_settings = dict(
    type="linear_batchnorm_leaky",
    N_DIM=inputs_to_calibrate,
    N_NODES=512,
    N_LAYERS=12,
    lr=0.001,
    inn_resource_dir=inn_models_dir,
)
inn = INN(**inn_settings)

In [None]:
# Both sets are written next to the rest of the training material
train_set_file, validation_set_file = [
    os.path.join(training_material_dir, set_file_name)
    for set_file_name in ("train_set.csv", "validation_set.csv")
]

# Build the two sets from the training vpop, its scalar results and the vpop design contents
inn.create_train_validation_set(
    vpop_train_file,
    scalar_results_train_file,
    vpop_design["contents"],
    train_set_file,
    validation_set_file,
)

We can visualize the training database to see where the outputs of interest are located.

In [None]:
# Display the "training" set in 2D.
# NOTE(review): `normalized=False` presumably plots the values in their original units — confirm against INN.visualize_set_2d.
inn.visualize_set_2d("training", normalized=False)

In [None]:
# Uncomment the next line to allow for retraining:
# inn.reset_inn()

# Weight given to each subloss during training
subloss_weights = {
    Subloss.OUTPUT_MSE: 1,
    Subloss.INVERSE_MSE: 1,
    Subloss.UNIT: 0.01,
}

inn.train(n_epochs=20, batch_size=32, sublosses_dict=subloss_weights)

We can visualize the accuracy as a function of the training epochs.

In [None]:
# Plot the training history of the network trained in the previous cell
inn.plot_training_epochs()

# Step 3 : Generate a calibrated Vpop

In [None]:
# Target distribution of each output; every objective pairs an output id with its distribution

ln_hdl_target = {"mean": 3, "tag": "Normal", "stdev": 0.5}
ln_ldl_target = {"lowBound": 7, "highBound": 8, "tag": "Uniform"}

objectives = [
    {"id": "ln_HDL", "distribution": ln_hdl_target},
    {"id": "ln_LDL", "distribution": ln_ldl_target},
]

In [None]:
# instantiate the Calibrator for the initial vpop design and the objectives
# The third argument is the `inn` built and trained in Step 2.
calibrator = INNCalibrator(vpop_design["contents"], objectives, inn)

In [None]:
# File receiving the samples created from the objectives
scalar_results_test = os.path.join(output_dir, "scalar_results_test.csv")

number_of_patients = 1000
# we generate twice more samples than the number of patients we want in the end in case some are filtered
ratio = 2
n_target_samples = number_of_patients * ratio

calibrator.create_db_from_output_law(
    objectives=objectives,
    filesave=scalar_results_test,
    n_samples=n_target_samples,
    reuse=False,
)

In [None]:
# File receiving the calibrated vpop
vpop_test_file = os.path.join(output_dir, "vpop_test.csv")

calibration_options = {
    "filesave": vpop_test_file,
    "viz": False,
    # Remove patient that do not respect the previous intervals
    "filter_interval": True,
    # Remove patients with INN results more than 3 std dev away.
    "sigma": 3,
    # Remove patients that have NAs.
    "dropna": True,
    # If inputs have been normalized for training.
    "denormalize_input": True,
    # We sample 2 * number_of_patients from the db to account for filtering
    "ratio": ratio,
    # Sample the supplementary variables from a normal or set them to 0.
    "set_to_zero": False,
}

calibrator.calibrate_from_output(
    inn,
    number_of_patients,  # Number of patients in the output Vpop.
    **calibration_options,
)

In [None]:
# Upload the calibrated vpop (CSV content) to Jinko

with open(vpop_test_file, "r") as file:
    vpop_calibrated = file.read()

response = jinko.make_request(
    path="/core/v2/vpop_manager/vpop",
    method="POST",
    csv_data=vpop_calibrated,
    options={
        "name": f"calibrated vpop for {model_name}",
        "folder_id": folder_id,
    },
)

# Keep the identifiers of the uploaded vpop: the trial is patched with them afterwards
project_item_info = jinko.get_project_item_info_from_response(response)
vpop_test_ref = project_item_info["coreItemId"]
vpop_test_core_item_id = vpop_test_ref["id"]
vpop_test_snapshot_id = vpop_test_ref["snapshotId"]

print(f"Resource link: {jinko.get_project_item_url_from_response(response)}")

In [None]:
# Patch the main trial so that it references the calibrated vpop

calibrated_vpop_ref = {
    "coreItemId": vpop_test_core_item_id,
    "snapshotId": vpop_test_snapshot_id,
}

response = jinko.make_request(
    path=f"/core/v2/trial_manager/trial/{trial_core_item_id}/snapshots/{trial_snapshot_id}",
    method="PATCH",
    json={"vpopId": calibrated_vpop_ref},
    options={"folder_id": folder_id},
)

# The identifiers read from the response are the ones used to run the calibrated trial
project_item_info = jinko.get_project_item_info_from_response(response)
calibrated_trial_ref = project_item_info["coreItemId"]
trial_calibrated_core_item_id = calibrated_trial_ref["id"]
trial_calibrated_snapshot_id = calibrated_trial_ref["snapshotId"]

print(f"Resource link: {jinko.get_project_item_url_from_response(response)}")

In [None]:
# Start the calibrated trial
calibrated_run_path = f"/core/v2/trial_manager/trial/{trial_calibrated_core_item_id}/snapshots/{trial_calibrated_snapshot_id}/run"
response = jinko.make_request(path=calibrated_run_path, method="POST")

# Monitor the status of the trial until completion
jinko.monitor_trial_until_completion(
    trial_calibrated_core_item_id,
    trial_calibrated_snapshot_id,
)

In [None]:
# Download the results of the calibrated trial

scalar_results_test_simulated_file = os.path.join(
    output_dir, "scalar_results_test_simulated.csv"
)

# Request the selected scalars for the calibrated trial. The response body is a
# ZIP archive whose first member is written to `scalar_results_test_simulated_file`.
try:
    print("Retrieving scalar results...")
    response = jinko.make_request(
        "/core/v2/result_manager/scalars_summary",
        method="POST",
        json={
            "select": result_scalars,
            "trialId": {
                "coreItemId": trial_calibrated_core_item_id,
                "snapshotId": trial_calibrated_snapshot_id,
            },
        },
    )
    if response.status_code == 200:
        print("Scalar results retrieved successfully.")
        # Extract the ZIP archive content
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive_members = archive.infolist()
            # An empty archive used to be skipped silently, leaving no file on
            # disk for the final visualization: report it explicitly instead.
            if not archive_members:
                raise ValueError(
                    "The scalar results archive is empty: no file to extract"
                )
            # Extract the first file in the archive and write it with the desired name
            with archive.open(archive_members[0]) as extracted_file:
                with open(scalar_results_test_simulated_file, "wb") as output_file:
                    output_file.write(extracted_file.read())
            print(f"Saved scalar results as: {scalar_results_test_simulated_file}")
    else:
        print(
            f"Failed to retrieve scalar results: {response.status_code} - {response.reason}"
        )
        response.raise_for_status()
        # `raise_for_status` only raises on 4xx/5xx; any other non-200 status
        # still means that no result file was written.
        raise RuntimeError(
            f"Unexpected status code {response.status_code} while retrieving scalar results"
        )

except Exception as e:
    # Log the failure, then re-raise so that the notebook stops here
    print(f"Error during scalar results retrieval or processing: {e}")
    raise

In [None]:
# Visualize the fit of the generated vpop, given the calibrated vpop file, the scalar
# results simulated with it, and the scalar results of the training trial.
# NOTE(review): presumably compares the simulated outputs with the target distributions — confirm against INNCalibrator.
calibrator.visualize_generated_vpop_fit(
    vpop_test_file, scalar_results_test_simulated_file, scalar_results_train_file
)