# Subsampling

See the corresponding documentation on [jinkommunity](https://community.jinko.ai/t/h7h4gf5/how-to-subsample-a-virtual-population)

In [None]:
# Jinko specifics imports & initialization
# Please fold this section and do not edit it
import sys

# Put "../lib" first on the module search path before importing jinko_helpers
# (presumably that is where the helper module lives - confirm against the repo layout)
sys.path.insert(0, "../lib")
import jinko_helpers as jinko

# Connect to Jinko (see README.md for more options)
jinko.initialize()

In [None]:
# Cookbook specifics imports
import io
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
import textwrap

## Step 0: Select trial of interest

In [None]:
"""
trial_short_id can be retrieved from the URL of your trial in Jinko, pattern is `https://jinko.ai/<trial_short_id>`
"""

# Short ID of the trial whose virtual population will be subsampled
trial_short_id = "tr-dI79-x7V2"

# Folder ID, found in the project URL: `https://jinko.ai/project/<project_id>?labels=<folder_id>`
# The subsampling designs and the subsampled Vpops are saved in this folder;
# it does not have to be the folder that contains the initial trial.
folder_id = "8764d184-3a1f-4524-90bc-49af9e640bdb"

## Step 1: Pick a trial version

In [None]:
# Version selection: pin either a revision number or a label.
# When nothing is pinned, the last version is returned by default.
revision = 26
label = None

response = jinko.get_project_item_new(
    sid=trial_short_id,
    revision=revision,
    label=label,
)
# Both identifiers of the selected version live under the "coreId" entry
trial_core_item_id = response["coreId"]["id"]
trial_snapshot_id = response["coreId"]["snapshotId"]

# # Uncomment the following if you want to use the latest completed or stopped version
# response = jinko.get_latest_calib_with_status(shortId=trial_short_id, statuses=["completed", "stopped"])
# trial_core_item_id, trial_snapshot_id = response["coreItemId"], response["snapshotId"]

print(
    f"Picked Trial with coreItemId: {trial_core_item_id}, snapshotId: {trial_snapshot_id}"
)

# Build a link to the trial, pointing at the pinned revision when one is set
trial_link = jinko.get_project_item_url_from_sid(trial_short_id)
if revision:
    trial_link_with_revision = f"{trial_link}?revision={revision}"
else:
    trial_link_with_revision = trial_link
print(f"Trial link: {trial_link_with_revision}")

## Step 2: Define the subsampling design

In [None]:
# Subsampling design sent to the vpop generator endpoint.
# It references the trial version picked above and lists the filters, the
# target distributions/statistics and the additional scalars of the design.
vpop_generator_payload = {
    "contents": {
        # Trial version (core item + snapshot) the design is based on
        "trialId": {
            "coreItemId": trial_core_item_id,
            "snapshotId": trial_snapshot_id,
        },
        # Filters: one on a numeric descriptor, one on a categorical descriptor
        "filters": [
            {
                "tag": "DescriptorFilter",
                "contents": [
                    {
                        "descriptorId": "isResponse.tend",
                        "arm": "DoubleDose",
                        "operator": "Gte",
                        "value": 0.5,
                    }
                ],
            },
            {
                "tag": "CategoricalDescriptorFilter",
                "contents": [
                    {
                        "descriptorId": "sex",
                        "arm": "crossArms",
                        "levels": ["female"],
                    }
                ],
            },
        ],
        # Target marginal distributions, one entry per (scalar id, arm)
        "targetMarginals": [
            {
                "arm": "SingleDose",
                "distribution": {
                    "tag": "LogNormal",
                    "mean": -4,
                    "stdev": 0.1,
                    "base": 10,
                },
                "id": "Blood.Drug-avg-from-PT0S-to-P100D",
                "weight": 1,
            },
            {
                "arm": "DoubleDose",
                "distribution": {"tag": "Uniform", "lowBound": 2e-4, "highBound": 4e-4},
                "id": "Blood.Drug-avg-from-PT0S-to-P100D",
                "weight": 1,
            },
        ],
        # Target distribution of a categorical descriptor (the proportions below sum to 1)
        "targetCategoricals": [
            {
                "id": "origin",
                "arm": "crossArms",
                "distribution": {
                    "tag": "Categorical",
                    "catMapping": {
                        "EU": 0.6,
                        "APAC": 0.2,
                        "US": 0.2,
                    },
                },
                "weight": 1,
            }
        ],
        # Target correlation between two scalars; note the lower weight (0.1)
        # compared to the other targets (1)
        "targetCorrelations": [
            {
                "correlateX": {
                    "arm": None,
                    "id": "kClearanceDrug.tmin",
                },
                "correlateY": {
                    "arm": "DoubleDose",
                    "id": "tumorBurden.tend",
                },
                "correlationCoefficient": 0.5,
                "weight": 0.1,
            }
        ],
        # Target survival curve for a time-to-event scalar.
        # NOTE(review): "timeVals" has 11 entries but "cumulativeSurvivalRates"
        # has only 10 - if the API pairs them element-wise, one rate is missing;
        # confirm the intended values before relying on this target.
        "targetSurvivals": [
            {
                "timeToEventScalarId": "timeOfTumorReduction",
                "arm": "SingleDose",
                "timeVals": [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                "cumulativeSurvivalRates": [
                    0.9,
                    0.9,
                    0.6,
                    0.5,
                    0.4,
                    0.2,
                    0.15,
                    0.1,
                    0.05,
                    0,
                ],
                "timeUnit": "day",
                "weight": 1,
            }
        ],
        # Target mean / standard deviation for a scalar
        "targetSummaryStatistics": [
            {
                "arm": "Control",
                "id": "Tumor.CancerCell-at-P100D",
                "mean": 1e11,
                "standardDeviation": 2e10,
                "weight": 1,
            }
        ],
        # Additional scalars attached to the design; the ids listed here are
        # the ones used by the filters and targets above that are not marginals
        "additionalScalars": [
            {"id": "tumorBurden.tend", "arm": "DoubleDose"},
            {"id": "kClearanceDrug.tmin", "arm": None},
            {"id": "timeOfTumorReduction", "arm": "SingleDose"},
            {"id": "Tumor.CancerCell-at-P100D", "arm": "Control"},
            {"id": "origin", "arm": None},
            {"id": "sex", "arm": None},
        ],
    },
    "tag": "FromSubsamplingGenerator",
}

# If both design identifiers already exist in the global namespace (i.e. this
# cell has been run before in the current session), send the payload with PUT
# to the existing design; otherwise POST it to create a design in `folder_id`.
if all(
    name in globals()
    for name in ("subsampling_core_item_id", "subsampling_snapshot_id")
):
    response = jinko.make_request(
        path=f"/core/v2/vpop_manager/vpop_generator/{subsampling_core_item_id}",
        method="PUT",
        json=vpop_generator_payload,
    )
else:
    response = jinko.make_request(
        path="/core/v2/vpop_manager/vpop_generator",
        method="POST",
        json=vpop_generator_payload,
        options={"name": "Subsampling Design", "folder_id": folder_id},
    )

# Both branches read the resulting identifiers and URL the same way
project_item_info = jinko.get_project_item_info_from_response(response)
subsampling_core_item_id = project_item_info["coreItemId"]["id"]
subsampling_snapshot_id = project_item_info["coreItemId"]["snapshotId"]
subsampling_url = jinko.get_project_item_url_from_response(response)
print(f"Subsampling design: {subsampling_url}")

## Step 3: Run the subsampling design

In [None]:
# Options of the subsampling run: number of samples to draw, random seed and
# the settings of the simulated-annealing method
subsampling_payload = {
    "tag": "VpopGeneratorOptionsForSubsampling",
    "contents": {
        "numSamples": 50,
        "seed": 0,
        "method": {
            "numIterations": 10000,
            "itersFixedTemperature": 20,
            "replacementRate": 0.01,
            "boltzmannConstant": 0.001,
            "tag": "SimulatedAnnealing",
        },
    },
}

# Run the design snapshot; the resulting Vpop is saved in `folder_id`
response = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop_generator/{subsampling_core_item_id}/snapshots/{subsampling_snapshot_id}/vpop",
    method="POST",
    json=subsampling_payload,
    options={"name": "Subsampled Vpop", "folder_id": folder_id},
)

# Decode the JSON body once and pick the two fields used by the next steps
subsampling_result = response.json()
subsampled_vpop_core_item_id = subsampling_result["coreItemId"]
fitness = subsampling_result["subsamplingFitness"]
print(f"Subsampled vpop: {jinko.get_project_item_url_from_response(response)}")

## Step 4: Assess the subsampling goodness of fit

In [None]:
# Summarize the goodness of fit returned by the subsampling run: one row per
# target with its score, then the overall weighted score.
print(f"Visualize the goodness of fit here: {subsampling_url}")

# Sections of `fitness` whose entries are read the same way ("id", "arm",
# "score"), mapped to the label shown in the "targetType" column.
# The insertion order of this dict sets the row order of the table.
simple_target_types = {
    "marginals": "marginal",
    "categoricals": "categorical",
    "survivals": "survival",
    "summaryStatistics": "summary statistics",
}
goodness_rows = [
    {"qoi": x["id"], "arm": x["arm"], "score": x["score"], "targetType": target_type}
    for section, target_type in simple_target_types.items()
    for x in fitness[section]
]
# Correlation targets involve two quantities, hence the extra "qoi2"/"arm2"
# columns; "arm" is read with .get() because it may be absent from an entry
goodness_rows += [
    {
        "qoi": x["correlateX"]["id"],
        "arm": x["correlateX"].get("arm"),
        "score": x["score"],
        "targetType": "correlation",
        "qoi2": x["correlateY"]["id"],
        "arm2": x["correlateY"].get("arm"),
    }
    for x in fitness["correlations"]
]
goodness_dataframe = pd.DataFrame(goodness_rows)
display(goodness_dataframe)

# The subscript is hoisted out of the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12
weighted_score = fitness["weightedScore"]
print(f"Weighted score = {weighted_score:.3g}")

## Step 5: Compare the initial and subsampled vpops

In [None]:
# Look up the trial version to find the Vpop it was run on
initial_trial = jinko.make_request(
    path=f"/core/v2/trial_manager/trial/{trial_core_item_id}/snapshots/{trial_snapshot_id}",
    method="GET",
).json()
initial_vpop_core_item_id = initial_trial["vpopId"]["coreItemId"]

# Download the initial Vpop as CSV and load it into a DataFrame
initial_vpop = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop/{initial_vpop_core_item_id}",
    method="GET",
    options={"output_format": "text/csv"},
)
initial_vpop_df = pd.read_csv(io.StringIO(initial_vpop.content.decode("utf-8")))

# Same for the subsampled Vpop
subsampled_vpop = jinko.make_request(
    path=f"/core/v2/vpop_manager/vpop/{subsampled_vpop_core_item_id}",
    method="GET",
    options={"output_format": "text/csv"},
)
subsampled_vpop_df = pd.read_csv(
    io.StringIO(subsampled_vpop.content.decode("utf-8"))
)

In [None]:
# Parameters listed here are plotted after a log10 transform
logParams = ["kClearanceDrug"]

# One subplot per Vpop column, the patient index excluded
colnames = [c for c in initial_vpop_df.columns if c != "patientIndex"]
dimension = len(colnames)
# Subplot titles: wrap at 30 characters and join the pieces with "<br>"
colnames_wrapped = ["<br>".join(textwrap.wrap(t, width=30)) for t in colnames]

# Grid size; it must offer at least one cell per plotted column
num_rows, num_cols = 3, 4
if dimension > num_rows * num_cols:
    raise Exception(
        f"Not enough rows and columns ({num_rows * num_cols = }) to plot {dimension} parameters"
    )

# Number of bins requested for every histogram
nbinsx = 21


def ij_to_k(i, j):
    """Return the flat, row-major index of grid cell (row ``i``, column ``j``).

    The grid width is read from the module-level ``num_cols``.
    """
    flat_index = i * num_cols + j
    return flat_index


# Grid of histograms comparing, for each Vpop column, the initial and the
# subsampled distributions. Each subplot title also receives the statistic
# and p-value of a two-sample Kolmogorov-Smirnov test.
fig = make_subplots(
    rows=num_rows,
    cols=num_cols,
    horizontal_spacing=0.01,
    vertical_spacing=0.1,
    subplot_titles=colnames_wrapped,
)

# Each legend group ("log" / "linear") is shown only for its first subplot
show_legend_log, show_legend_linear = True, True
for i in range(num_rows):
    for j in range(num_cols):
        k = ij_to_k(i, j)
        if k >= dimension:
            # Grid cell without a column to plot
            continue

        col_name = colnames[k]
        use_log_scale = col_name in logParams
        init_x = initial_vpop_df[col_name]
        subsampled_x = subsampled_vpop_df[col_name]
        if use_log_scale:
            init_x = np.log10(init_x)
            subsampled_x = np.log10(subsampled_x)
            legend_group, legendgrouptitle_text = "log", "Log10-transformed"
            marker_color_init, marker_color_subsampled = "#89EBBF", "#73330C"
            showlegend = show_legend_log
        else:
            legend_group, legendgrouptitle_text = "linear", "Linear scale"
            marker_color_init, marker_color_subsampled = "#EB89B5", "#330C73"
            showlegend = show_legend_linear

        # The test is run on the plotted (possibly log-transformed) values
        ksresults = stats.ks_2samp(init_x, subsampled_x)

        # Initial first, subsampled second, both in the same subplot
        for trace_name, trace_x, trace_color in (
            ("Initial", init_x, marker_color_init),
            ("Subsampled", subsampled_x, marker_color_subsampled),
        ):
            fig.add_trace(
                go.Histogram(
                    name=trace_name,
                    x=trace_x,
                    histnorm="probability density",
                    nbinsx=nbinsx,
                    marker_color=trace_color,
                    legendgroup=legend_group,
                    legendgrouptitle_text=legendgrouptitle_text,
                    showlegend=showlegend,
                ),
                row=i + 1,
                col=j + 1,
            )

        # Annotation k is the title of subplot k: append the test results to it
        title_annotation = fig.layout.annotations[k]
        title_annotation.text += (
            f"<br>KS={ksresults.statistic:.2f}, p={ksresults.pvalue:.2e}"
        )

        if use_log_scale:
            show_legend_log = False
            # Ticks hold log10 values, so prefix them with "1e"
            fig.update_xaxes(tickprefix="1e", row=i + 1, col=j + 1)
        else:
            show_legend_linear = False
# Final figure styling and display
figure_width, figure_height = 1200, 1000

fig.update_annotations(font_size=14)
fig.update_layout(
    template="plotly_white",
    width=figure_width,
    height=figure_height,
    bargap=0.15,
    showlegend=True,
    font={"size": 12},
)

# Settings of the "download plot as image" button
config = dict(
    toImageButtonOptions=dict(
        format="png",  # one of png, svg, jpeg, webp
        filename="custom_image",
        height=figure_height,
        width=figure_width,
        scale=6,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)
fig.show(config=config)