# Sensitivity analysis on trial results (responder identification)

## Introduction

The purpose of this cookbook is to demonstrate how one can perform a sensitivity analysis on simulation outputs, in order to identify the patient descriptors that have the highest impact on a given response variable.

In [None]:
# Jinko specifics imports & initialization
# Please fold this section and do not change
import jinko_helpers as jinko

# Connect to Jinko (see README.md for more options)
# The cells below call `jinko.getCoreItemId` and `jinko.makeRequest`;
# presumably they rely on this connection, so run this cell first.
jinko.initialize()

In [None]:
# Cookbook specific imports
import io
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, lasso_path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import zipfile

# Cookbook specific constants:
# Put here the constants that are specific to your cookbook like
# the reference to the Jinko items, the name of the model, etc.

# @param {"name":"trialId", "type": "string"}
# The trial's short id can be retrieved from the URL; the pattern is `https://jinko.ai/<trial_sid>`
trial_sid = "tr-9Bid-BL1I"

# Step 1: Loading the trial

### Get the latest completed version

In [None]:
# Resolve the trial short id into its core item id
trial_core_item_id = jinko.getCoreItemId(trial_sid, 1)

# Fetch the status of every version of the trial
# https://doc.jinko.ai/api/#/paths/core-v2-trial_manager-trial-trialId--status/get
response = jinko.makeRequest(
    f'/core/v2/trial_manager/trial/{trial_core_item_id["id"]}/status'
)
versions = response.json()

# Keep the first listed version whose status is "completed".
# NOTE(review): treating it as the latest one assumes the API lists versions
# newest first - confirm against the API documentation.
try:
    latest_completed_version = next(
        (version for version in versions if version["status"] == "completed"),
        None,
    )
    if latest_completed_version is None:
        raise Exception("No completed trial version found")

    print(
        "Successfully fetched this simulation:\n",
        json.dumps(latest_completed_version, indent=1),
    )

    # From here on `trial_core_item_id` is rebound to the simulation's
    # `coreItemId` value (it no longer holds the lookup result from above).
    simulation_id = latest_completed_version["simulationId"]
    trial_core_item_id = simulation_id["coreItemId"]
    trial_snapshot_id = simulation_id["snapshotId"]
except Exception as error:
    # Report the problem in the cell output, then let the error propagate
    print(f"Error processing trial versions: {error}")
    raise

# Step 2 : Getting and post-processing the trial results

### Displaying a summary of imported simulated data

In [None]:
# Download the results summary of the selected trial snapshot
# https://doc.jinko.ai/api/#/paths/core-v2-trial_manager-trial-trialId--snapshots--trialIdSnapshot--results_summary/get
response = jinko.makeRequest(
    f"/core/v2/trial_manager/trial/{trial_core_item_id}/snapshots/{trial_snapshot_id}/results_summary",
    method="GET",
)
response_summary = json.loads(response.content)

# Overview of what the summary contains
print("Keys in the results summary:\n", list(response_summary.keys()), "\n")
print("Available patients:\n", response_summary["patients"], "\n")
print("Available arms:\n", response_summary["arms"], "\n")

# Each of these sections is a list of entries; only their ids are printed
for section_title, section_key in [
    ("Available scalars:\n", "scalars"),
    ("Available cross-arm scalars:\n", "scalarsCrossArm"),
    ("Available categorical scalars:\n", "categoricals"),
    ("Available cross-arm categorical scalars:\n", "categoricalsCrossArm"),
]:
    print(
        section_title,
        [entry["id"] for entry in response_summary[section_key]],
        "\n",
    )

# Collect the ids of the (numerical and categorical) scalars whose type
# labels include "ScenarioOverride"
scenario_descriptors = [
    entry["id"]
    for entry in (response_summary["scalars"] + response_summary["categoricals"])
    if "ScenarioOverride" in entry["type"]["labels"]
]
print("List of scenario overrides:\n", scenario_descriptors, "\n")

### Retrieving scalar results

In [None]:
# Request the selected scalar results of the simulation; the response body
# is read below as a zip archive containing CSV text.
# https://doc.jinko.ai/api/#/paths/core-v2-result_manager-scalars_summary/post
response = jinko.makeRequest(
    path="/core/v2/result_manager/scalars_summary",
    method="POST",
    json={
        "select": [
            "Blood.Drug.max",
            "bloodFlowRate.tmin",
            "initialCountCancerCells.tmin",
            "initialTumorBurden.tmin",
            "kccCancerCell.tmin",
            "lymphaticFlowRate.tmin",
            "lymphDrainingRate.tmin",
            "tumorBurdenChangeFromBaseline.tend",
            "Tumor.CancerCell.tmin",
            "vmaxCancerCellDeath.tmin",
        ],
        "trialId": latest_completed_version["simulationId"],
    },
)

# Only the first member of the archive is read. The context manager makes
# sure the archive handle is closed once the CSV text has been extracted.
with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
    filename = archive.namelist()[0]
    csv_scalars = archive.read(filename).decode("utf-8")

In [None]:
# Loading scalars into a dataframe; the code below relies on the columns
# `patientId`, `armId`, `scalarId` and `value`.
df_scalars = pd.read_csv(io.StringIO(csv_scalars))
print("Raw scalar data (first rows):\n")
display(df_scalars.head())
# NOTE(review): the table is not pivoted yet, so `len` presumably counts
# rows rather than distinct patients - confirm the wording of the messages.
print("\nNumber of patients in the original table:", len(df_scalars))

# Filtering patients (keeping only cross arm baseline values and IV 10mg dose)
df_scalars = df_scalars[df_scalars["armId"].isin(["crossArms", "iv-1-10"])]
print("\nNumber of patients in the table after filtering:", len(df_scalars))

# Pivoting to a wide format: one row per patient, one column per scalar.
# `pivot` raises if a (patientId, scalarId) pair appears more than once.
df_scalars = df_scalars.drop(columns="armId").pivot(
    index="patientId", columns="scalarId", values="value"
)

# Checking the result
print("\nPivoted scalar table (first rows):")
display(df_scalars.head())

# Creating a column for the response to the treatment (binary outcome).
# A patient is a responder when the tumor burden change from baseline is at
# or below the threshold (presumably expressed in percent - TODO confirm).
# The comparison is vectorised; NaN values compare as False and the
# corresponding patients are removed further down anyway.
response_threshold = -95
df_scalars["responder"] = (
    df_scalars["tumorBurdenChangeFromBaseline.tend"] <= response_threshold
)
print("\nResponse variable (first rows):")
display(df_scalars[["tumorBurdenChangeFromBaseline.tend", "responder"]].head())

# Checking if there are NaN values in the table
nan_rows = df_scalars.isna().any(axis=1)  # True for patients with any NaN
id_to_remove = nan_rows[nan_rows].index.values
print(
    "\n",
    len(id_to_remove),
    "patient(s) containing NaN values in the table will be removed:",
)
display(df_scalars[nan_rows])

# Removing corresponding row(s)
df_scalars = df_scalars.drop(index=id_to_remove)

# Step 3 : Defining explanatory and response variables
## 3.1 : Selecting variables of interest

In [None]:
# Explanatory variables; the order of this list is the column order of `X`
feature_cols = [
    "Blood.Drug.max",
    "bloodFlowRate.tmin",
    "initialCountCancerCells.tmin",
    "initialTumorBurden.tmin",
    "kccCancerCell.tmin",
    "lymphaticFlowRate.tmin",
    "lymphDrainingRate.tmin",
    "Tumor.CancerCell.tmin",
    "vmaxCancerCellDeath.tmin",
]

# Feature matrix
X = df_scalars.loc[:, feature_cols]

# Continuous target: tumor burden change from baseline at the end time
y_num = df_scalars.loc[:, "tumorBurdenChangeFromBaseline.tend"]

# Binary target: responder flag
y_bin = df_scalars.loc[:, "responder"]

## 3.2 : Splitting data into training and testing sets

In [None]:
# Class balance of the binary outcome
counts = df_scalars["responder"].value_counts()
print("Number of samples in each class:")
for label, count in zip(counts.index, counts):
    print(f"{label}: {count}")

# Hold out 30% of the patients for testing; the fixed seed keeps the split
# reproducible. Features and both targets are split with the same row
# partition.
split_parts = train_test_split(X, y_num, y_bin, test_size=0.3, random_state=16)
(
    X_train,
    X_test,
    y_num_train,
    y_num_test,
    y_bin_train,
    y_bin_test,
) = split_parts

# Step 4 : Applying feature selection approaches
### Penalized regression (LASSO)

In [None]:
# Pipeline: standardize the features, then fit a Lasso whose penalty (alpha)
# is chosen by 5-fold cross-validation
scaler_step = ("scaler", StandardScaler())
lasso_cv_step = ("lasso_cv", LassoCV(cv=5, random_state=0))
pipeline = Pipeline([scaler_step, lasso_cv_step])

# Fit on the training data only
pipeline.fit(X_train, y_num_train)
fitted_scaler = pipeline.named_steps["scaler"]
fitted_lasso_cv = pipeline.named_steps["lasso_cv"]

# Training features as seen by the Lasso step (after standardization)
X_train_scaled = fitted_scaler.transform(X_train)

# Penalty selected by the cross-validation
best_alpha = fitted_lasso_cv.alpha_
print(f"Best alpha: {best_alpha}")

# Predictions of the fitted pipeline on the training data
y_train_pred = pipeline.predict(X_train)

# Coefficient paths over the alpha grid explored by LassoCV, computed on the
# same standardized features
alphas_lasso, coefs, _ = lasso_path(
    X_train_scaled,
    y_num_train,
    alphas=fitted_lasso_cv.alphas_,
)

# One line per feature: coefficient value as a function of alpha
fig = go.Figure()
for feature_name, coef_path in zip(X.columns, coefs):
    path_trace = go.Scatter(
        x=alphas_lasso,
        y=coef_path,
        mode="lines",
        name=feature_name,
        hoverinfo="name+x+y",
    )
    fig.add_trace(path_trace)

# Dashed vertical marker at the selected alpha, spanning the coefficient range
best_alpha_marker = go.Scatter(
    x=[best_alpha, best_alpha],
    y=[coefs.min(), coefs.max()],
    mode="lines",
    line={"color": "black", "dash": "dash"},
    name="Best Alpha",
)
fig.add_trace(best_alpha_marker)

# Log-scaled, reversed x axis
fig.update_layout(
    title="Lasso coefficients as a function of alpha",
    xaxis_title="Alpha",
    yaxis_title="Coefficients",
    xaxis={"type": "log", "autorange": "reversed"},
    template="plotly_white",
)
fig.show()

# Refit a plain Lasso at the selected alpha on the standardized training data
lasso_final = Lasso(alpha=best_alpha)
lasso_final.fit(X_train_scaled, y_num_train)

# Coefficients labelled with the feature names
lasso_coefficients = pd.Series(lasso_final.coef_, index=X_train.columns)
print("Lasso coefficients:")
display(lasso_coefficients)

# Features kept by the Lasso are those with a non-zero coefficient
is_selected = lasso_coefficients != 0
selected_features = lasso_coefficients[is_selected]
print("\nSelected features only:")
display(selected_features)

### Random forest

In [None]:
# Define the pipeline
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),  # Step 1: Standardize the data
        (
            "rf",
            RandomForestRegressor(n_estimators=100, random_state=0),
        ),  # Step 2: Train Random Forest
    ]
)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_num_train)

# Percentage of explained variance (R^2) on the training data
explained_variance = pipeline.score(X_train, y_num_train) * 100
print(f"Percentage of explained variance (R^2): {explained_variance:.2f}%")

# The training score alone is optimistic because the forest has already seen
# these patients. Also report R^2 on the held-out test split, which was
# otherwise never used, so that over-fitting becomes visible.
explained_variance_test = pipeline.score(X_test, y_num_test) * 100
print(
    "Percentage of explained variance on the held-out test set (R^2): "
    f"{explained_variance_test:.2f}%"
)

# Extracting feature importances (one value per column of X_train)
rf_importances = pipeline.named_steps["rf"].feature_importances_

# Storing feature importances in a DataFrame
feature_importances = pd.DataFrame(
    {"Features": X_train.columns, "Importance": rf_importances}
)

# Sort by importance (ascending, so the most important feature ends up at
# the top of the horizontal bar chart)
feature_importances = feature_importances.sort_values(by="Importance", ascending=True)

# Display the feature importances using Plotly
fig = px.bar(
    feature_importances,
    x="Importance",
    y="Features",
    orientation="h",
    title="Explanatory variables ranked by importance",
)
fig.show()