# Statistical Analysis on trial results

## Introduction

The goal of this cookbook is to illustrate how to query trial results and run a statistical analysis on them. We'll use logistic regression as the example.

Linked resources: 
- [Folder on jinko](https://jinko.ai/project/e0fbb5bb-8929-439a-bad6-9e12d19d9ae4?labels=98d0ccc1-5c91-4697-886a-bec1cdf8c899).

In [None]:
# Jinko specifics imports & initialization
# Please fold this section and do not change
import jinko_helpers as jinko

# Connect to Jinko (see README.md for more options).
# NOTE(review): presumably has to run before any other `jinko.*` call in this notebook — confirm.
jinko.initialize()

In [None]:
# Cookbook specifics imports

import io
import json
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
)
from sklearn.pipeline import make_pipeline
import zipfile

# Cookbook specifics constants:
# put here the constants that are specific to your cookbook like
# the reference to the Jinko items, the name of the model, etc.

# @param {"name":"trialId", "type": "string"}
# The trial short id can be retrieved from the URL, whose pattern is `https://jinko.ai/<trial_sid>`
trial_sid = "tr-HLRF-b0zW"

# Step 1: Load the trial

### Get the latest completed version

In [None]:
# Resolve the trial short id into its core item id
trial_core_item_id = jinko.get_core_item_id(trial_sid, 1)

# Ask the API for the status of every version of this trial
# https://doc.jinko.ai/api/#/paths/core-v2-trial_manager-trial-trialId--status/get
status_route = f'/core/v2/trial_manager/trial/{trial_core_item_id["id"]}/status'
response = jinko.make_request(status_route)
versions = response.json()

# Pick the first version whose status is "completed".
# NOTE(review): calling it the "latest" assumes the API lists versions newest first — confirm.
try:
    completed_versions = (entry for entry in versions if entry["status"] == "completed")
    latest_completed_version = next(completed_versions, None)
    if latest_completed_version is None:
        raise Exception("No completed trial version found")

    print(
        "Successfully fetched this simulation:\n",
        json.dumps(latest_completed_version, indent=1),
    )
    # From here on, the ids are the ones stored under "simulationId" for that version
    simulation_id = latest_completed_version["simulationId"]
    trial_core_item_id = simulation_id["coreItemId"]
    trial_snapshot_id = simulation_id["snapshotId"]
except Exception as e:
    # Report the problem in the cell output, then let the error propagate
    print(f"Error processing trial versions: {e}")
    raise

# Step 2: Get and post-process the trial results

### Display a summary of simulated data

In [None]:
# Fetch the summary of the trial scalars (requested with print_summary=True)
response_summary = jinko.get_trial_scalars_summary(
    trial_core_item_id, trial_snapshot_id, print_summary=True
)

# Keep the ids of the scalar and categorical entries labelled "ScenarioOverride"
summary_entries = response_summary["scalars"] + response_summary["categoricals"]
scenario_descriptors = [
    entry["id"]
    for entry in summary_entries
    if "ScenarioOverride" in entry["type"]["labels"]
]
print("List of scenario overrides:\n", scenario_descriptors, "\n")

### Retrieve scalar results

In [None]:
# Ids of the scalars to download: the regression features used further down
# plus the tumor burden change that defines the response
scalar_ids_to_fetch = [
    "Blood.Drug.max",
    "initialTumorBurden.tmin",
    "bloodFlowRate.tmin",
    "initialCountCancerCells.tmin",
    "lymphDrainingRate.tmin",
    "vmaxCancerCellDeath.tmin",
    "tumorBurdenChangeFromBaseline.tend",
    "lymphaticFlowRate.tmin",
]

df_scalars = jinko.get_trial_scalars_as_dataframe(
    trial_core_item_id, trial_snapshot_id, scalar_ids=scalar_ids_to_fetch
)

In [None]:
# Quick look at the table as returned by the API (one row per scalar value)
print("Raw scalar data (first rows):\n")
display(df_scalars.head())
print("\nNumber of patients in the original table:", len(df_scalars))

# Restrict the table to the cross arm baseline values and to the IV 10mg dose arm
kept_arms = ["crossArms", "iv-1-10"]
df_scalars = df_scalars[df_scalars["armId"].isin(kept_arms)]
print("\nNumber of patients in the table after filtering:", len(df_scalars))

# Reshape to a wide table: one row per patient, one column per scalar
df_scalars = df_scalars.drop(columns="armId").pivot(
    index="patientId", columns="scalarId", values="value"
)

# Check the result
print("\nPivoted scalar table (first rows):")
display(df_scalars.head())

# A patient is flagged as responder when the tumor burden change is -95 or lower
df_scalars["responder"] = df_scalars["tumorBurdenChangeFromBaseline.tend"] <= -95
print("\nResponse variable (first rows):")
display(df_scalars[["tumorBurdenChangeFromBaseline.tend", "responder"]].head())

# Flag every patient that has at least one missing value
nan_rows = df_scalars.isna().any(axis=1)
id_to_remove = nan_rows[nan_rows].index.values
print(
    "\n",
    len(id_to_remove),
    "patient(s) containing NaN values in the table will be removed:",
)
display(df_scalars[nan_rows])

# Keep only the complete rows
df_scalars = df_scalars.loc[~nan_rows]

# Step 3 : Logistic regression


In [None]:
# Columns used as predictors of the response
featureCols = [
    "Blood.Drug.max",
    "initialTumorBurden.tmin",
    "bloodFlowRate.tmin",
    "initialCountCancerCells.tmin",
    "lymphDrainingRate.tmin",
    "vmaxCancerCellDeath.tmin",
    "lymphaticFlowRate.tmin",
]

X = df_scalars[featureCols]  # Features
y = df_scalars["responder"]  # Target variable

# Show how many patients fall in each class
counts = y.value_counts()
print("Number of samples in each class:")
for label, count in counts.items():
    print(f"{label}: {count}")

# Hold out 30% of the patients for testing; the seed makes the split reproducible.
# NOTE(review): the split is not stratified, so class proportions may differ between
# the two sets — consider `stratify=y` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=16
)

# Single-step pipeline holding the logistic regression.
# NOTE(review): there is no scaling step although the features are raw model scalars —
# consider adding one if the solver reports convergence warnings.
classifier = LogisticRegression(random_state=16)
model = make_pipeline(classifier)

# Train on the training set, then predict the classes of the held-out patients
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the confusion matrix and the per-class metrics
confusion_matrix_result = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:")
print(confusion_matrix_result)
print("\nClassification report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=["Non-responder", "Responder"],
    zero_division=0,
)
print(report)

# ROC curve, built from the second column of predict_proba
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
auc = roc_auc_score(y_test, y_pred_prob)

roc_trace = go.Scatter(x=fpr, y=tpr, mode="lines", name=f"data 1, auc = {auc:.3f}")
fig = go.Figure(data=[roc_trace])
fig.update_layout(
    title="ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    xaxis={"range": [0, 1]},
    yaxis={"range": [0, 1]},
    margin={"b": 50, "t": 50, "l": 50, "r": 50},
    width=500,
    height=500,
)
# Print the AUC in the bottom-right corner of the plotting area
fig.add_annotation(
    text=f"AUC = {auc:.3f}",
    x=1,
    y=0,
    xref="paper",
    yref="paper",
    showarrow=False,
    font={"size": 14},
)
fig.show()