# Visualizing Kaplan-Meier curves (survival analysis)

## Introduction

This cookbook will guide you through the creation of a simple visualization of Kaplan-Meier curves from an existing trial in Jinko.  
In particular, you will be able to retrieve scalar results (time-to-event values) and plot them using matplotlib.  


Linked resources: [Jinko](https://jinko.ai/project/e0fbb5bb-8929-439a-bad6-9e12d19d9ae4?labels=24574ece-6bde-4d76-896a-187426965a51).

In [None]:
# Jinko-specific imports & initialization
# Please fold this section and do not change it
import jinko_helpers as jinko

# This function ensures that authentication is correct.
# It is also possible to override the base url by passing baseUrl=...
# If everything is set up correctly, it should print "Authentication successful"
jinko.initialize()

In [None]:
# Cookbook specifics imports

import io
import pandas as pd
import zipfile
from typing import List, Any
from sksurv.nonparametric import kaplan_meier_estimator
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt

## Fill in the information about what you want to see

In [None]:
# Cookbook-specific constants

# Short Id of the Trial whose results you want to plot
trialId = "tr-gLnd-8yYx"

# Ids of the biomarkers (scalar results) you want to retrieve
biomarkersId = [
    "timeToClinicalProgression-at-P2Y",
]

# Custom display names for your arms: arm id -> label shown in the legend
custom_labels = dict(
    ArmA="Treated",
    ArmB="Treated_DoubleDose",
)

# Set to True if you want to compare the simulation results with your data
compare_with_data = True

## Let's use the API and plot the data

### Load your data
If you have data, you can load it here. If you don't, you can skip this cell. Below is a dummy example of what the data should look like for this script.

In [None]:
# Follow-up time of each patient, i.e. the time at which the patient is
# either censored or encounters the event (one entry per patient)
durationDataArmA = [
    20, 40, 80, 120, 120, 160,
    180, 300, 380, 500, 600,
]
durationDataArmB = [
    20, 40, 100, 100, 140, 160,
    180, 220, 300, 390, 500, 600,
]


# Status of each patient, aligned with the durations above:
# True = the event occurred, False = the patient is censored.
# All patients that did not encounter the event are censored.
statusDataArmA = [
    True,
    True,
    True,
    False,
    True,
    True,
    False,
    True,
    True,
    True,
    False,
]
statusDataArmB = [
    True, True, True, False,
    True, True, True, False,
    True, True, True, False,
]

### Visualization of your trial (with data if any)

In [None]:
# Retrieve the results of your trial from jinko
## Resolve the Trial short Id into its core item Id
coreItemId = jinko.get_core_item_id(trialId, 1)

## Fetch the status of every version of the Trial
status_url = f"/core/v2/trial_manager/trial/{coreItemId['id']}/status"
versions: List[Any] = jinko.make_request(status_url).json()

## Keep the first version of the list whose status is "completed"
## (presumably the API lists the most recent version first - to be confirmed)
latestCompletedVersion = None
for version in versions:
    if version["status"] == "completed":
        latestCompletedVersion = version
        break
if latestCompletedVersion is None:
    raise Exception("No completed Trial version found")

## Download the requested scalars of that version as a DataFrame
simulationId = latestCompletedVersion["simulationId"]
dfBiomarkers_raw = jinko.get_trial_scalars_as_dataframe(
    simulationId["coreItemId"],
    simulationId["snapshotId"],
    scalar_ids=biomarkersId,
)
# Show the first 5 rows of dfBiomarkers_raw as a quick sanity check
print("The first rows of your data looks like: ", dfBiomarkers_raw.head(5))

In [None]:
# Format your trial results
## Work on a copy so that this cell never modifies dfBiomarkers_raw in place
df = dfBiomarkers_raw.copy()
## Keep only the part of `armId` located before the first "_" (e.g., ArmA and ArmB)
df["armId"] = df["armId"].map(lambda x: x.split("_")[0])
# Split the trial results by arm
grouped = df.groupby("armId")

# Create Kaplan-Meier curves for each arm
plt.figure(figsize=(10, 6))

for arm, group in grouped:
    # Adapt the results for Kaplan-Meier estimator
    time = group["value"].values
    # Patients whose value equals the largest value of the arm are flagged as
    # censored (False); every other patient is flagged as an event (True).
    # NOTE(review): presumably the largest value corresponds to the end of the
    # simulated period, i.e. the event never occurred - confirm with your model.
    max_time = max(time)
    event_occurred = time != max_time

    # Calculate Kaplan-Meier estimates (with log-log confidence intervals)
    timeSimulation, survivalSimulation, conf_intSimulation = kaplan_meier_estimator(
        event_occurred, time, conf_type="log-log"
    )

    # Plot the survival curves with custom labels
    # (falls back to the arm id when no custom label is defined)
    plt.step(
        timeSimulation,
        survivalSimulation,
        where="post",
        label=custom_labels.get(arm, arm),
    )
    # Shade the confidence interval around the curve
    plt.fill_between(
        timeSimulation,
        conf_intSimulation[0, :],
        conf_intSimulation[1, :],
        alpha=0.25,
        step="post",
    )


## Plot the extracted data points, only if compare_with_data is True

if compare_with_data:
    timeDataArmA, survivalDataArmA = kaplan_meier_estimator(
        statusDataArmA, durationDataArmA
    )
    timeDataArmB, survivalDataArmB = kaplan_meier_estimator(
        statusDataArmB, durationDataArmB
    )
    # Same label fallback as for the simulated curves: use the arm id when no
    # custom label is defined instead of failing with a KeyError
    plt.plot(
        timeDataArmA,
        survivalDataArmA,
        linestyle="--",
        marker="o",
        color="blue",
        markersize=1,
        label="Extracted Data for " + str(custom_labels.get("ArmA", "ArmA")),
    )
    plt.plot(
        timeDataArmB,
        survivalDataArmB,
        linestyle="--",
        marker="o",
        color="orange",
        markersize=1,
        label="Extracted Data for " + str(custom_labels.get("ArmB", "ArmB")),
    )

# Customize plot
plt.ylabel("Survival probability")
plt.xlabel("Time (months)")
plt.title("Kaplan-Meier Curves by Treatment Arm with Extracted Data")
plt.legend(title="Treatment Arm")
plt.grid(True)
plt.show()

### Compare with data
Compare the results of the simulation with your data using a log-rank test. The example below uses ArmA.

In [None]:
# For ArmA
## Format trial results
# The groups are keyed by arm id ("ArmA"), not by the custom display label,
# so the id is used for the lookup and the label only for printing/plotting.
arm_id = "ArmA"
arm = custom_labels.get(arm_id, arm_id)
group = grouped.get_group(arm_id)
time = group["value"].values
# Patients whose value equals the largest value of the arm are flagged as
# censored (False); every other patient is flagged as an event (True).
max_time = max(time)
event_occurred = time != max_time

# Kaplan-Meier comparison
## Calculate Kaplan-Meier estimates for the simulation (with log-log
## confidence intervals) and for the data
timeSimulation, survivalSimulation, conf_int = kaplan_meier_estimator(
    event_occurred, time, conf_type="log-log"
)
timeSimulationData, survivalSimulationData = kaplan_meier_estimator(
    statusDataArmA, durationDataArmA
)
## Compare with data: log-rank test between simulated and observed durations
results = logrank_test(
    time,
    durationDataArmA,
    event_observed_A=event_occurred,
    event_observed_B=statusDataArmA,
)
print(
    "The logrank test for the arm ",
    arm,
    " compare with data, is: p_value = ",
    results.p_value,
)

# Plot both curves
plt.figure(figsize=(10, 6))
plt.step(timeSimulation, survivalSimulation, where="post", label=arm)
# Shade the confidence interval around the simulated curve
plt.fill_between(
    timeSimulation, conf_int[0, :], conf_int[1, :], alpha=0.25, step="post"
)
plt.plot(
    timeSimulationData,
    survivalSimulationData,
    linestyle="--",
    marker="o",
    color="blue",
    markersize=1,
    label="Extracted Data for " + str(arm),
)
# Customize plot
plt.ylabel("Survival probability")
plt.xlabel("Time (months)")
plt.title("Kaplan-Meier Curves by Treatment Arm with Extracted Data")
plt.legend(title="Treatment Arm")
plt.grid(True)
plt.show()