# Example for custom split and forecasting events

This notebook demonstrates how to adjust the splitters to split at custom events, as well as how to forecast different event categories (rather than the default, labs).

In [None]:
import pandas as pd

from twinweaver import (
    DataManager,
    Config,
    DataSplitterForecasting,
    DataSplitterEvents,
    ConverterInstruction,
    DataSplitter,
)

## Basic Setup


### Load Data

In [None]:
# Load data - generated example data
# Directory holding the example CSV files, relative to this notebook.
# Change it here (once) if the example data lives somewhere else.
EXAMPLE_DATA_DIR = "../../example_data"

df_events = pd.read_csv(f"{EXAMPLE_DATA_DIR}/events.csv")
df_constant = pd.read_csv(f"{EXAMPLE_DATA_DIR}/constant.csv")
df_constant_description = pd.read_csv(f"{EXAMPLE_DATA_DIR}/constant_description.csv")

### Configuration and Data Manager

In [None]:
config = Config()  # Override values on this object to customize the pipeline
config.constant_columns_to_use = [
    "birthyear",
    "gender",
    "histology",
    "smoking_history",
]  # Columns of the constant dataframe to use, chosen manually
config.constant_birthdate_column = "birthyear"

# <---------------------- IMPORTANT PARTS ---------------------------->


# To set up the custom split events, we set the event category to split at in the config
# In this example, we use genetic events as custom split events
# NOTE(review): the comment above says "genetic events" while the category is "basic_biomarker" —
# confirm that this category holds the genetic events in the example data
config.split_event_category = "basic_biomarker"


# And to forecast different categories, we set this in the config as well
# In this example, let's say we want to forecast vitals (i.e. body weight in the example data)
config.event_category_forecast = ["vitals"]

In [None]:
# Build the data manager from the config and hand it the three dataframes loaded above
dm = DataManager(config=config)
dm.load_indication_data(
    df_events=df_events,
    df_constant=df_constant,
    df_constant_description=df_constant_description,
)

# Preparation calls on the data manager, run in this order before the splitters are created
dm.process_indication_data()
dm.setup_unique_mapping_of_events()
dm.setup_dataset_splits()
dm.infer_var_types()

### Initialize Splitters and Converter

In [None]:
# Splitter for the event prediction tasks; it is built on the data manager and the same config
data_splitter_events = DataSplitterEvents(dm, config=config)
data_splitter_events.setup_variables()

# Splitter for the forecasting tasks, built on the same data manager and config
data_splitter_forecasting = DataSplitterForecasting(
    data_manager=dm,
    config=config,
)
# If you don't want to do forecasting QA, proportional sampling, or 3-sigma filtering, you can skip this step
# (note that the converter below reads `variable_stats` from this splitter)
data_splitter_forecasting.setup_statistics()

# We will also use the easier interface that combines both data splitters
data_splitter = DataSplitter(data_splitter_events, data_splitter_forecasting)

# Set up the instruction converter, which turns splits into text and back (used in the cells below)
converter = ConverterInstruction(
    nr_tokens_budget_total=8192,  # Total token budget handed to the converter
    config=config,
    dm=dm,
    variable_stats=data_splitter_forecasting.variable_stats,  # Optional, needed for forecasting QA tasks
)

## Examine patient data

From the data manager we can retrieve the data of a single patient, for example the patient with the ID selected below.

In [None]:
# Pick one example patient: the ID at position 4 of the data manager's list of patient IDs
# (the index is just an example choice; any valid position works)
patientid = dm.all_patientids[4]
# Fetch that patient's data from the data manager; it is the input for the splitters below
patient_data = dm.get_patient_data(patientid)

## Convert patient data to string

### Generate Training Splits

In [None]:
# Generate the splits for this patient; the call returns three values that are unpacked here
(
    forecasting_splits,
    events_splits,
    reference_dates,
) = data_splitter.get_splits_from_patient_with_target(patient_data)

Now for each split, we can generate these strings. We just pick the first one as an example.

In [None]:
# Index of the split to convert; we use the first one as an example
split_idx = 0
# Convert the selected forecasting split; the result is indexed by "instruction" and "answer" below
p_converted = converter.forward_conversion(
    forecasting_splits=forecasting_splits[split_idx],
    event_splits=[],  # For this example we only do forecasting, so we skip the event splits
    override_mode_to_select_forecasting="both",
)

In [None]:
# Show the forecasting split that was converted above (same index, so both stay in sync)
forecasting_splits[split_idx]

### Inspect the Output

In [None]:
# Print the "instruction" entry of the converted split
print(p_converted["instruction"])

In [None]:
# Print the "answer" entry of the converted split
print(p_converted["answer"])

In [None]:
# Show the same "answer" entry as the cell's output value (its repr) instead of printed text
p_converted["answer"]

## Reverse Conversion: Text to Structured Data

In [None]:
# Use the reference date of the split that was converted above, so the date stays in sync
# with the converted answer if split_idx is changed.
# NOTE(review): assumes reference_dates["date"] lines up position-by-position with forecasting_splits — confirm
date = reference_dates["date"][split_idx]
# Convert the answer text back into structured data, using the data manager and the reference date
return_list = converter.reverse_conversion(p_converted["answer"], dm, date)
# Show the "result" entry of the first returned item
return_list[0]["result"]