# Flat Representations Tutorial

In [1]:
import sys

# Put the parent directory on the import path (presumably so the `EventStream`
# package imported in the next cell resolves when running from this folder).
# Guarded so that re-running this cell does not keep appending duplicates.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
# Imports
import os
from pathlib import Path
import polars as pl, numpy as np
import polars.selectors as cs

from EventStream.data.dataset_polars import Dataset
from EventStream.evaluation.FT_task_baseline import load_flat_rep, fit_baseline_task_model

from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the dataset saved under `processed/sample`, resolved relative to the
# directory the notebook kernel is running in.
sample_dataset_dir = Path.cwd() / "processed/sample"
ESD = Dataset.load(sample_dataset_dir)

In [4]:
# Options forwarded verbatim to `ESD.cache_flat_representation`.
# NOTE(review): judging by the names, `None` for the first two presumably means
# "no per-file subject chunking" / "no feature frequency filter", and the flags
# ask for an incremental update without overwriting — confirm against the
# method's documentation.
flat_rep_cache_kwargs = {
    "subjects_per_output_file": None,
    "feature_inclusion_frequency": None,
    "do_overwrite": False,
    "do_update": True,
}
ESD.cache_flat_representation(**flat_rep_cache_kwargs)

Flattening Splits:   0%|          | 0/3 [00:00<?, ?it/s]

Subject chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Subject chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Subject chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the flat representations for the three window sizes, then materialise
# and show the first rows of the "train" entry.
flat_reps = load_flat_rep(ESD, window_sizes=["7d", "30d", "FULL"])
train_preview = flat_reps["train"].head().collect()
display(train_preview)

In [None]:
# Build a synthetic task dataframe for the tutorial:
#   * `end_time`   - one randomly sampled event timestamp per subject,
#   * `label`      - a random class drawn from {0, 1, 2},
#   * `start_time` - a null Datetime column.
# Everything random is seeded, and group order is pinned with
# `maintain_order=True`, so a Restart & Run All reproduces the same task.
task_seed = 1
task_rng = np.random.default_rng(task_seed)

task_df = (
    ESD.events_df
    .groupby('subject_id', maintain_order=True)
    .agg(pl.col('timestamp').sample(seed=task_seed).first().alias('end_time'))
    .with_columns(
        # NOTE(review): the label array is sized by `ESD.subject_ids`, which
        # assumes every listed subject has at least one row in `events_df`
        # (one group per subject) — verify, otherwise the lengths will not match.
        pl.lit(task_rng.choice([0, 1, 2], size=len(ESD.subject_ids))).alias('label'),
        pl.lit(None, dtype=pl.Datetime).alias('start_time'),
    )
).lazy()

In [None]:
# Keyword configuration for the baseline fit, gathered in one place so the
# call itself stays short. Values are passed through unchanged.
baseline_fit_kwargs = dict(
    n_samples=3,
    model_cls=RandomForestClassifier,
    model_param_distributions={"min_samples_split": [2, 10, 100]},
    verbose=20,
    hyperparameter_search_budget=2,
    error_score="raise",
    window_size_options=["7d", "30d", "FULL"],
    seed=1,
)

# Fit the baseline on the synthetic task, predicting the "label" column.
out = fit_baseline_task_model(task_df, "label", ESD, **baseline_fit_kwargs)

In [None]:
# Display the `best_params_` attribute of the object returned by
# `fit_baseline_task_model`.
# NOTE(review): the attribute name follows scikit-learn's search-CV convention,
# so this is presumably the best hyperparameter setting found — confirm.
out.best_params_

In [None]:
# Display the `best_estimator_` attribute of the object returned by
# `fit_baseline_task_model`.
# NOTE(review): by scikit-learn's search-CV convention this is presumably the
# model refit with the best hyperparameters — confirm.
out.best_estimator_