# Data Exploration: Time Spans, Splits, and Interpolation

This notebook explores:
1. Daily and hourly data loading
2. Time spans for each sensor before and after train/test/val split
3. Outlier detection examples
4. Data interpolation process (before and after)

## Setup and Imports

In [38]:
import sys
sys.path.insert(0, '..')

from src.energy_forecast.config import (PROCESSED_DATA_DIR, INTERIM_DATA_DIR, META_DIR, REPORTS_DIR)
from src.energy_forecast.dataset import Dataset, InterpolatedDataset, TrainDatasetBuilding
from src.energy_forecast.utils.train_test_val_split import get_train_test_val_split

import polars as pl
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Load Processed Daily and Hourly Data

In [39]:
# Load interpolated and feature-engineered datasets
dataset_hourly_csv = PROCESSED_DATA_DIR / "dataset_interpolate_hourly_feat.csv"
df_hourly = pl.read_csv(dataset_hourly_csv)
print(f"Hourly data shape: {df_hourly.shape}")
print(f"Unique sensors (hourly): {df_hourly['id'].n_unique()}")

dataset_daily_csv = PROCESSED_DATA_DIR / "dataset_interpolate_daily_feat.csv"
df_daily = pl.read_csv(dataset_daily_csv)
print(f"\nDaily data shape: {df_daily.shape}")
print(f"Unique sensors (daily): {df_daily['id'].n_unique()}")

Hourly data shape: (834313, 27)
Unique sensors (hourly): 1125

Daily data shape: (103999, 29)
Unique sensors (daily): 361


## Time Spans for Each Sensor (Before Split)

Visualize the temporal coverage of each sensor in the dataset before any train/test/val splitting.

In [40]:
# Hourly time spans
df_min_max_dates_hourly = df_hourly.group_by(["id"]).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     pl.len().alias("num_records")
     ]
)

fig = go.Figure()
for (start, end, sensor_id, num_records) in zip(
    df_min_max_dates_hourly["min_date"], 
    df_min_max_dates_hourly["max_date"],
    df_min_max_dates_hourly["id"], 
    df_min_max_dates_hourly["num_records"]
):
    fig.add_trace(go.Scatter(
        x=[start, end], 
        y=[sensor_id, sensor_id],
        mode='lines',
        name=f"{num_records} records",
        hovertemplate=f"<b>{sensor_id}</b><br>Start: {start}<br>End: {end}<br>Records: {num_records}"
    ))

fig.update_layout(
    title="Hourly Data: Time Spans per Sensor (Before Split)",
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Sensor ID",
    showlegend=False,
    template="plotly",
    height=800
)
fig.update_yaxes(visible=True, showticklabels=False)
fig.show()

In [41]:
# Daily time spans
df_min_max_dates_daily = df_daily.group_by(["id"]).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     pl.len().alias("num_records")
     ]
)

fig = go.Figure()
for (start, end, sensor_id, num_records) in zip(
    df_min_max_dates_daily["min_date"], 
    df_min_max_dates_daily["max_date"],
    df_min_max_dates_daily["id"], 
    df_min_max_dates_daily["num_records"]
):
    fig.add_trace(go.Scatter(
        x=[start, end], 
        y=[sensor_id, sensor_id],
        mode='lines',
        name=f"{num_records} records",
        hovertemplate=f"<b>{sensor_id}</b><br>Start: {start}<br>End: {end}<br>Records: {num_records}"
    ))

fig.update_layout(
    title="Daily Data: Time Spans per Sensor (Before Split)",
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Sensor ID",
    showlegend=False,
    template="plotly",
    height=800
)
fig.update_yaxes(visible=True, showticklabels=False)
fig.show()

## Train/Test/Val Split

Now we'll perform the time-based train/test/val split and see how many sensors remain after filtering out short series.

In [42]:
# Daily split
config_daily = {
    "dataset": "building",
    "res": "daily",
    "interpolate": 1,
    "lag_in": 7,
    "lag_out": 7,
    "n_in": 7,
    "n_out": 7,
    "energy": "all",
    "train_test_split_method": "time",
    "scale_mode": "individual",
    "scaler": "standard",
    "feature_code": 10
}

print("Loading and splitting daily dataset...")
ds_daily = TrainDatasetBuilding(config_daily)
ds_daily.load_feat_data(interpolate=True)
ds_daily.preprocess()
ds_daily = get_train_test_val_split(ds_daily)

print(f"\nDaily split results:")
n = df_daily['id'].n_unique()
print(f"Original sensors: {n}")
len_discarded_ids = len(ds_daily.discarded_ids)
print(f"Discarded series (too short): {len_discarded_ids}")
len_no_heated_area = len(df_daily.filter(pl.col('heated_area').is_null()).group_by(pl.col('id')).len())
print(f"Discarded series (no building feature): {len_no_heated_area}")
len_remaining_ids = ds_daily.df['id'].n_unique() - len(ds_daily.discarded_ids)
print(f"Discarded series (other reason): {n-(len_discarded_ids+len_no_heated_area+len_remaining_ids)}")
print(f"Remaining series: {len_remaining_ids}")
print(f"Train samples: {len(ds_daily.X_train)}")
print(f"Val samples: {len(ds_daily.X_val)}")
print(f"Test samples: {len(ds_daily.X_test)}")

Loading and splitting daily dataset...
[32m2026-01-01 18:03:47.146[0m | [1mINFO    [0m | [36msrc.energy_forecast.dataset[0m:[36mpreprocess[0m:[36m606[0m - [1mTraining Features: ['hum_max', 'tsun', 'tavg', 'hum_min', 'hum_avg', 'typ_2', 'primary_energy_gas', 'heated_area', 'weekend', 'prcp', 'diff', 'tmin', 'wdir', 'primary_energy_district heating', 'pres', 'typ_0', 'tmax', 'typ_4', 'daily_avg', 'typ_1', 'wpgt', 'wspd', 'holiday', 'day_of_month_sin', 'weekday_sin', 'day_of_month_cos', 'weekday_cos'][0m
[32m2026-01-01 18:03:47.367[0m | [1mINFO    [0m | [36msrc.energy_forecast.utils.train_test_val_split[0m:[36mtrain_test_split_time_based[0m:[36m85[0m - [1mRemoved 40 series because they were too short[0m
[32m2026-01-01 18:03:47.367[0m | [1mINFO    [0m | [36msrc.energy_forecast.utils.train_test_val_split[0m:[36mtrain_test_split_time_based[0m:[36m87[0m - [1mRemaining series: 92[0m
[32m2026-01-01 18:03:47.429[0m | [1mINFO    [0m | [36msrc.energy_foreca

In [43]:
len(df_hourly.filter(pl.col("heated_area").is_null()).group_by(pl.col("id")).len())

833

In [44]:
df_hourly.filter(.group_by(pl.col("id")).len()

SyntaxError: invalid syntax (3099231088.py, line 1)

In [None]:
# Hourly split
config_hourly = {
    "dataset": "building",
    "res": "hourly",
    "interpolate": 1,
    "lag_in": 72,
    "lag_out": 72,
    "n_in": 72,
    "n_out": 72,
    "energy": "all",
    "train_test_split_method": "time",
    "scale_mode": "individual",
    "scaler": "standard",
    "feature_code": 15
}

print("Loading and splitting hourly dataset...")
ds_hourly = TrainDatasetBuilding(config_hourly)
ds_hourly.load_feat_data(interpolate=True)
ds_hourly.preprocess()
ds_hourly = get_train_test_val_split(ds_hourly)

print(f"\nHourly split results:")
n = df_hourly['id'].n_unique()
print(f"Original sensors: {n}")
len_discarded_ids = len(ds_hourly.discarded_ids)
print(f"Discarded series (too short): {len_discarded_ids}")
len_no_heated_area = len(df_hourly.filter(pl.col('heated_area').is_null()).group_by(pl.col('id')).len())
print(f"Discarded series (no building feature): {len_no_heated_area}")
len_remaining_ids = ds_hourly.df['id'].n_unique() - len(ds_hourly.discarded_ids)
print(f"Discarded series (other reason): {n-(len_discarded_ids+len_no_heated_area+len_remaining_ids)}")
print(f"Remaining series: {len_remaining_ids}")
print(f"Train samples: {len(ds_hourly.X_train)}")
print(f"Val samples: {len(ds_hourly.X_val)}")
print(f"Test samples: {len(ds_hourly.X_test)}")

## Time Spans After Split

Visualize the time spans for sensors that remain after filtering and splitting.

In [None]:
# Get dataframes after split (excluding discarded series)
df_daily_after = ds_daily.df.filter(~pl.col("id").is_in(ds_daily.discarded_ids))

df_min_max_dates_daily_after = df_daily_after.group_by(["id"]).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     pl.len().alias("num_records")
     ]
)

fig = go.Figure()
for (start, end, sensor_id, num_records) in zip(
    df_min_max_dates_daily_after["min_date"], 
    df_min_max_dates_daily_after["max_date"],
    df_min_max_dates_daily_after["id"], 
    df_min_max_dates_daily_after["num_records"]
):
    fig.add_trace(go.Scatter(
        x=[start, end], 
        y=[sensor_id, sensor_id],
        mode='lines',
        name=f"{num_records} records",
        hovertemplate=f"<b>{sensor_id}</b><br>Start: {start}<br>End: {end}<br>Records: {num_records}"
    ))

fig.update_layout(
    title=f"Daily Data: Time Spans per Sensor (After Split, {len(df_min_max_dates_daily_after)} sensors)",
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Sensor ID",
    showlegend=False,
    template="plotly",
    height=800
)
fig.update_yaxes(visible=True, showticklabels=False)
fig.show()

print(f"\nSensors before split: {df_daily['id'].n_unique()}")
print(f"Sensors after split: {len(df_min_max_dates_daily_after)}")

In [None]:
# Get dataframes after split (excluding discarded series)
df_hourly_after = ds_hourly.df.filter(~pl.col("id").is_in(ds_hourly.discarded_ids))

df_min_max_dates_hourly_after = df_hourly_after.group_by(["id"]).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     pl.len().alias("num_records")
     ]
)

fig = go.Figure()
for (start, end, sensor_id, num_records) in zip(
    df_min_max_dates_hourly_after["min_date"], 
    df_min_max_dates_hourly_after["max_date"],
    df_min_max_dates_hourly_after["id"], 
    df_min_max_dates_hourly_after["num_records"]
):
    fig.add_trace(go.Scatter(
        x=[start, end], 
        y=[sensor_id, sensor_id],
        mode='lines',
        name=f"{num_records} records",
        hovertemplate=f"<b>{sensor_id}</b><br>Start: {start}<br>End: {end}<br>Records: {num_records}"
    ))

fig.update_layout(
    title=f"Hourly Data: Time Spans per Sensor (After Split, {len(df_min_max_dates_hourly_after)} sensors)",
    title_x=0.5,
    xaxis_title="Date",
    yaxis_title="Sensor ID",
    showlegend=False,
    template="plotly",
    height=800
)
fig.update_yaxes(visible=True, showticklabels=False)
fig.show()

print(f"\nSensors before split: {df_hourly['id'].n_unique()}")
print(f"Sensors after split: {len(df_min_max_dates_hourly_after)}")

## Outlier Detection Example

Demonstrate outlier detection using IQR method on a sample sensor.

In [None]:
# Create a raw daily dataset to show outliers
ds_raw = Dataset(res="daily")
ds_raw.create()

# Pick a sensor with known outliers
sensor_id = "a0bb40c4-7d73-4c5d-91bd-d82fe748a75d.74820f614mer"
df_sensor = ds_raw.df.filter(pl.col("id") == sensor_id)
df_sensor = df_sensor.select(["id", "datetime", "diff"])

print(f"Sensor: {sensor_id}")
print(f"Records: {len(df_sensor)}")
print(f"Date range: {df_sensor['datetime'].min()} to {df_sensor['datetime'].max()}")
print(f"\nDiff statistics:")
print(df_sensor['diff'].describe())

In [None]:
# Plot raw data
fig = px.scatter(df_sensor, x="datetime", y="diff", title=f"Sensor {sensor_id}: Raw Data")
fig.update_layout(template="plotly")
fig.show()

In [None]:
# Apply IQR outlier detection
column = "diff"
q25 = df_sensor[column].quantile(0.25)
q75 = df_sensor[column].quantile(0.75)
iqr = q75 - q25

upper_bound = q75 + 1.5 * iqr
lower_bound = q25 - 1.5 * iqr

print(f"Q25: {q25:.2f}")
print(f"Q75: {q75:.2f}")
print(f"IQR: {iqr:.2f}")
print(f"Lower bound: {lower_bound:.2f}")
print(f"Upper bound: {upper_bound:.2f}")

# Mark outliers
df_with_outliers = df_sensor.with_columns(
    ((pl.col(column) > upper_bound) | (pl.col(column) < lower_bound)).alias("outlier")
)

num_outliers = df_with_outliers.filter(pl.col("outlier")).height
print(f"\nOutliers detected: {num_outliers} ({num_outliers/len(df_sensor)*100:.1f}%)")

# Plot with outliers marked
fig = px.scatter(df_with_outliers, x="datetime", y="diff", color="outlier",
                 title=f"Sensor {sensor_id}: Outlier Detection (IQR Method)")
fig.update_layout(template="plotly")
fig.show()

## Interpolation Process: Before and After

Demonstrate the data cleaning and interpolation process using the `InterpolatedDataset.clean()` method.

In [None]:
# Load raw daily data (before interpolation)
print("Creating raw daily dataset (before interpolation)...")
ds_daily_raw = Dataset(res="daily")
ds_daily_raw.create()

print(f"\nRaw daily dataset:")
print(f"Total records: {len(ds_daily_raw.df)}")
print(f"Unique sensors: {ds_daily_raw.df['id'].n_unique()}")
print(f"Date range: {ds_daily_raw.df['datetime'].min()} to {ds_daily_raw.df['datetime'].max()}")

In [None]:
# Apply cleaning (removes outliers, negative values, etc.)
print("Cleaning raw daily data (removes outliers, negatives, flat lines, connection errors)...")
ds_daily_raw.clean(plot=False)

print(f"\nAfter cleaning:")
print(f"Total records: {len(ds_daily_raw.df)}")
print(f"Unique sensors: {ds_daily_raw.df['id'].n_unique()}")
print(f"Records removed: {len(ds_daily_raw.df) - len(ds_daily_raw.df)}")

In [None]:
# Now create interpolated dataset
print("Creating interpolated daily dataset...")
ds_daily_interpolated = InterpolatedDataset(res="daily")
ds_daily_interpolated.create()
ds_daily_interpolated.clean(plot=False)

print(f"\nAfter interpolation:")
print(f"Total records: {len(ds_daily_interpolated.df)}")
print(f"Unique sensors/series: {ds_daily_interpolated.df['id'].n_unique()}")
print(f"Records added: {len(ds_daily_interpolated.df) - len(ds_daily_raw.df)}")

### Visualize Interpolation: Before and After

Compare a sensor's data before and after interpolation.

In [None]:
# Select a sensor that exists in both datasets
common_sensors = set(ds_daily_raw.df['id'].unique()) & set(ds_daily_interpolated.df['id'].unique())
sample_sensor = list(common_sensors)[0]

print(f"Analyzing sensor: {sample_sensor}")

# Get data before and after interpolation
df_before = ds_daily_raw.df.filter(pl.col("id") == sample_sensor).sort("datetime")
df_after = ds_daily_interpolated.df.filter(pl.col("id") == sample_sensor).sort("datetime")

print(f"\nBefore interpolation:")
print(f"  Records: {len(df_before)}")
print(f"  Date range: {df_before['datetime'].min()} to {df_before['datetime'].max()}")

print(f"\nAfter interpolation:")
print(f"  Records: {len(df_after)}")
print(f"  Date range: {df_after['datetime'].min()} to {df_after['datetime'].max()}")
print(f"\nRecords added by interpolation: {len(df_after) - len(df_before)}")

In [None]:
# Plot before and after side by side
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Before Interpolation", "After Interpolation"),
    vertical_spacing=0.12
)

# Before interpolation
fig.add_trace(
    go.Scatter(x=df_before['datetime'], y=df_before['diff'], 
               mode='lines+markers', name='Before',
               line=dict(color='blue')),
    row=1, col=1
)

# After interpolation
fig.add_trace(
    go.Scatter(x=df_after['datetime'], y=df_after['diff'], 
               mode='lines+markers', name='After',
               line=dict(color='green')),
    row=2, col=1
)

fig.update_xaxes(title_text="Date", row=2, col=1)
fig.update_yaxes(title_text="Consumption (diff)", row=1, col=1)
fig.update_yaxes(title_text="Consumption (diff)", row=2, col=1)

fig.update_layout(
    title_text=f"Daily Data Interpolation: {sample_sensor}",
    template="plotly",
    height=800,
    showlegend=True
)

fig.show()

### Hourly Data Interpolation

In [None]:
# Create raw and interpolated hourly datasets
print("Creating raw hourly dataset...")
ds_hourly_raw = Dataset(res="hourly")
ds_hourly_raw.create()
ds_hourly_raw.clean(plot=False)

print(f"\nRaw hourly dataset (after basic cleaning):")
print(f"Total records: {len(ds_hourly_raw.df)}")
print(f"Unique sensors: {ds_hourly_raw.df['id'].n_unique()}")

print("\nCreating interpolated hourly dataset...")
ds_hourly_interpolated = InterpolatedDataset(res="hourly")
ds_hourly_interpolated.create()
ds_hourly_interpolated.clean(plot=False)

print(f"\nInterpolated hourly dataset:")
print(f"Total records: {len(ds_hourly_interpolated.df)}")
print(f"Unique sensors/series: {ds_hourly_interpolated.df['id'].n_unique()}")
print(f"Records added: {len(ds_hourly_interpolated.df) - len(ds_hourly_raw.df)}")

In [None]:
# Select a sensor for hourly visualization
common_sensors_hourly = set(ds_hourly_raw.df['id'].unique()) & set(ds_hourly_interpolated.df['id'].unique())
sample_sensor_hourly = list(common_sensors_hourly)[0]

print(f"Analyzing hourly sensor: {sample_sensor_hourly}")

# Get a time window to visualize (e.g., 1 month)
df_h_before = ds_hourly_raw.df.filter(pl.col("id") == sample_sensor_hourly).sort("datetime")
df_h_after = ds_hourly_interpolated.df.filter(pl.col("id") == sample_sensor_hourly).sort("datetime")

# Take a 2-week window for clearer visualization
start_date = df_h_before['datetime'].min()
end_date = start_date + pl.duration(days=14)

df_h_before_window = df_h_before.filter(
    (pl.col("datetime") >= start_date) & (pl.col("datetime") <= end_date)
)
df_h_after_window = df_h_after.filter(
    (pl.col("datetime") >= start_date) & (pl.col("datetime") <= end_date)
)

print(f"\nVisualization window: {start_date} to {end_date}")
print(f"Before: {len(df_h_before_window)} records")
print(f"After: {len(df_h_after_window)} records")

In [None]:
# Plot hourly data before and after
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Before Interpolation (2-week window)", "After Interpolation (2-week window)"),
    vertical_spacing=0.12
)

# Before interpolation
fig.add_trace(
    go.Scatter(x=df_h_before_window['datetime'], y=df_h_before_window['diff'], 
               mode='lines+markers', name='Before',
               line=dict(color='blue')),
    row=1, col=1
)

# After interpolation
fig.add_trace(
    go.Scatter(x=df_h_after_window['datetime'], y=df_h_after_window['diff'], 
               mode='lines+markers', name='After',
               line=dict(color='green')),
    row=2, col=1
)

fig.update_xaxes(title_text="Date", row=2, col=1)
fig.update_yaxes(title_text="Consumption (diff)", row=1, col=1)
fig.update_yaxes(title_text="Consumption (diff)", row=2, col=1)

fig.update_layout(
    title_text=f"Hourly Data Interpolation: {sample_sensor_hourly}",
    template="plotly",
    height=800,
    showlegend=True
)

fig.show()

## Summary

This notebook demonstrated:
1. **Time span analysis** showing the temporal coverage of sensors before and after train/test/val splitting
2. **Data splitting** process that filters out short series
3. **Outlier detection** using the IQR method
4. **Data interpolation** process that fills missing timestamps in the time series

Key findings:
- The train/test/val split removes sensors with insufficient data
- Interpolation significantly increases the number of records by filling gaps
- The cleaning process (outlier removal, negative value filtering) ensures data quality before interpolation