In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Directory that the CSV files in this notebook are read from.
DATA_PATH = "/kaggle/input/predict-energy-behavior-of-prosumers"

# The loaders for the auxiliary tables below are currently disabled, so none of
# these names are defined by this cell. Uncomment a line to load that table.
# WEATHER_HISTORICAL = pd.read_csv(f"{DATA_PATH}/historical_weather.csv")
# WEATHER_FORECASTS = pd.read_csv(f"{DATA_PATH}/forecast_weather.csv")
# GAS_PRICES = pd.read_csv(f"{DATA_PATH}/gas_prices.csv")
# ELECTRICITY_PRICES = pd.read_csv(f"{DATA_PATH}/electricity_prices.csv")
# CLIENTS = pd.read_csv(f"{DATA_PATH}/client.csv")

# TRAIN DATA

- `county` - An ID code for the county.
- `is_business` - Boolean for whether or not the prosumer is a business.
- `product_type` - ID code with the following mapping of codes to contract types: {0: "Combined", 1: "Fixed", 2: "General service", 3: "Spot"}.
- `target` - The consumption or production amount for the relevant segment for the hour. The segments are defined by the `county`, `is_business`, and `product_type`.
- `is_consumption` - Boolean for whether or not this row's target is consumption or production.
- `datetime` - The Estonian time in EET (UTC+2) / EEST (UTC+3). It describes the start of the 1-hour period on which target is given.
- `data_block_id` - All rows sharing the same data_block_id will be available at the same forecast time. This is a function of what information is available when forecasts are actually made, at 11 AM each morning. For example, if the forecast weather data_block_id for predictions made on October 31st is 100 then the historic weather data_block_id for October 31st will be 101 as the historic weather data is only actually available the next day.
- `row_id` - A unique identifier for the row.
- `prediction_unit_id` - A unique identifier for the county, is_business, and product_type combination. New prediction units can appear or disappear in the test set.

In [None]:
# Load the training table; the `datetime` column is parsed into timestamps at
# read time so later cells can compare and sort on it directly.
TRAIN_DATA = pd.read_csv(
    f"{DATA_PATH}/train.csv",
    parse_dates=["datetime"],
)

In [None]:
# Time span covered by the training data, shown as (earliest, latest).
train_datetimes = TRAIN_DATA["datetime"]
(train_datetimes.min(), train_datetimes.max())

In [None]:
# Build a lookup from each prediction unit to its
# (county, is_business, product_type) segment.
#
# The distinct (unit, segment) pairs are extracted in a single pass over
# TRAIN_DATA instead of re-filtering the whole frame once per unit.
prediction_units = list(TRAIN_DATA["prediction_unit_id"].unique())

segment_columns = ["county", "is_business", "product_type"]
unit_segments = (
    TRAIN_DATA[["prediction_unit_id", *segment_columns]]
    # Rows with a missing segment value are ignored, as value_counts() did.
    .dropna(subset=segment_columns)
    # One row per distinct (unit, segment) pair, in order of first appearance.
    .drop_duplicates()
)

# A unit that still appears more than once maps to several segments, which
# would make the lookup ambiguous. Fail loudly and name the offending units
# rather than surfacing an opaque unpacking error.
repeated_units = unit_segments["prediction_unit_id"][
    unit_segments["prediction_unit_id"].duplicated()
]
if not repeated_units.empty:
    raise ValueError(
        f"Prediction units with more than one segment: {sorted(set(repeated_units))}"
    )

prediction_unit_to_segment = {
    unit: tuple(segment)
    for unit, *segment in unit_segments.itertuples(index=False, name=None)
}

In [None]:
# Check the length of the timeseries for each prediction unit / segment
#
# Consumption and production rows are counted per unit in one grouped pass
# rather than re-filtering the whole frame twice for every unit.
row_kind_flags = pd.DataFrame(
    {
        # Cast to integers so the grouped sum is an explicit row count.
        "consumption": (TRAIN_DATA["is_consumption"] == 1).astype("int64"),
        "production": (TRAIN_DATA["is_consumption"] == 0).astype("int64"),
    }
)
# sort=False keeps the units in order of first appearance in TRAIN_DATA.
rows_per_unit = row_kind_flags.groupby(
    TRAIN_DATA["prediction_unit_id"], sort=False
).sum()

# unit id -> (number of consumption rows, number of production rows)
prediction_unit_to_timeseries_length = {
    unit: (int(n_consumption), int(n_production))
    for unit, n_consumption, n_production in rows_per_unit.itertuples(name=None)
}

# Report every unit whose two series differ in length.
for unit, (c_len, p_len) in prediction_unit_to_timeseries_length.items():
    if c_len != p_len:
        print(f"Unequal timeseries lengths for unit {unit}: ({c_len}, {p_len})")

In [None]:
# Check the NaN count for each prediction unit
#
# Missing targets are counted per unit, separately for consumption and
# production rows, in one grouped pass instead of re-filtering per unit.
target_is_nan = TRAIN_DATA["target"].isna()
nan_flags = pd.DataFrame(
    {
        # Cast to integers so the grouped sum is an explicit NaN count.
        "consumption": (target_is_nan & (TRAIN_DATA["is_consumption"] == 1)).astype("int64"),
        "production": (target_is_nan & (TRAIN_DATA["is_consumption"] == 0)).astype("int64"),
    }
)
# sort=False keeps the units in order of first appearance in TRAIN_DATA.
nans_per_unit = nan_flags.groupby(TRAIN_DATA["prediction_unit_id"], sort=False).sum()

# unit id -> (NaN targets in consumption rows, NaN targets in production rows)
prediction_unit_to_nan_count = {
    unit: (int(n_consumption), int(n_production))
    for unit, n_consumption, n_production in nans_per_unit.itertuples(name=None)
}

# Report every unit whose two series have a different number of NaN targets.
for unit, (c_count, p_count) in prediction_unit_to_nan_count.items():
    if c_count != p_count:
        print(f"Unequal nan counts for unit {unit}: ({c_count}, {p_count})")

In [None]:
# Check if timeseries matches an hourly frequency
# TODO: this check is not implemented yet. The bare `...` (Ellipsis) below is a
# placeholder expression and performs no validation.
...

In [None]:
def get_data_for_segment(segment_id, is_consumption):
    """Return one unit's rows of TRAIN_DATA for a single flow direction.

    Args:
        segment_id: Value matched against the ``prediction_unit_id`` column.
        is_consumption: Value matched against the ``is_consumption`` column.

    Returns:
        A new DataFrame (an independent copy of the matching rows of
        TRAIN_DATA) ordered by ascending ``datetime``.
    """
    unit_mask = TRAIN_DATA["prediction_unit_id"] == segment_id
    flow_mask = TRAIN_DATA["is_consumption"] == is_consumption
    selected = TRAIN_DATA.loc[unit_mask & flow_mask].copy()
    return selected.sort_values(by="datetime")

In [None]:
# Plot all consumption / generation timeseries data
#
# Draws a rows x cols grid with one panel per prediction unit, starting at
# `segment_id_offset`, restricted to the [start_time, end_time] window.
rows, cols = 5, 5  # total number of segments to plot
segment_id_offset = 50  # start from this segment id
start_time = pd.Timestamp("2022-10-01")
end_time = pd.Timestamp("2023-10-15")
is_consumption = 1  # Plot consumption (=1) or generation (=0)

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# flatten() below also works for a 1x1 grid (otherwise a bare Axes is returned).
fig, axes = plt.subplots(
    rows, cols, figsize=(3.5 * cols, 2 * rows), sharex=True, squeeze=False
)
axes = axes.flatten()
for i, ax in enumerate(axes):
    segment_id = i + segment_id_offset
    segment_df = get_data_for_segment(segment_id, is_consumption)
    # Keep only the rows inside the plotting window (both bounds inclusive).
    in_window = (segment_df["datetime"] >= start_time) & (segment_df["datetime"] <= end_time)
    segment_df = segment_df[in_window]
    ax.plot(
        segment_df["datetime"].values,
        segment_df["target"].values
    )
    # Label each panel so it can be matched to its unit; a unit with no rows
    # in the window shows up as an empty, but still identified, panel.
    ax.set_title(f"unit {segment_id}", fontsize=8)
    for tick in ax.get_xticklabels():
        tick.set_rotation(45)

fig.tight_layout();