In [None]:
from typing import Iterator, Optional, Sequence, Tuple

import pandas as pd

In [None]:
# Notebook-wide configuration values.

# Directory the competition CSV files are read from.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Spacing between consecutive readings, written as a pandas offset alias.
# NOTE(review): presumably the sampling interval of train.csv - confirm at the use site.
TRAIN_DATA_RESOLUTION = "1h"

# Earliest and latest timestamps of the training range, per the names.
# NOTE(review): whether MAX is an inclusive bound depends on code not visible here.
MIN_TRAIN_TIMESTAMP, MAX_TRAIN_TIMESTAMP = (
    pd.Timestamp("2016-01-01 00:00:00"),
    pd.Timestamp("2016-12-31 23:00:00"),
)

In [None]:
# Read the training readings and parse their timestamps.
#
# `header=0` together with `names=` makes pandas drop the header row stored in
# the file and use the column names given here instead. The "timestamp" column
# is then converted from text to datetimes so it can be compared against
# `pd.Timestamp` values.
readings_df = (
    pd.read_csv(
        f"{INPUT_DATA_PATH}/train.csv",
        names=["building_id", "meter_id", "timestamp", "meter_reading"],
        header=0,
    )
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
)

In [None]:
# Validation windows as (start, end) timestamp pairs: three consecutive
# calendar months, each window ending where the next one begins.
validation_periods = [
    (pd.Timestamp(period_start), pd.Timestamp(period_end))
    for period_start, period_end in (
        ("2016-10-01 00:00:00", "2016-11-01 00:00:00"),
        ("2016-11-01 00:00:00", "2016-12-01 00:00:00"),
        ("2016-12-01 00:00:00", "2017-01-01 00:00:00"),
    )
]


def yield_train_test_split(
    train_df: pd.DataFrame,
    periods: Optional[Sequence[Tuple[pd.Timestamp, pd.Timestamp]]] = None,
) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame]]:
    """Yield one (train, test) pair of frames per validation window.

    For every ``(val_start, val_end)`` window the test frame holds the rows
    with ``val_start <= timestamp < val_end`` and the train frame holds every
    row with ``timestamp < val_start``. The training set therefore grows from
    one window to the next, and rows at or after ``val_end`` are in neither
    frame for that window.

    Args:
        train_df: Frame with a ``"timestamp"`` column comparable to the
            window bounds.
        periods: ``(start, end)`` pairs defining the windows, start inclusive
            and end exclusive. When omitted, the module-level
            ``validation_periods`` is used.

    Yields:
        ``(train_rows, test_rows)`` tuples, in the order of ``periods``. Both
        frames are ``.loc`` selections of ``train_df`` and keep its index.
    """
    if periods is None:
        periods = validation_periods

    # Look the column up once; it is the same Series for every window.
    timestamps = train_df["timestamp"]
    for val_start, val_end in periods:
        train_mask = timestamps < val_start
        test_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_df.loc[train_mask], train_df.loc[test_mask]

In [None]:
def _count_timestamps_per_meter(frame: pd.DataFrame, label: str) -> pd.Series:
    """Count distinct timestamps for each (building_id, meter_id) pair.

    Args:
        frame: Readings with "building_id", "meter_id" and "timestamp" columns.
        label: Name given to the resulting Series.

    Returns:
        Series indexed by (building_id, meter_id) holding the number of
        unique timestamps in each group, named ``label``.
    """
    return (
        frame
        .groupby(["building_id", "meter_id"])["timestamp"]
        .nunique()
        .rename(label)
    )


# For every validation split, record how many distinct timestamps each
# (building, meter) pair has on the train side and on the test side. Each list
# ends up with one Series per split, named "split_<i>".
train_df_counts = []
test_df_counts = []
for i, (train_df, test_df) in enumerate(yield_train_test_split(readings_df)):
    split_label = f"split_{i}"

    building_meter_counts_train = _count_timestamps_per_meter(train_df, split_label)
    train_df_counts.append(building_meter_counts_train)

    building_meter_counts_test = _count_timestamps_per_meter(test_df, split_label)
    test_df_counts.append(building_meter_counts_test)

In [None]:
# Put the per-split test counts side by side (one column per split, aligned on
# the (building_id, meter_id) index) and draw one line per split. The title and
# axis labels let the figure be read on its own; the trailing semicolon keeps
# the return value of `.set(...)` out of the cell output.
pd.concat(test_df_counts, axis=1).plot(
    title="Unique test timestamps per (building, meter) by validation split",
).set(
    xlabel="(building_id, meter_id)",
    ylabel="unique timestamps",
);