In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Constants
# Directory holding the competition CSV files (train.csv is read from here).
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Presumably the first / last timestamps covered by the training data and the
# spacing between consecutive readings -- TODO confirm against the loaded data.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
TRAIN_DATA_RESOLUTION = "1h"

In [None]:
# Read the raw training readings (CSV columns are renamed on load), then parse
# the timestamps and add a log1p-transformed copy of the meter reading.
readings_df = pd.read_csv(
    f"{INPUT_DATA_PATH}/train.csv",
    names=["building_id", "meter_id", "timestamp", "meter_reading"],
    header=0,
).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"]),
    ln_meter_reading=lambda frame: np.log1p(frame["meter_reading"]),
)

## Electricity Readings

In [None]:
# meter_id encodes the meter type:
#   0 = electricity, 1 = chilledwater, 2 = steam, 3 = hotwater

meter_id = 0
start_id = 1445
building_ids = range(start_id, start_id + 10)

# Readings of the selected meter type for ten consecutive building ids
is_selected_building = readings_df["building_id"].isin(building_ids)
is_selected_meter = readings_df["meter_id"] == meter_id
df = readings_df[is_selected_building & is_selected_meter]

# One line per building on a shared axis
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
for b_id in building_ids:
    b_df = df[df["building_id"] == b_id]
    x_values = b_df["timestamp"].values
    y_values = b_df["ln_meter_reading"].values
    ax.plot(x_values, y_values, label=f"Building id {b_id}")

ax.legend()

### Filter on timestamps

In [None]:
# Periods of electricity data to keep, keyed by building id.
# A building id that does not appear here keeps all of its data.
# Every period is a (start, end) tuple with inclusive bounds;
# None leaves that side of the period open.

# Buildings 0..104 keep everything from 21 May onward, except the four ids
# that get their own entry further down.
after_may_21_filter_electricity = {
    b_id: [(datetime(2016, 5, 21), None)]
    for b_id in range(105)
    if b_id not in (29, 45, 46, 53)
}

# Buildings with individually chosen periods
building_id_filters_electriciy = {
    29: [(datetime(2016, 8, 10), None)],
    45: [(datetime(2016, 7, 1), None)],
    # Two separate periods: up to 1 March, then from 21 May onward
    46: [(None, datetime(2016, 3, 1)), (datetime(2016, 5, 21), None)],
    # Only readings from 15 December onward are kept
    53: [(datetime(2016, 12, 15), None)],
    # Only readings from 31 December onward are kept
    740: [(datetime(2016, 12, 31), None)],
    803: [(None, datetime(2016, 9, 24))],
    857: [(None, datetime(2016, 4, 13))],
    1264: [(None, datetime(2016, 8, 23))],
    1345: [(None, datetime(2016, 2, 11))],
}

# Merge both mappings; entries of the second would win on a shared key
to_keep_electricity = {
    **after_may_21_filter_electricity,
    **building_id_filters_electriciy,
}

In [None]:
def construct_keep_filter(
    timestamps: pd.Series,
    to_keep: list[tuple[datetime | None, datetime | None]],
):
    filter_ = False 
    for start, end in to_keep:
    
        match (start, end):
            case None, None:
                print("Continue")
            case (datetime(), None):
                period_filter = timestamps >= start
            case (None, datetime()):
                period_filter = timestamps <= end
            case (datetime(), datetime()):
                period_filter = (timestamps >= start) & (timestamps <= end)
        
        filter_ |= period_filter
    
    return filter_


def to_keep_filter(
    data: pd.DataFrame,
    to_keep: list[tuple[datetime | None, datetime | None]]
):
    """Return the rows of ``data`` whose "timestamp" lies in a period of ``to_keep``.

    The row mask is computed by ``construct_keep_filter`` on the "timestamp"
    column and applied with ``.loc``.
    """
    row_mask = construct_keep_filter(timestamps=data["timestamp"], to_keep=to_keep)
    selected_rows = data.loc[row_mask]
    return selected_rows

In [None]:
# Spot-check the filter on one randomly picked building that has keep-periods

meter_id = 0
building_id = np.random.choice(a=list(to_keep_electricity.keys()))
building_id_filter = to_keep_electricity[building_id]
print(building_id, building_id_filter)

# Readings of that building/meter before and after applying its keep-periods
is_target = (readings_df["building_id"] == building_id) & (readings_df["meter_id"] == meter_id)
bm_df_before = readings_df[is_target].copy()
bm_df_after = to_keep_filter(bm_df_before, building_id_filter)


# Overlay both versions; the "after" line is drawn on top of the "before" line
fig, ax = plt.subplots(1, 1, figsize=(15, 4))
for frame, legend_label in ((bm_df_before, "Before filter"), (bm_df_after, "After filter")):
    ax.plot(
        frame["timestamp"].values,
        frame["ln_meter_reading"].values,
        label=legend_label,
    )
ax.legend();

In [None]:
# Apply to all electricity data

def filter_electricity_data(
    readings_df: pd.DataFrame,
    to_keep_filters: dict[int, list[tuple[datetime | None, datetime | None]]],
):
    """Drop electricity readings that fall outside each building's keep-periods.

    Parameters
    ----------
    readings_df
        Meter readings with "building_id", "meter_id" and "timestamp" columns.
        The frame is not modified.
    to_keep_filters
        Mapping of building id to the list of (start, end) periods to keep for
        that building's electricity meter (meter_id 0). Buildings that are not
        in the mapping, and all other meter types, are left untouched.

    Returns
    -------
    pd.DataFrame
        The surviving rows of ``readings_df`` with their original index labels,
        in their original row order.
    """
    meter_id = 0
    # Hoisted out of the loop: the electricity subset is the same for every building.
    electricity_df = readings_df[readings_df["meter_id"] == meter_id]

    # Collect all rejections in one mask and select once at the end, instead of
    # dropping and re-concatenating the full frame for every building.
    keep_mask = pd.Series(True, index=readings_df.index)
    for b_id, b_filter in tqdm(to_keep_filters.items()):
        bm_df = electricity_df[electricity_df["building_id"] == b_id]
        bm_df_filtered = to_keep_filter(bm_df, b_filter)
        rejected = bm_df.index.difference(bm_df_filtered.index)
        keep_mask.loc[rejected] = False

    # Positional boolean selection; avoids index alignment on the mask.
    return readings_df.loc[keep_mask.to_numpy()]

In [None]:
# Keep an untouched copy so the effect of the filtering can be compared below
readings_before = readings_df.copy()
# Apply the per-building keep-periods to the electricity readings
readings_after = filter_electricity_data(readings_df, to_keep_electricity)

In [None]:
# Compare one randomly picked filtered building before and after filtering
meter_id = 0
building_id = np.random.choice(a=list(to_keep_electricity.keys()))
# building_id = np.random.choice(readings_before["building_id"].unique())  # Buildings without filter should be unchanged

is_target_before = (
    (readings_before["building_id"] == building_id)
    & (readings_before["meter_id"] == meter_id)
)
bm_df_before = readings_before[is_target_before]

is_target_after = (
    (readings_after["building_id"] == building_id)
    & (readings_after["meter_id"] == meter_id)
)
bm_df_after = readings_after[is_target_after]


# Overlay both versions; the "after" line is drawn on top of the "before" line
fig, ax = plt.subplots(1, 1, figsize=(15, 4))
for frame, legend_label in ((bm_df_before, "Before filter"), (bm_df_after, "After filter")):
    ax.plot(
        frame["timestamp"].values,
        frame["ln_meter_reading"].values,
        label=legend_label,
    )
ax.legend();

### Filter by removing periods of constant readings

In [None]:
def find_constant_streaks(
    df: pd.DataFrame,
    streak_length: int = 10,
    target_column: str = "ln_meter_reading"
):
    """Find periods in which ``target_column`` stays constant over time.

    Rows are ordered by "timestamp" and a streak is a maximal run of
    consecutive rows that all hold the same value. Streaks are tracked by row
    position, so the index labels of ``df`` do not matter. A streak that is
    still running at the last row is reported as well.

    Parameters
    ----------
    df
        Readings with a "timestamp" column and ``target_column``. The frame is
        not modified.
    streak_length
        Minimum number of readings (rows) a run must contain to be reported.
    target_column
        Column whose consecutive differences are inspected.

    Returns
    -------
    list[tuple]
        ``(first_timestamp, last_timestamp)`` of every qualifying streak, in
        chronological order; both ends are inclusive.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    ordered = df.sort_values("timestamp")
    timestamps = ordered["timestamp"]
    diffs = ordered[target_column].diff().to_numpy()

    # Collect runs as (first_position, last_position), both inclusive.
    runs = []
    run_start = None
    for pos, delta in enumerate(diffs):
        if delta == 0:
            # Same value as the previous row: that previous row opens the run.
            if run_start is None:
                run_start = pos - 1
        elif run_start is not None:
            # Value changed, or is missing (a NaN diff compares unequal to 0):
            # the run ended on the previous row.
            runs.append((run_start, pos - 1))
            run_start = None

    # Flush a run that lasts until the final row.
    if run_start is not None:
        runs.append((run_start, len(diffs) - 1))

    # Only keep streaks with at least streak_length readings.
    return [
        (timestamps.iloc[first], timestamps.iloc[last])
        for first, last in runs
        if last - first + 1 >= streak_length
    ]

In [None]:
# Locate the constant-reading streaks of one building and mark them on its plot

meter_id = 0
building_id = 1153

is_target = (readings_df["building_id"] == building_id) & (readings_df["meter_id"] == meter_id)
bm_df = readings_df[is_target].copy()
constant_streaks = find_constant_streaks(bm_df, streak_length=25)

fig, ax = plt.subplots(1, 1, figsize=(15, 4))
x_values = bm_df["timestamp"].values
y_values = bm_df["ln_meter_reading"].values
ax.plot(x_values, y_values, label=f"Building id {building_id}")

ax.legend()

# Print every streak and draw a vertical line at each of its two ends
for start, end in constant_streaks:
    print(start.to_pydatetime(), end.to_pydatetime())
    for boundary in (start, end):
        ax.axvline(x=boundary, label=boundary, color="black", lw=2)

In [None]:
# The below need to be removed due to constant streaks
# Apply constant filtering function with streak length = 25
# NOTE(review): the original notes wrote ranges as "a -> b", which is not valid
# Python; they are expanded here as the inclusive id range a..b -- confirm.
to_remove = [
    105,
    *range(107, 127 + 1),  # Might not find streaks in all of these
    *range(136, 155 + 1),
    177,
    *range(245, 254 + 1),
    269,
    278,
    376,
    537,
    545,
    577,
    681,
    693,
    723,
    733,
    738,
    799,
    802,
    874,
    *range(875, 884 + 1),
    886,
    897,
    *range(905, 945 + 1),
    *range(954, 996 + 1),
    1066,
    1079,
    1096,
    1098,
    1128,
    1154,
    1157,
    1160,
    1169,
    1177,
    1185,
    1202,
    1221,
    *range(1225, 1324 + 1),  # careful of 1227, 1281, 1314 maybe exclude those from filter
    1359,
]