In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from tqdm import tqdm

In [None]:
# Constants
# Directory the competition CSV files are read from.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First / last timestamps used when reindexing the train and test weather data.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01 00:00:00")
MAX_TEST_TIMESTAMP = pd.Timestamp('2018-12-31 23:00:00')
# NOTE(review): not referenced in the visible cells; reindex_weather_data
# relies on its own "1h" default instead.
DATA_RESOLUTION = "1h"

# Weather columns that are cast to float32 and imputed.
WEATHER_FEATURE_COLUMNS = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed'
]
# Shift sizes (in rows of the per-site weather series) for lagged features.
WEATHER_LAGS = [1, 2, 3, 4, 5]
# Window sizes (in rows of the per-site weather series) for rolling means.
WEATHER_ROLLING_WINDOWS = [12, 24]

In [None]:
def cast_readings_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the meter-readings columns to compact dtypes (modifies ``df`` in place).

    ``building_id`` and ``meter_id`` become categoricals, ``timestamp`` is parsed
    to datetimes and ``meter_reading`` is cast to float32 when the column exists
    (frames without it, e.g. the test set loaded below, are left without it).

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    for id_column in ("building_id", "meter_id"):
        df[id_column] = df[id_column].astype("category")
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # The target column is optional, so only cast it when present.
    if "meter_reading" in df.columns:
        df["meter_reading"] = df["meter_reading"].astype(np.float32)
    return df


def cast_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the weather columns to compact dtypes (modifies ``df`` in place).

    ``site_id`` becomes a categorical, ``timestamp`` is parsed to datetimes and
    every column listed in ``WEATHER_FEATURE_COLUMNS`` is cast to float32.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    df["site_id"] = df["site_id"].astype("category")
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    for feature_column in WEATHER_FEATURE_COLUMNS:
        as_float32 = df[feature_column].astype(np.float32)
        df[feature_column] = as_float32
    return df


def cast_buildings_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the building-metadata columns to compact dtypes (modifies ``df`` in place).

    ``site_id``, ``building_id`` and ``primary_use`` become categoricals;
    ``square_feet``, ``year_built`` and ``floor_count`` are cast to float32.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    categorical_columns = ("site_id", "building_id", "primary_use")
    numeric_columns = ("square_feet", "year_built", "floor_count")

    for name in categorical_columns:
        df[name] = df[name].astype("category")
    for name in numeric_columns:
        df[name] = df[name].astype(np.float32)
    return df

## Train data

### Load raw data

In [None]:
# Meter readings: the CSV header row is replaced by the column names used
# throughout this notebook (header=0 together with names=...).
readings_df_train = pd.read_csv(
    f"{INPUT_DATA_PATH}/train.csv",
    header=0,
    names=["building_id", "meter_id", "timestamp", "meter_reading"],
)
readings_df_train = cast_readings_data(readings_df_train)

# Weather observations for the training period
weather_df_train = pd.read_csv(f"{INPUT_DATA_PATH}/weather_train.csv")
weather_df_train = cast_weather_data(weather_df_train)

# Building metadata (reused later when building the test frame)
buildings_df = pd.read_csv(f"{INPUT_DATA_PATH}/building_metadata.csv")
buildings_df = cast_buildings_data(buildings_df)

### Filtering & Outlier Removal

In [None]:
def _find_constant_streaks(
    df: pd.DataFrame,
    streak_length: int = 25,
    target_column: str = "meter_reading"
):
    # Compute diffs on target col
    df = df.copy()
    df = df.sort_values("timestamp")
    df["target_col_diff"] = df[target_column].diff()
    
    # First find any periods of constant meter readings
    streaks = []
    current_streak_start = 0
    for idx, row in df.iterrows():
        if pd.isna(row["target_col_diff"]):
            continue
        
        elif row["target_col_diff"] == 0:
            # Start a new streak if not already a running streak
            current_streak_start = current_streak_start or idx - 1
        
        else:
            # Streak finished
            # Save if there is currently a running streak
            if current_streak_start is not None:
                streaks.append((current_streak_start, idx - 1))
    
            # Reset
            current_streak_start = None
            

    # Only keep streaks with length >= streak_length
    filtered_streaks = []
    for start, end in streaks:
        streak_df = df.loc[start: end]
        assert (streak_df["target_col_diff"].dropna() == 0).all(), print(start, end)
        if len(streak_df) >= streak_length:
            start_t = streak_df["timestamp"].min().to_pydatetime()
            end_t = streak_df["timestamp"].max().to_pydatetime()
            filtered_streaks.append((start_t, end_t))
    
    return filtered_streaks


def find_constant_streaks(
    readings_df: pd.DataFrame,
    meter_id: int,
    building_id: int,
    streak_length: int = 25,
    target_column: str = "meter_reading"
) -> list[tuple[datetime | None, datetime | None]]:
    """Find constant-value streaks for one building / meter combination.

    Selects the rows of ``readings_df`` matching ``building_id`` and
    ``meter_id`` and delegates the streak search to ``_find_constant_streaks``.

    Returns:
        The ``(start, end)`` periods reported by ``_find_constant_streaks``.
    """
    same_building = readings_df["building_id"] == building_id
    same_meter = readings_df["meter_id"] == meter_id
    building_meter_df = readings_df[same_building & same_meter]
    return _find_constant_streaks(building_meter_df, streak_length, target_column)

In [None]:
def construct_keep_filter(
    timestamps: pd.Series,
    to_keep: list[tuple[datetime | None, datetime | None]],
):
    filter_ = pd.Series(
        data=np.full(shape=(len(timestamps, )), fill_value=False),
        index=timestamps.index,
    )
    for start, end in to_keep:
    
        match (start, end):
            case None, None:
                pass
            case (datetime(), None):
                period_filter = timestamps >= start
            case (None, datetime()):
                period_filter = timestamps <= end
            case (datetime(), datetime()):
                period_filter = (timestamps >= start) & (timestamps <= end)
            case _:
                print(start, end)
                print("Unrecognised filter pattern. Skipping ...")
                continue
        
        filter_ |= period_filter
    
    return filter_


def keep_filter(
    data: pd.DataFrame,
    to_keep: list[tuple[datetime | None, datetime | None]]
):
    """
    Return only the rows of ``data`` whose ``timestamp`` lies inside one of the
    ``to_keep`` periods. Each period is a ``(start, end)`` tuple of datetimes
    with inclusive bounds, as understood by ``construct_keep_filter``.
    """
    inside_any_period = construct_keep_filter(data["timestamp"], to_keep)
    selected_rows = data.loc[inside_any_period]
    return selected_rows


def remove_filter(
    data: pd.DataFrame,
    to_remove: list[tuple[datetime | None, datetime | None]]
):
    """
    Return the rows of ``data`` whose ``timestamp`` lies outside every
    ``to_remove`` period; rows inside any period are discarded. Each period is
    a ``(start, end)`` tuple of datetimes with inclusive bounds, as understood
    by ``construct_keep_filter``.
    """
    inside_any_period = construct_keep_filter(data["timestamp"], to_remove)
    outside_all_periods = ~inside_any_period
    return data.loc[outside_all_periods]

In [None]:
def filter_readings_data(
    readings_df: pd.DataFrame,
    meter_id: int,
    to_keep_filters: dict[int, list[tuple[datetime | None, datetime | None]]] | None = None,
    to_remove_filters: dict[int, list[tuple[datetime | None, datetime | None]]] | None = None,
):
    """Apply per-building keep / remove periods to the readings of one meter.

    For every building in ``to_keep_filters`` only readings of ``meter_id``
    inside its periods survive; for every building in ``to_remove_filters``
    readings of ``meter_id`` inside its periods are discarded. A building that
    appears in both dicts has both filters applied to the same rows (a reading
    must be inside a keep period and outside every remove period), so no row is
    ever emitted twice. Readings of other meters and of buildings without
    filters are returned unchanged.

    Args:
        readings_df: Readings with ``building_id``, ``meter_id`` and
            ``timestamp`` columns. Not modified.
        meter_id: Meter the filters refer to.
        to_keep_filters: Building id -> list of ``(start, end)`` periods to keep.
        to_remove_filters: Building id -> list of ``(start, end)`` periods to remove.

    Returns:
        The surviving rows, in their original order and with their original
        index labels. With no filters at all the input rows are returned as is.
    """
    to_keep_filters = to_keep_filters or {}
    to_remove_filters = to_remove_filters or {}

    is_meter = (readings_df["meter_id"] == meter_id).to_numpy()
    building_ids = readings_df["building_id"]
    timestamps = readings_df["timestamp"]

    # Positional mask of surviving rows; positions (not index labels) are used
    # so that repeated or unordered index labels cannot cause mix-ups.
    survives = np.ones(len(readings_df), dtype=bool)

    # To keep filters
    for b_id, b_filter in tqdm(to_keep_filters.items()):
        rows = is_meter & (building_ids == b_id).to_numpy()
        inside = construct_keep_filter(timestamps[rows], b_filter).to_numpy()
        survives[rows] &= inside

    # To remove filters
    for b_id, b_filter in tqdm(to_remove_filters.items()):
        rows = is_meter & (building_ids == b_id).to_numpy()
        inside = construct_keep_filter(timestamps[rows], b_filter).to_numpy()
        survives[rows] &= ~inside

    return readings_df.loc[survives]

#### Electricity data

In [None]:
# Periods to KEEP for electricity readings (meter 0), keyed by building id.
# Each value is a list of (start, end) datetimes; None leaves that side open.
to_keep_electricity = {
    
    # Keep everything after May 21
    # (buildings 0-104, except 29, 45, 46 and 53 which get their own entries below)
    i: [(datetime(2016, 5, 21), None)]
    for i in (
        list(range(29)) 
        + list(range(30, 45)) 
        + list(range(47, 53)) 
        + list(range(54, 105))
    )
} | {
    # Some building specific filters
    29: [(datetime(2016, 8, 10), None)],
    45: [(datetime(2016, 7, 1), None)],
    53: [(datetime(2016, 12, 15), None)],
    106: [(datetime(2016, 11, 1), None)],
    180: [(None, datetime(2016, 2, 17, 10))],
    218: [(None, datetime(2016, 2, 17, 10))],
    604: [(datetime(2016, 12, 1), None)],
    740: [(datetime(2016, 12, 31), None)],
    803: [(None, datetime(2016, 9, 24))],
    857: [(None, datetime(2016, 4, 13))],
    1113: [(datetime(2016, 7, 27, 10), None)],
    1153: [(datetime(2016, 1, 20, 14), None)],
    1264: [(None, datetime(2016, 8, 23))],
    1345: [(None, datetime(2016, 2, 11))],
    46: [(None, datetime(2016, 3, 1)), (datetime(2016, 5, 21), None)],
}

# Periods to REMOVE for electricity readings: constant-reading streaks detected
# per building with the default streak_length of find_constant_streaks.
to_remove_electricity = {
    b_id: find_constant_streaks(readings_df_train, meter_id=0, building_id=b_id)
    for b_id in tqdm(
        [105]
        + list(range(107, 128))
        + list(range(136, 156))
        + [177]
        + list(range(245, 255))
        + [269, 278, 376, 537, 545, 577, 681, 693, 723, 733, 738, 799, 802, 874 ]
        + list(range(875, 885))
        + [886, 897]
        + list(range(905, 946))
        + list(range(954, 997))
        + [1066, 1079, 1096, 1098, 1128, 1154, 1157, 1160, 1169, 1177, 1185, 1202 ]
        + [1221, 1225, 1226]
        + list(range(1228, 1281))
        + list(range(1282, 1314))
        + list(range(1315, 1325))
        + [1359]
    )
}

In [None]:
# Apply the electricity (meter 0) keep / remove filters to the training readings.
# Note: this rebinds readings_df_train, so re-running the cell filters again.
readings_df_train = filter_readings_data(
    readings_df_train,
    meter_id=0,
    to_keep_filters=to_keep_electricity,
    to_remove_filters=to_remove_electricity,
)

#### Chilled Water

In [None]:
# Periods to KEEP for chilled-water readings (meter 1), keyed by building id.
# Each value is a list of (start, end) datetimes; None leaves that side open.
to_keep_chilled_water = {
    43: [(None, datetime(2016, 4, 4, 19)), (datetime(2016, 6, 6, 11), None)],
    60: [(datetime(2016, 4, 29, 10, 0), None)],
    162: [(None, datetime(2016, 9, 13, 14)), (datetime(2016, 10, 10, 8), None)],
    192: [(None, datetime(2016, 5, 9, 13))],
    195: [(None, datetime(2016, 3, 17, 12)), (datetime(2016, 3, 22), None)],
    236: [(None, datetime(2016, 1, 24, 2)), (datetime(2016, 3, 21, 12), None)],
    258: [(None, datetime(2016, 8, 29, 10)), (datetime(2016, 9, 8, 13), datetime(2016, 9, 19, 6)), (datetime(2016, 12, 12, 9,), None)],
    264: [(datetime(2016, 2, 8, 10), None)],
    290: [(None, datetime(2016, 8, 19, 1)), (datetime(2016, 9, 9, 7), datetime(2016, 9, 14, 17)), (datetime(2016, 9, 23, 1), datetime(2016, 10, 8, 14)), (datetime(2016, 10, 14, 9), None)],
    765: [(datetime(2016, 4, 22, 12), None)],
    778: [(datetime(2016, 9, 8, 9), datetime(2016, 10, 20))],
    780: [(None, datetime(2016, 8, 2))],
    
    # Filters for same / similar October pattern
    770: [(None, datetime(2016, 10, 4, 10, 0)), (datetime(2016, 10, 10, 11, 0), None)],
    777: [(None, datetime(2016, 10, 4, 9)), (datetime(2016, 10, 10, 8), None)],
    787: [(None, datetime(2016, 10, 4, 9)), (datetime(2016, 10, 10, 8), None)],

    # Filters for same July and October pattern
    880: [(None, datetime(2016, 3, 18, 23)), (datetime(2016, 5, 18, 10), datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 6), None)],
    954: [(datetime(2016, 8, 8, 11), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 8), None)],
    990: [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 8), None)],
    
    1167: [(None, datetime(2016, 5, 18, 16)), (datetime(2016, 6, 25, 7), None)],
    1225: [(None, datetime(2016, 8, 23, 12)), (datetime(2016, 10, 11, 13), None)],
    1226: [(None, datetime(2016, 8, 23, 12)), (datetime(2016, 10, 20, 12), None)],
    1232: [(None, datetime(2016, 6, 23, 18)), (datetime(2016, 8, 31, 19), None)],
    1244: [(None, datetime(2016, 7, 13, 16)), (datetime(2016, 8, 31, 19), None)],
    1246: [(datetime(2016, 3, 2, 19), None)],
    1272: [(None, datetime(2016, 9, 28, 9)), (datetime(2016, 10, 20, 12), None)],
    1273: [(None, datetime(2016, 5, 31, 16)), (datetime(2016, 6, 16, 23), None)],
} | {
    # Filters for same July and October pattern
    building_id: [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 6), None)]
    for building_id in (
        [910, 920, 923, 926, 927, 929, 931, 934, 955, 957]
        + [961, 963, 965, 967, 969, 973, 976, 989]
        + [993, 994, 996]
    )
}

# Periods to REMOVE for chilled-water readings: constant-reading streaks of at
# least 35 rows, detected per building. Several ids (e.g. 929, 957, 1225, 1226)
# also appear in to_keep_chilled_water above.
to_remove_chilled_water = {
    b_id: find_constant_streaks(readings_df_train, meter_id=1, building_id=b_id, streak_length=35)
    for b_id in tqdm(
        [7, 75, 97, 98, 163, 167, 171, 172, 177, 188, 190, 191, 195, 200]
        + [207, 231, 233, 235, 260, 265, 267, 748, 750, 752, 755, 763, 776 ]
        + [786, 789, 790, 792, 801]      

        # Filters for same July and October pattern
        + [874, 890, 893, 894, 895, 896, 898, 899, 911, 915, 916, 917, 918 ]
        + [929, 932, 933, 935, 942, 951, 952, 953, 957, 958, 959, 960, 961 ]
        + [962, 964, 965, 966, 968, 971, 972, 974, 975, 978, 979, 980, 981 ]
        + [983, 987, 991, 992, 994, 995, 997 ]

        # Filters for same mid July pattern. All ids contain that period in addition to
        # other constant streak periods
        + [1223, 1225, 1226, 1227, 1229, 1230, 1233, 1234, 1235, 1236, 1238 ]
        + [1239, 1240, 1241, 1242, 1243, 1246, 1247, 1248, 1249, 1250, 1251 ]
        + [1252, 1253, 1255, 1258, 1259, 1260, 1262, 1263, 1264, 1266, 1267 ]
        + [1280, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294 ]
        + [1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1306, 1307 ]
        + [1308, 1309, 1310, 1311, 1312]
    )
}

In [None]:
# Apply the chilled-water (meter 1) keep / remove filters to the training readings.
# Note: this rebinds readings_df_train, so re-running the cell filters again.
readings_df_train = filter_readings_data(
    readings_df_train,
    meter_id=1,
    to_keep_filters=to_keep_chilled_water,
    to_remove_filters=to_remove_chilled_water,
)

#### Steam

In [None]:
# Periods to KEEP for steam readings (meter 2), keyed by building id.
# Each value is a list of (start, end) datetimes; None leaves that side open.
to_keep_steam = {
    751: [(datetime(2016, 2, 3, 7), None)],
    758: [(None, datetime(2016, 3, 7, 14))],
    759: [(None, datetime(2016, 1, 14, 23)), (datetime(2016, 2, 2, 8), None)],
    762: [(None, datetime(2016, 2, 25))],
    766: [(datetime(2016, 2, 3, 7), None)],
    772: [(datetime(2016, 2, 25, 7), datetime(2016, 3, 16, 7)), (datetime(2016, 12, 16, 23), None)],
    783: [(datetime(2016, 12, 9, 14), None)]
} | {
    # Same July / October pattern
    b_id: [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 6), None)]
    for b_id in (
        [880, 889, 890, 894, 901, 905, 906, 910, 911, 913, 914 ]
        + [917, 921, 924, 928, 933, 951, 953, 955, 964, 968, 971 ]
        + [973, 976, 979, 981, 987, 992, 995, 997]
    )
} | {
    # Same July / October pattern but slightly different dates
    b_id : [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 12), None)]
    for b_id in (
        [885, 886, 887, 888, 895, 896, 893, 898, 899, 900]
        + [903, 907, 908, 912, 915, 916, 918, 920, 922, 926 ]
        + [929, 931, 932, 934, 946, 948, 949, 952, 956, 957 ]
        + [958, 959, 960, 961, 963, 965, 966, 967, 969, 972 ] 
        + [974, 978, 980, 989, 991, 996]
    )
} | {
    # Same early June pattern
    b_id: [(None, datetime(2016, 5, 31, 16)), (datetime(2016, 6, 2, 18), None)]
    for b_id in [1358, 1385, 1386, 1387, 1425, 1427]
}| {
    # Building specific filters
    927: [(None, datetime(2016, 5, 26)), (datetime(2016, 6, 1, 4), datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 6), None)],
    945: [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 6), datetime(2016, 12, 2)), (datetime(2016, 12, 5, 12), None)],
    954: [(datetime(2016, 8, 8, 10), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 12), None)],
    983: [(None, datetime(2016, 5, 10)), (datetime(2016, 5, 17, 12), datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 12), None)],
    993: [(None, datetime(2016, 7, 1, 16)), (datetime(2016, 7, 5, 4), datetime(2016, 10, 15)), (datetime(2016, 10, 17, 12), datetime(2016, 12, 28))],   
    1072: [(datetime(2016, 7, 25, 12), None)],
    1176: [(datetime(2016, 2, 12, 10), None)],
    1189: [(datetime(2016, 2, 12, 10), None)],
    1250: [(datetime(2016, 12, 21, 15), None)],
    1392: [(None, datetime(2016, 5, 31, 16)), (datetime(2016, 6, 2, 18), datetime(2016, 7, 5, )), (datetime(2016, 7, 9, ), None)],
    1426: [(None, datetime(2016, 5, 31, 16)), (datetime(2016, 6, 2, 18), datetime(2016, 7, 7, 13)), (datetime(2016, 8, 1, 12), None)],
} 

# Periods to REMOVE for steam readings: constant-reading streaks of at least
# 35 rows, detected per building. Several ids (e.g. 886, 889, 1176, 1189) also
# appear in to_keep_steam above.
to_remove_steam = {
    b_id: find_constant_streaks(readings_df_train, meter_id=2, building_id=b_id, streak_length=35)
    for b_id in tqdm(
        [750, 776, 784, 876, 886, 888, 889, 890, 894, 895, 907, 910, 912 ]
        + [916, 917, 921, 925, 928, 929, 932, 933, 942, 951, 955, 961, 962 ]
        + [965, 972, 973, 976, 978, 996, 1088, 1098, 1111, 1129, 1140 ]
        + [1158, 1174, 1176, 1189]

        # All contain same mid July pattern
        + [1225, 1226, 1238, 1239, 1241, 1243, 1245, 1247, 1248, 1249, 1254 ]
        + [1256, 1258, 1263, 1283, 1284, 1285, 1286, 1287, 1289, 1290, 1291 ]
        + [1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1301, 1303, 1305 ]
        + [1307, 1308, 1309, 1310 ]

        # Same early June pattern
        + [1329, 1336, 1337, 1338, 1339, 1341, 1342, 1343, 1344, 1345, 1346 ]
        + [1347, 1350, 1351, 1354, 1355, 1360, 1361, 1363, 1364, 1366, 1367 ]
        + [1373, 1375, 1377, 1378, 1379, 1381, 1382, 1383, 1384, 1391, 1396 ]
        + [1405, 1406, 1409, 1414, 1417, 1418, 1420, 1424, 1430, 1431, 1433 ]
        + [1434, 1437, 1438]
    )
}

In [None]:
# Apply the steam (meter 2) keep / remove filters to the training readings.
# Note: this rebinds readings_df_train, so re-running the cell filters again.
readings_df_train = filter_readings_data(
    readings_df_train,
    meter_id=2,
    to_keep_filters=to_keep_steam,
    to_remove_filters=to_remove_steam,
)

#### Hot Water

In [None]:
# Periods to KEEP for hot-water readings (meter 3), keyed by building id.
# Each value is a list of (start, end) datetimes; None leaves that side open.
to_keep_hot_water = {
    163: [(None, datetime(2016, 1, 28, 19))],
    176: [(None, datetime(2016, 2, 10, 12)), (datetime(2016, 5, 5, 18), None)],
    195: [(None, datetime(2016, 2, 9, 15)), (datetime(2016, 12, 7, 13), None)],
    200: [(None, datetime(2016, 3, 19, 17))],
    220: [(None, datetime(2016, 1, 6, 22)), (datetime(2016, 1, 14), None)],
    226: [(None, datetime(2016, 9, 30, 9)), (datetime(2016, 10, 25, 6), None)],
    236: [(None, datetime(2016, 11, 20))],
    279: [(datetime(2016, 12, 31), None)],
    287: [(datetime(2016, 12, 31), None)],

}

# Periods to REMOVE for hot-water readings: constant-reading streaks of at
# least 35 rows, detected per building.
# NOTE(review): 1321 is listed twice in the last row of ids (harmless for a
# dict, but possibly a typo for another id) - confirm.
to_remove_hot_water = {
    b_id: find_constant_streaks(readings_df_train, meter_id=3, building_id=b_id, streak_length=35)
    for b_id in tqdm(
        [113, 117, 119, 121, 138, 175, 192, 203, 284, 1223, 1224, 1227 ]
        + [1228, 1229, 1230, 1231, 1233, 1234, 1235, 1236, 1240, 1242, 1244 ]
        + [1246, 1251, 1252, 1253, 1255, 1259, 1262, 1265, 1266, 1267, 1269 ]
        + [1270, 1271, 1273, 1274, 1294, 1295, 1296, 1297, 1298, 1300, 1301 ]
        + [1311, 1312, 1317, 1318, 1319, 1321, 1321, 1322, 1323]
    )
}

In [None]:
# Apply the hot-water (meter 3) keep / remove filters to the training readings.
# Note: this rebinds readings_df_train, so re-running the cell filters again.
readings_df_train = filter_readings_data(
    readings_df_train,
    meter_id=3,
    to_keep_filters=to_keep_hot_water,
    to_remove_filters=to_remove_hot_water,
)

### Weather data

In [None]:
# Reindex weather data such that every site has a measurement
# for each training timestamp


def reindex_weather_data(
    weather_df: pd.DataFrame,
    start_timestamp: pd.Timestamp,
    end_timestamp: pd.Timestamp,
    freq: str = "1h"
) -> pd.DataFrame:
    """Give every site one row per timestamp between the two bounds.

    For each ``site_id`` the rows are reindexed onto the full range
    ``start_timestamp`` .. ``end_timestamp`` (both inclusive, step ``freq``).
    Timestamps a site has no observation for get a row whose weather columns
    are NaN and whose ``site_id`` is filled in; observations outside the range
    are dropped by the reindex.

    Note: the ``timestamp`` column of the passed frame is converted to
    datetimes in place.

    Returns:
        A new frame with the per-site results concatenated under a fresh index.
    """
    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])

    full_range = pd.DatetimeIndex(
        pd.date_range(start_timestamp, end_timestamp, freq=freq, inclusive="both"),
        name="timestamp",
    )

    def _expand_site(site_id, site_rows: pd.DataFrame) -> pd.DataFrame:
        # Rows created by the reindex have no site id yet, so restore it.
        expanded = site_rows.set_index("timestamp").reindex(full_range).reset_index()
        expanded["site_id"] = expanded["site_id"].fillna(value=site_id)
        return expanded

    expanded_sites = [
        _expand_site(site_id, site_rows)
        for site_id, site_rows in weather_df.groupby("site_id", observed=True)
    ]
    return pd.concat(expanded_sites, ignore_index=True)

In [None]:
# Expand the training weather so every site has one row per hourly timestamp
# between MIN_TRAIN_TIMESTAMP and MAX_TRAIN_TIMESTAMP (missing hours become NaN rows).
weather_df_train = reindex_weather_data(
    weather_df=weather_df_train,
    start_timestamp=MIN_TRAIN_TIMESTAMP,
    end_timestamp=MAX_TRAIN_TIMESTAMP,
)

In [None]:
# Functionality for imputing missing weather data

def interpolate(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill short gaps in ``column`` by interpolating within each site.

    Gaps are filled by linear interpolation (at most 12 consecutive values),
    followed by a forward fill and a backward fill of at most 2 values each.

    When the frame has ``site_id`` and ``timestamp`` columns, the fill is done
    separately for every site on its rows ordered by timestamp, so values never
    leak from one site into another and the physical row order of the frame
    does not matter. Otherwise the column is filled in its current row order.
    Rows keep their original position; only ``column`` is changed (in place).

    Assumes ``site_id`` has no missing values when present.

    Returns:
        The same DataFrame object with ``column`` updated.
    """
    def _fill_gaps(series: pd.Series) -> pd.Series:
        series = series.interpolate("linear", limit=12)
        series = series.ffill(limit=2)
        return series.bfill(limit=2)

    if "site_id" in weather_df.columns and "timestamp" in weather_df.columns:
        # Work on positions (fresh RangeIndex) so the result can be written
        # back row-for-row even if the frame's own index is not unique.
        positional = weather_df[["site_id", "timestamp", column]].reset_index(drop=True)
        ordered = positional.sort_values(["site_id", "timestamp"], kind="stable")
        filled = ordered.groupby("site_id", observed=True)[column].transform(_fill_gaps)
        weather_df[column] = filled.sort_index().to_numpy()
    else:
        weather_df[column] = _fill_gaps(weather_df[column])
    return weather_df


def _mean_weather_by_date_and_site(weather_df, column) -> pd.DataFrame:
    mean_values = (
        weather_df
        .groupby(["date", "site_id"], observed=True)[[column]]
        .mean()
        .reset_index()
    )
    return mean_values


def _merge_onto_weather_df(weather_df, right, right_suffix) -> pd.DataFrame:
    weather_df = weather_df.merge(
        right=right,
        how="left",
        on=["date", "site_id"],
        suffixes=("", right_suffix)
    )
    return weather_df


def impute_with_same_day_mean(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill missing values of ``column`` with the same-day mean of the same site.

    The per-(date, site) mean is computed from the available values, joined
    onto the frame and used wherever ``column`` is missing. Days without any
    value for a site stay missing.

    Returns:
        A new frame (result of a merge) without the temporary mean column.
    """
    mean_column = f"{column}_mean"

    # Same-day means per site, attached as a temporary "<column>_mean" column
    daily_means = _mean_weather_by_date_and_site(weather_df, column)
    merged = _merge_onto_weather_df(weather_df, daily_means, "_mean")

    # Only the gaps are replaced by the mean
    merged[column] = merged[column].fillna(merged[mean_column])
    return merged.drop(mean_column, axis=1)


def ffill_mean_by_date(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill remaining gaps in ``column`` with the nearest available daily mean.

    The per-(date, site) means are computed; for every site, days without a
    mean take the mean of the closest earlier day (forward fill), or of the
    closest later day when there is no earlier one (backward fill). The
    resulting daily values replace the values of ``column`` that are missing.

    Returns:
        A new frame (result of a merge) without the temporary mean column.
    """
    mean_column = f"{column}_mean"
    daily_means = _mean_weather_by_date_and_site(weather_df, column)

    def _fill_site(site_means: pd.DataFrame) -> pd.DataFrame:
        # Propagate daily means through time within a single site
        site_means = site_means.sort_values("date")
        site_means[column] = site_means[column].ffill().bfill()
        return site_means

    filled_means = pd.concat(
        [
            _fill_site(site_means)
            for _, site_means in daily_means.groupby("site_id", observed=True)
        ],
        ignore_index=True,
    )

    # Attach the filled daily means and use them where the column is missing
    merged = _merge_onto_weather_df(weather_df, filled_means, "_mean")
    merged[column] = merged[column].fillna(merged[mean_column])
    return merged.drop(mean_column, axis=1)


def fill_missing_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values in every column of ``WEATHER_FEATURE_COLUMNS``.

    Each column goes through three steps, in this order: ``interpolate``,
    ``impute_with_same_day_mean`` and ``ffill_mean_by_date``. A helper ``date``
    column is derived from ``timestamp`` for the daily steps (it is added to
    the passed frame) and is not part of the returned frame.
    """
    weather_df["date"] = weather_df["timestamp"].dt.date

    imputation_steps = (interpolate, impute_with_same_day_mean, ffill_mean_by_date)
    for column in WEATHER_FEATURE_COLUMNS:
        for impute in imputation_steps:
            weather_df = impute(weather_df, column)

    return weather_df.drop(columns=["date"])

In [None]:
# Weather data feature engineering
# Lagged / rolling features. Compute these before merging to make merges
# less memory intensive.

def add_smoothed_weather_feature(df: pd.DataFrame, feature: str) -> pd.DataFrame:
    """Add ``<feature>_smoothed``: a per-site Savitzky-Golay smoothing of ``feature``.

    For every site the values are ordered by timestamp, de-duplicated and
    filtered with ``savgol_filter`` (window length 12, polynomial order 2). The
    float32 result is joined back onto ``df`` on ``site_id`` and ``timestamp``.

    Returns:
        A new frame with the additional column.
    """
    smoothed_name = f"{feature}_smoothed"
    key_columns = ["site_id", "timestamp"]

    smoothed_sites = []
    for _, site_rows in df.groupby("site_id", observed=True):
        site_series = (
            site_rows[key_columns + [feature]]
            .sort_values("timestamp")
            .drop_duplicates(keep="first")
        )
        smoothed_values = savgol_filter(
            np.array(site_series[feature]),
            window_length=12,
            polyorder=2,
        )
        site_series[smoothed_name] = smoothed_values.astype(np.float32)
        smoothed_sites.append(site_series)

    smoothed_df = pd.concat(smoothed_sites, ignore_index=True).drop(columns=[feature])
    return df.merge(right=smoothed_df, on=key_columns, how="left")


def add_lagged_weather_feature(df: pd.DataFrame, feature: str, lag: int) -> pd.DataFrame:
    """Add ``<feature>_lag_<lag>``: the value of ``feature`` ``lag`` rows earlier.

    For every site the values are ordered by timestamp, de-duplicated and
    shifted by ``lag`` rows; the float32 result is joined back onto ``df`` on
    ``site_id`` and ``timestamp``. The first ``lag`` rows of each site are NaN.

    Returns:
        A new frame with the additional column.
    """
    lag_name = f"{feature}_lag_{lag}"
    key_columns = ["site_id", "timestamp"]

    lagged_sites = []
    for _, site_rows in df.groupby("site_id", observed=True):
        site_series = (
            site_rows[key_columns + [feature]]
            .sort_values("timestamp")
            .drop_duplicates(keep="first")
        )
        site_series[lag_name] = site_series[feature].shift(lag).astype(np.float32)
        lagged_sites.append(site_series)

    lagged_df = pd.concat(lagged_sites, ignore_index=True).drop(columns=[feature])
    return df.merge(right=lagged_df, on=key_columns, how="left")
    

def add_rolling_mean_weather_feature(df: pd.DataFrame, feature: str, window: int) -> pd.DataFrame:
    """Add ``<feature>_rolling_mean_<window>``: a trailing per-site rolling mean.

    For every site the values are ordered by timestamp, de-duplicated and
    averaged over the last ``window`` rows; the float32 result is joined back
    onto ``df`` on ``site_id`` and ``timestamp``. Rows with fewer than
    ``window`` preceding values (including themselves) are NaN.

    Returns:
        A new frame with the additional column.
    """
    rolling_name = f"{feature}_rolling_mean_{window}"
    key_columns = ["site_id", "timestamp"]

    rolling_sites = []
    for _, site_rows in df.groupby("site_id", observed=True):
        site_series = (
            site_rows[key_columns + [feature]]
            .sort_values("timestamp")
            .drop_duplicates(keep="first")
        )
        rolling_mean = site_series[feature].rolling(window).mean()
        site_series[rolling_name] = rolling_mean.astype(np.float32)
        rolling_sites.append(site_series)

    rolling_df = pd.concat(rolling_sites, ignore_index=True).drop(columns=[feature])
    return df.merge(right=rolling_df, on=key_columns, how="left")

In [None]:
# Fill missing weather data (all columns in WEATHER_FEATURE_COLUMNS) for the
# reindexed training weather frame.
weather_df_train = fill_missing_weather_data(weather_df_train)

In [None]:
# Air temperature: one lagged column per entry of WEATHER_LAGS and one
# rolling-mean column per entry of WEATHER_ROLLING_WINDOWS.
feature_name = "air_temperature"
for lag in WEATHER_LAGS:
    weather_df_train = add_lagged_weather_feature(weather_df_train, feature_name, lag)

for window in WEATHER_ROLLING_WINDOWS:
    weather_df_train = add_rolling_mean_weather_feature(weather_df_train, feature_name, window)

In [None]:
# Dew temperature: one lagged column per entry of WEATHER_LAGS and one
# rolling-mean column per entry of WEATHER_ROLLING_WINDOWS.
feature_name = "dew_temperature"
for lag in WEATHER_LAGS:
    weather_df_train = add_lagged_weather_feature(weather_df_train, feature_name, lag)

for window in WEATHER_ROLLING_WINDOWS:
    weather_df_train = add_rolling_mean_weather_feature(weather_df_train, feature_name, window)

In [None]:
# Sea level pressure: one lagged column per entry of WEATHER_LAGS and one
# rolling-mean column per entry of WEATHER_ROLLING_WINDOWS.
feature_name = "sea_level_pressure"
for lag in WEATHER_LAGS:
    weather_df_train = add_lagged_weather_feature(weather_df_train, feature_name, lag)

for window in WEATHER_ROLLING_WINDOWS:
    weather_df_train = add_rolling_mean_weather_feature(weather_df_train, feature_name, window)

## Merge

In [None]:
def merge_dfs(readings_df: pd.DataFrame, buildings_df: pd.DataFrame, weather_df: pd.DataFrame) -> pd.DataFrame:
    """Attach building metadata and weather observations to the readings.

    Both joins are left joins, so every row of ``readings_df`` is kept:
    buildings are matched on ``building_id``, weather on ``site_id`` and
    ``timestamp`` (``site_id`` comes from the building metadata).
    """
    with_buildings = readings_df.merge(buildings_df, how="left", on="building_id")
    with_weather = with_buildings.merge(weather_df, how="left", on=["site_id", "timestamp"])
    return with_weather

In [None]:
# Combine filtered readings, building metadata and engineered weather features
# into the training frame.
train_df = merge_dfs(readings_df_train, buildings_df, weather_df_train)

## Feature engineering

In [None]:
def kbtu_to_kwh(df: pd.DataFrame) -> pd.DataFrame:
    """Scale the meter-0 readings of building 0 by 0.2931 (kBTU -> kWh factor).

    Only rows with ``building_id == 0`` and ``meter_id == 0`` are touched; the
    frame is modified in place and returned.

    NOTE(review): the conversion is limited to building 0 - confirm whether
    other buildings report the same unit and need converting as well.
    """
    is_building_zero = df["building_id"] == 0
    is_meter_zero = df["meter_id"] == 0
    rows_to_convert = is_building_zero & is_meter_zero
    df.loc[rows_to_convert, "meter_reading"] *= 0.2931
    return df


def add_periodic_features(df: pd.DataFrame, feature: str, period: int) -> pd.DataFrame:
    """Add sine / cosine encodings of a cyclic ``feature``.

    The value is mapped onto a circle of circumference ``period`` and stored as
    float32 columns ``<feature>_sin`` and ``<feature>_cos``. The frame is
    modified in place and returned.
    """
    angle = 2 * np.pi * df[feature] / period
    encodings = {"sin": np.sin, "cos": np.cos}
    for suffix, function in encodings.items():
        df[f"{feature}_{suffix}"] = function(angle).astype(np.float32)
    return df


def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar features from the ``timestamp`` column.

    Adds ``hour``, ``day_of_week`` and ``month`` (uint8), each followed by its
    sine / cosine encoding from ``add_periodic_features`` (periods 24, 7 and
    12), and finally ``is_weekend`` (1 when the weekday number is 5 or 6).
    """
    calendar = df["timestamp"].dt
    cyclic_features = (
        ("hour", calendar.hour, 24),
        ("day_of_week", calendar.weekday, 7),
        ("month", calendar.month, 12),
    )
    for name, values, period in cyclic_features:
        df[name] = values.astype(np.uint8)
        df = add_periodic_features(df, name, period)

    df["is_weekend"] = (calendar.weekday >= 5).astype(np.uint8)
    return df


def add_building_age_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_age_years``: year of the reading minus ``year_built``.

    The frame is modified in place and returned; rows without ``year_built``
    get a missing age.
    """
    reading_year = df["timestamp"].dt.year
    df["building_age_years"] = reading_year - df["year_built"]
    return df


def add_building_area_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_area_square_feet``: ``square_feet`` times ``floor_count``.

    The frame is modified in place and returned; rows without ``floor_count``
    get a missing value.
    """
    footprint = df["square_feet"]
    floors = df["floor_count"]
    df["building_area_square_feet"] = footprint * floors
    return df


def saturation_vapour_pressure(temperature: pd.Series) -> pd.Series:
    """Evaluate ``6.1094 * exp(17.625 * T / (T + 243.04))`` element-wise.

    NOTE(review): the constants look like a Magnus-type approximation with T in
    degrees Celsius and the result in hPa - confirm the intended units.
    """
    exponent = 17.625 * temperature / (temperature + 243.04)
    return 6.1094 * np.exp(exponent)


def add_relative_humidity_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``relative_humidity`` (float32) from air and dew temperature.

    Computed as 100 times the ratio of ``saturation_vapour_pressure`` at the
    dew temperature to that at the air temperature. The frame is modified in
    place and returned.
    """
    pressure_at_dew_point = saturation_vapour_pressure(df["dew_temperature"])
    pressure_at_air_temp = saturation_vapour_pressure(df["air_temperature"])
    relative_humidity = 100 * pressure_at_dew_point / pressure_at_air_temp
    df["relative_humidity"] = relative_humidity.astype(np.float32)
    return df


def add_cold_chill_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``cold_chill`` (float32) for cold and windy observations.

    The value is only computed where ``air_temperature`` is at most 10.0 and
    ``wind_speed`` is at least 1.3; all other rows are left missing. The wind
    speed is multiplied by 3.6 before entering the formula. The frame is
    modified in place and returned.
    """
    # Cold chill only defined for temps of at most 10C and wind speeds of at least 1.3 m/s
    is_cold_and_windy = (df["air_temperature"] <= 10.0) & (df["wind_speed"] >= 1.3)
    temperature = df.loc[is_cold_and_windy, "air_temperature"]
    wind_term = (3.6 * df.loc[is_cold_and_windy, "wind_speed"]) ** 0.16

    chill = (
        13.12
        + 0.6215 * temperature
        - 11.37 * wind_term
        + 0.3965 * temperature * wind_term
    )
    df.loc[is_cold_and_windy, "cold_chill"] = chill.astype(np.float32)
    return df


def add_apparent_temperature_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``apparent_temperature`` (float32) for mild observations.

    Computed only where ``air_temperature`` lies between 10 and 27 (both
    inclusive) from air temperature, wind speed and ``relative_humidity``
    (which must already be present, in percent); other rows are left missing.
    The frame is modified in place and returned.
    """
    is_mild = df["air_temperature"].between(10, 27, inclusive="both")
    temperature = df.loc[is_mild, "air_temperature"]
    wind = df.loc[is_mild, "wind_speed"]
    humidity_fraction = df.loc[is_mild, "relative_humidity"] / 100

    vapour_pressure = humidity_fraction * 6.105 * np.exp((17.27 * temperature) / (temperature + 237.7))
    apparent = temperature + 0.33 * vapour_pressure - 0.7 * wind - 4
    df.loc[is_mild, "apparent_temperature"] = apparent.astype(np.float32)
    return df


def add_heat_index_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``heat_index`` (float32) for hot observations.

    Computed only where ``air_temperature`` is at least 27, as a polynomial in
    air temperature and ``relative_humidity`` (which must already be present,
    in percent); other rows are left missing. The frame is modified in place
    and returned.
    """
    is_hot = df["air_temperature"] >= 27
    temperature = df.loc[is_hot, "air_temperature"]
    rel_humidity = df.loc[is_hot, "relative_humidity"]

    # Same nine-term polynomial, written with one term per line
    index_value = (
        - 8.7847
        + 1.6114 * temperature
        + 2.3385 * rel_humidity
        - 0.1461 * temperature * rel_humidity
        - 0.0123 * temperature ** 2
        - 0.0164 * rel_humidity ** 2
        + 2.212e-03 * temperature ** 2 * rel_humidity
        + 7.255e-04 * temperature * rel_humidity ** 2
        - 3.582e-06 * temperature ** 2 * rel_humidity ** 2
    )
    df.loc[is_hot, "heat_index"] = index_value.astype(np.float32)
    return df


def cooling_degree_days(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for a cooling-degree-days feature.

    TODO: not implemented yet - the body is empty, so calling this currently
    returns ``None`` rather than a DataFrame.
    """
    # https://www.investopedia.com/terms/c/colddegreeday.asp
    ...


def heating_degree_days():
    """Placeholder for a heating-degree-days feature.

    TODO: not implemented yet - takes no arguments and returns ``None``.
    """
    ...

In [None]:
# Timestamp features
train_df = add_temporal_features(train_df)

# Meter reading features (unit conversion of the building 0 / meter 0 readings)
train_df = kbtu_to_kwh(train_df)

# Building features
train_df = add_building_age_feature(train_df)
train_df = add_building_area_feature(train_df)

# Weather features. Relative humidity must come first: the apparent
# temperature and heat index features read the relative_humidity column.
train_df = add_relative_humidity_feature(train_df)
train_df = add_cold_chill_feature(train_df)
train_df = add_apparent_temperature_feature(train_df)
train_df = add_heat_index_feature(train_df)
train_df = add_periodic_features(train_df, "wind_direction", 360)

In [None]:
# Persist the engineered training frame (written to the current working directory).
train_df.to_parquet("train_df.parquet")

## Test data

In [None]:
# Meter readings: the test file has a row_id and no meter_reading column;
# cast_readings_data skips the missing target column.
readings_df_test = pd.read_csv(
    f"{INPUT_DATA_PATH}/test.csv",
    header=0,
    names=["row_id", "building_id", "meter_id", "timestamp"],
)
readings_df_test = cast_readings_data(readings_df_test)
readings_df_test["row_id"] = readings_df_test["row_id"].astype(np.uint32)

# Weather observations for the test period
weather_df_test = pd.read_csv(f"{INPUT_DATA_PATH}/weather_test.csv")
weather_df_test = cast_weather_data(weather_df_test)

In [None]:
# Reindex weather data so every site has one row per hourly test timestamp
weather_df_test = reindex_weather_data(
    weather_df=weather_df_test,
    start_timestamp=MIN_TEST_TIMESTAMP,
    end_timestamp=MAX_TEST_TIMESTAMP,
)

# Concat train and test weather data: the last two days of the (already
# imputed) training weather are appended below the test rows - presumably so
# that lagged / rolling features at the start of the test period have history.
# Only the raw weather columns are taken, not the engineered train features.
train_timestamp_cutoff = MIN_TEST_TIMESTAMP - pd.Timedelta("2d")
timestamp_mask = weather_df_train["timestamp"] >= train_timestamp_cutoff
weather_cols = ["timestamp", "site_id"] + WEATHER_FEATURE_COLUMNS
weather_df_test = pd.concat(
    [
        weather_df_test,
        weather_df_train[timestamp_mask][weather_cols]
        
    ],
    axis=0,
    ignore_index=True
)

# Fill missing weather data
weather_df_test = fill_missing_weather_data(weather_df_test)

In [None]:
# Compute lagged / rolling weather features (same features as for the
# training weather: one column per WEATHER_LAGS / WEATHER_ROLLING_WINDOWS entry)

# Air temperature
feature_name = "air_temperature"
for lag in WEATHER_LAGS:
    weather_df_test = add_lagged_weather_feature(weather_df_test, feature_name, lag)
for window in WEATHER_ROLLING_WINDOWS:
    weather_df_test = add_rolling_mean_weather_feature(weather_df_test, feature_name, window)

# Dew temperature
feature_name = "dew_temperature"
for lag in WEATHER_LAGS:
    weather_df_test = add_lagged_weather_feature(weather_df_test, feature_name, lag)
for window in WEATHER_ROLLING_WINDOWS:
    weather_df_test = add_rolling_mean_weather_feature(weather_df_test, feature_name, window)

# Sea level pressure
feature_name = "sea_level_pressure"
for lag in WEATHER_LAGS:
    weather_df_test = add_lagged_weather_feature(weather_df_test, feature_name, lag)
for window in WEATHER_ROLLING_WINDOWS:
    weather_df_test = add_rolling_mean_weather_feature(weather_df_test, feature_name, window)


# Restore chronological order per site (the train tail was appended at the bottom)
weather_df_test = weather_df_test.sort_values(["site_id", "timestamp"]).reset_index(drop=True)

In [None]:
# Merge all dfs: test readings, building metadata (loaded in the train section)
# and the engineered test weather features
test_df = merge_dfs(readings_df_test, buildings_df, weather_df_test)

### Feature engineering

In [None]:
# Timestamp features
test_df = add_temporal_features(test_df)

# Building features
test_df = add_building_age_feature(test_df)
test_df = add_building_area_feature(test_df)

# Weather features. Relative humidity must come first: the apparent
# temperature and heat index features read the relative_humidity column.
test_df = add_relative_humidity_feature(test_df)
test_df = add_cold_chill_feature(test_df)
test_df = add_apparent_temperature_feature(test_df)
test_df = add_heat_index_feature(test_df)
test_df = add_periodic_features(test_df, "wind_direction", 360)

In [None]:
# Persist the engineered test frame (written to the current working directory).
test_df.to_parquet("test_df.parquet")