# Script for splitting iid train/val sets from the tuning set

In [1]:
import pandas as pd
from pathlib import Path
import os
from buildings_bench.data.buildings900K import Buildings900K

In [2]:
# Metadata directory: the tuning index file is read from here and every
# generated .npz split in this notebook is written back to it.
metadata_path = Path(
    "/projects/foundation/eulp/v1.1.0/BuildingsBench/metadata_dev"
)

In [None]:
# Weather channels requested from the dataset loader; the same list is reused
# further down when assembling the feature matrices.
g_weather_features = [
    'temperature',
    'humidity',
    'wind_speed',
    'wind_direction',
    'global_horizontal_radiation',
    'direct_normal_radiation',
    'diffuse_horizontal_radiation',
]

# Dataset root is taken from the BUILDINGS_BENCH environment variable;
# an unset variable yields an empty path.
dataset_path = Path(os.getenv('BUILDINGS_BENCH', ''))

# Loader options, collected in one place and forwarded as keyword arguments.
dataset_options = dict(
    index_file=metadata_path / "comcap_350k_tune.idx",
    context_len=0,
    pred_len=-1,
    weather=g_weather_features,
    use_com_buildings_chars=True,
    use_text_embedding=False,
    building_description=False,
    surrogate_mode=True,
)
dataset = Buildings900K(dataset_path, **dataset_options)

In [4]:
from collections import defaultdict
import numpy as np

# Sampling configuration. NUM_HOURS is the size of the hour range that indices
# are drawn from for every building (value carried over unchanged).
NUM_HOURS = 8734
NUM_TRAIN_HOURS = 100
NUM_VAL_HOURS = 10

# Seeded generator so that re-running this cell reproduces exactly the same
# train/val split (the files written later depend on these draws).
rng = np.random.default_rng(0)

# Per split: feature name -> list of per-building arrays, stacked at the end.
data = {
    "train_set": defaultdict(list),
    "val_set": defaultdict(list),
}

features = [
    "day_of_year", "day_of_week", "hour_of_day",
    "load", "building_char", "building_id", "dataset_id",
] + g_weather_features

# These entries are handled as one value per building and are repeated once
# per sampled hour so that they line up row-by-row with the other features.
per_building_features = ("building_id", "dataset_id")

for i in range(len(dataset)):
    if i % 1000 == 0:
        print(i)

    building_data = dataset[i]

    # One draw without replacement covers both splits, so the train and val
    # hours of a building never overlap.
    random_idx = rng.choice(NUM_HOURS, NUM_TRAIN_HOURS + NUM_VAL_HOURS, replace=False)
    train_idx, val_idx = random_idx[:NUM_TRAIN_HOURS], random_idx[NUM_TRAIN_HOURS:]

    for split_name, hour_idx in (("train_set", train_idx), ("val_set", val_idx)):
        for feature in features:
            if feature in per_building_features:
                rows = np.repeat(np.array([[building_data[feature]]]), len(hour_idx), axis=0)
            else:
                rows = building_data[feature][hour_idx]
            data[split_name][feature].append(rows)

# Collapse the per-building lists into one array per feature and report shapes.
for split_name in ["train_set", "val_set"]:
    for feature in features:
        data[split_name][feature] = np.vstack(data[split_name][feature])

    for feature in features:
        print(split_name, data[split_name][feature].shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 19)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
train_set (998900, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 19)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)
val_set (99890, 1)


In [5]:
# Column order of the model inputs: calendar features, then the weather
# channels, then the building characteristics.
X_feature = ["day_of_year", "day_of_week", "hour_of_day", *g_weather_features, "building_char"]

# Concatenate the selected training features side by side.
X_train = np.hstack(
    [data["train_set"][name] for name in X_feature]
)
X_train.shape

(998900, 29)

In [6]:
# Validation inputs use the same column order as the training matrix.
X_val = np.hstack(
    [data["val_set"][name] for name in X_feature]
)
X_val.shape

(99890, 29)

In [7]:
# Write the input matrices; each archive stores its array under the key "data".
for _file_name, _array in (
    ("comcap_tune_X_train.npz", X_train),
    ("comcap_tune_X_val.npz", X_val),
):
    np.savez(metadata_path / _file_name, data=_array)

In [8]:
# Targets are the sampled "load" values of each split.
Y_train, Y_val = data["train_set"]["load"], data["val_set"]["load"]

# Write the targets; each archive stores its array under the key "data".
for _file_name, _array in (
    ("comcap_tune_Y_train.npz", Y_train),
    ("comcap_tune_Y_val.npz", Y_val),
):
    np.savez(metadata_path / _file_name, data=_array)

In [9]:
# Identifier columns kept alongside each sample: building id first, dataset id second.
meta_columns = ["building_id", "dataset_id"]
meta_train = np.hstack([data["train_set"][name] for name in meta_columns])
meta_val = np.hstack([data["val_set"][name] for name in meta_columns])

# Write the identifier arrays; each archive stores its array under the key "data".
for _file_name, _array in (
    ("comcap_tune_meta_train.npz", meta_train),
    ("comcap_tune_meta_val.npz", meta_val),
):
    np.savez(metadata_path / _file_name, data=_array)

In [11]:
from sklearn.model_selection import train_test_split

# Keep a random 10% of the training rows. The three arrays are split in one
# call so their rows stay aligned; the result alternates kept/discarded parts,
# so every second element (starting at 0) is the 10% that is kept.
X_train_small, Y_train_small, meta_train_small = train_test_split(
    X_train,
    Y_train,
    meta_train,
    test_size=0.9,
    shuffle=True,
    random_state=0,
)[::2]
print(X_train_small.shape, Y_train_small.shape, meta_train_small.shape)

(99890, 29) (99890, 1) (99890, 2)


In [12]:
# Write the 10% training subset; each archive stores its array under the key "data".
for _file_name, _array in (
    ("comcap_tune_small_X_train.npz", X_train_small),
    ("comcap_tune_small_Y_train.npz", Y_train_small),
    ("comcap_tune_small_meta_train.npz", meta_train_small),
):
    np.savez(metadata_path / _file_name, data=_array)

In [10]:
# import numpy as np

# X = np.load(metadata_path / "comcap_X.npz")["X"]
# Y = np.load(metadata_path / "comcap_Y.npz")["Y"]
# meta = np.load(metadata_path / "comcap_meta.npz")["meta"]

In [12]:
# from shaphypetune import BoostSearch, BoostRFE, BoostRFA, BoostBoruta
# from sklearn.model_selection import train_test_split
# from lightgbm import *

# X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
#     X, Y, meta, test_size=50000, shuffle=True, random_state=0)

In [13]:
# print("X_train", X_train.shape)
# print("X_test", X_test.shape)
# print("y_train", y_train.shape)
# print("y_test", y_test.shape)
# print("meta_train", meta_train.shape)
# print("meta_test", meta_test.shape)

X_train (299425, 32)
X_test (50000, 32)
y_train (299425, 1)
y_test (50000, 1)
meta_train (299425, 2)
meta_test (50000, 2)


In [14]:
# X_test, X_val, y_test, y_val, meta_test, meta_val = train_test_split(
#     X_test, y_test, meta_test, test_size=40000, shuffle=True, random_state=0)
# print("X_test", X_test.shape)
# print("X_val", X_val.shape)
# print("y_test", y_test.shape)
# print("y_val", y_val.shape)
# print("meta_test", meta_test.shape)
# print("meta_val", meta_val.shape)

X_test (40000, 32)
X_val (10000, 32)
y_test (40000, 1)
y_val (10000, 1)
meta_test (40000, 2)
meta_val (10000, 2)


In [15]:
# np.savez(metadata_path / "comcap_X_train.npz", data=X_train)
# np.savez(metadata_path / "comcap_X_test.npz", data=X_test)
# np.savez(metadata_path / "comcap_X_val.npz", data=X_val)
# np.savez(metadata_path / "comcap_Y_train.npz", data=y_train)
# np.savez(metadata_path / "comcap_Y_test.npz", data=y_test)
# np.savez(metadata_path / "comcap_Y_val.npz", data=y_val)
# np.savez(metadata_path / "comcap_meta_train.npz", data=meta_train)
# np.savez(metadata_path / "comcap_meta_test.npz", data=meta_test)
# np.savez(metadata_path / "comcap_meta_val.npz", data=meta_val)

In [3]:
# import numpy as np

# X_train = np.load(metadata_path / "comcap_tune_X_train.npz")["data"]
# Y_train = np.load(metadata_path / "comcap_tune_Y_train.npz")["data"]
# X_val = np.load(metadata_path / "comcap_tune_X_val.npz")["data"]
# Y_val = np.load(metadata_path / "comcap_tune_Y_val.npz")["data"]

In [5]:
# X = np.vstack([X_train, X_val])
# Y = np.vstack([Y_train, Y_val])

In [10]:
# from sklearn.model_selection import train_test_split

# X_small, _, Y_small, _ = train_test_split(X, Y, test_size=0.99, random_state=0)

In [11]:
# X_small_train, X_small_val, Y_small_train, Y_small_val = train_test_split(X_small, Y_small, test_size=0.2, random_state=0)

In [12]:
# np.savez(metadata_path / "comcap_tune_X_small_train.npz", data=X_small_train)
# np.savez(metadata_path / "comcap_tune_Y_small_train.npz", data=Y_small_train)
# np.savez(metadata_path / "comcap_tune_X_small_val.npz", data=X_small_val)
# np.savez(metadata_path / "comcap_tune_Y_small_val.npz", data=Y_small_val)
