### Script for creating the withheld out-of-distribution (OOD) buildings, selected by specific building-attribute pairs.
### Also creates subsampled 200k, 100k and 10k subsets of the remaining buildings (i.e. without the withheld buildings).

In [3]:
import pandas as pd
from pathlib import Path
import os
from buildings_bench.data.buildings900K import Buildings900K
import random
import numpy as np
# Metadata directory: $BUILDINGS_BENCH/metadata_dev. If the environment
# variable is unset, the empty-string default turns this into the relative
# path "metadata_dev".
metadata_path = Path(os.environ.get('BUILDINGS_BENCH', '')) / 'metadata_dev'

In [4]:
# Attribute pairs that define the withheld buildings. Each entry is laid out
# as [attribute_1, value_1, attribute_2, value_2]. Attribute names are given
# without the "in." prefix; the cells that consume this list prepend it to
# obtain the metadata column names.
withheld_attributes = [
    [
        "weekend_opening_time", 10.75,
        "building_type", "RetailStripmall",
    ],
    [
        "hvac_system_type", "Water source heat pumps cooling tower with boiler",
        "sqft", 350000,
    ],
    [
        "number_of_stories", 1,
        "building_type", "SmallHotel",
    ],
    [
        "sqft", 150000,
        "weekday_opening_time", 5.25,
    ],
    [
        "weekday_opening_time", 9.25,
        "building_subtype", "strip_mall_restaurant20",
    ],
]

In [5]:
# Report how common each withheld attribute pair is in the full metadata table.
# For every pair two lines are printed:
#   <rows matching attribute 1> <rows matching attribute 2> <rows matching both>
#   <both / attribute 1>, <both / attribute 2>      (three decimals)
# NOTE: the `df` loaded here is reused by the filtering cell that follows.
# NOTE(review): the recorded output shows pandas echoing this read_csv line,
# presumably a DtypeWarning about mixed-type columns -- confirm, and consider
# passing explicit dtypes if any of the compared columns is affected.
df = pd.read_csv(metadata_path / "AttributeCaps" / "comstock_340k.csv")
for k1, v1, k2, v2 in withheld_attributes:
    # Column names in the table are the attribute names prefixed with "in.".
    k1 = "in." + k1
    k2 = "in." + k2
    mask1 = df[k1] == v1
    mask2 = df[k2] == v2
    # Count matches by summing the boolean masks; this avoids building three
    # filtered copies of the whole table per pair just to take their length.
    n1 = int(mask1.sum())
    n2 = int(mask2.sum())
    n = int((mask1 & mask2).sum())
    print(n1, n2, n)
    # A value with no matching rows would make the ratio raise
    # ZeroDivisionError; report NaN for that ratio instead.
    r1 = n / n1 if n1 else float("nan")
    r2 = n / n2 if n2 else float("nan")
    print(f"{r1:.3f}, {r2:.3f}")

  df = pd.read_csv(metadata_path / "AttributeCaps" / "comstock_340k.csv")


12021 70623 2629
0.219, 0.037
148 5220 55
0.372, 0.011
231055 4594 1072
0.005, 0.233
20561 4998 329
0.016, 0.066
14735 28145 1372
0.093, 0.049


In [6]:
# Remove every building that matches both attributes of any withheld pair.
# `df` is narrowed pair by pair, so each mask is computed on the rows that
# survived the previous pairs.
for attr_a, value_a, attr_b, value_b in withheld_attributes:
    matches_pair = (df[f"in.{attr_a}"] == value_a) & (df[f"in.{attr_b}"] == value_b)
    df = df.loc[~matches_pair]

In [7]:
# Notebook display of `df` as it stands at this point in the run.
df

Unnamed: 0,bldg_id,applicability,in.upgrade_name,in.tstat_clg_delta_f,in.tstat_clg_sp_f,in.tstat_htg_delta_f,in.tstat_htg_sp_f,in.aspect_ratio,in.building_subtype,in.county,...,out.electricity.total.energy_consumption_intensity,out.site_energy.total.energy_consumption,out.site_energy.total.energy_consumption_intensity,out.natural_gas.total.energy_consumption,out.natural_gas.total.energy_consumption_intensity,out.other_fuel.total.energy_consumption,out.other_fuel.total.energy_consumption_intensity,upgrade,weight,metadata_index
0,51065,True,Baseline,0.0,75.0,5.0,69.0,1.0,,G0101010,...,10.874603,2.003222e+05,11.446984,10014.271788,0.572244,0.0,0.0,0,7.041741,775
1,54245,True,Baseline,0.0,73.0,0.0,66.0,1.0,,G0101170,...,11.356190,1.987333e+05,11.356190,0.000000,0.000000,0.0,0.0,0,7.041741,826
2,57306,True,Baseline,0.0,74.0,0.0,68.0,1.0,,G0100830,...,19.786111,6.311944e+04,21.039815,3760.211869,1.253404,0.0,0.0,0,7.041741,868
3,59474,True,Baseline,10.0,70.0,7.0,68.0,2.0,strip_mall_restaurant10,G0100970,...,19.954667,1.936081e+06,25.814407,439375.480434,5.858340,0.0,0.0,0,1.947971,901
4,70885,True,Baseline,0.0,72.0,0.0,68.0,1.0,,G0101130,...,17.301481,1.682833e+05,22.437778,38510.234855,5.134698,0.0,0.0,0,3.030624,1070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336812,345431,True,Baseline,3.0,75.0,3.0,65.0,1.0,,G5600390,...,7.767143,2.217139e+05,12.669365,85768.377679,4.901050,0.0,0.0,0,7.041741,345595
336813,347436,True,Baseline,10.0,71.0,8.0,68.0,2.0,,G5600210,...,7.658222,5.636611e+05,15.030963,276411.674885,7.370978,0.0,0.0,0,11.467280,345596
336814,348460,True,Baseline,10.0,72.0,6.0,68.0,1.0,,G5600130,...,9.962000,6.436278e+05,17.163407,269985.433921,7.199612,0.0,0.0,0,3.030624,345597
336815,349399,True,Baseline,4.0,76.0,3.0,66.0,2.0,,G5600210,...,10.928148,1.342972e+05,17.906296,52326.375218,6.976850,0.0,0.0,0,7.041741,345599


In [8]:
# Persist the current `df` as the "remain" table. The DataFrame index is
# written as well, because to_csv's `index` option is left at its default.
df.to_csv(metadata_path.joinpath("AttributeCaps", "comstock_340k_remain.csv"))

In [9]:
# Draw a 200k-row random subset of `df` and save it.
# A fixed random_state makes the subset reproducible: without it, every
# execution of this cell would write a different 200k file.
# NOTE: `df` is rebound to the subset, so any later cell that samples from
# `df` draws from these 200k rows rather than from the full table.
df = df.sample(n=200000, random_state=0)
df.to_csv(metadata_path / "AttributeCaps" / "comstock_340k_remain_200k.csv")

In [10]:
# Draw a 100k-row random subset of the current `df` and save it.
# When the cells run in order, `df` is the 200k subset at this point, so the
# 100k subset is nested inside it.
# A fixed random_state makes the subset reproducible across re-runs.
df = df.sample(n=100000, random_state=0)
df.to_csv(metadata_path / "AttributeCaps" / "comstock_340k_remain_100k.csv")

In [11]:
# Draw a 10k-row random subset of the current `df` and save it.
# When the cells run in order, `df` is the 100k subset at this point, so the
# 10k subset is nested inside it.
# A fixed random_state makes the subset reproducible across re-runs.
df = df.sample(n=10000, random_state=0)
df.to_csv(metadata_path / "AttributeCaps" / "comstock_340k_remain_10k.csv")

In [12]:
# Rebuild the withheld set as the complement of the saved "remain" table:
# every row of the full metadata whose bldg_id does not appear there.
# Both tables are re-read from disk, so this cell does not depend on the
# in-memory `df`, which the sampling cells have already shrunk.
# Assumes bldg_id identifies a row uniquely -- TODO confirm.
df = pd.read_csv(metadata_path.joinpath("AttributeCaps", "comstock_340k.csv"))
df_remain = pd.read_csv(metadata_path.joinpath("AttributeCaps", "comstock_340k_remain.csv"))

df_withheld = df.loc[~df["bldg_id"].isin(df_remain["bldg_id"])]


  df_remain = pd.read_csv(metadata_path / "AttributeCaps" / "comstock_340k_remain.csv")
  df = pd.read_csv(metadata_path / "AttributeCaps" / "comstock_340k.csv")


In [13]:
# Notebook display of the withheld rows computed in the previous cell.
df_withheld

Unnamed: 0,bldg_id,applicability,in.upgrade_name,in.tstat_clg_delta_f,in.tstat_clg_sp_f,in.tstat_htg_delta_f,in.tstat_htg_sp_f,in.aspect_ratio,in.building_subtype,in.county,...,out.electricity.total.energy_consumption_intensity,out.site_energy.total.energy_consumption,out.site_energy.total.energy_consumption_intensity,out.natural_gas.total.energy_consumption,out.natural_gas.total.energy_consumption_intensity,out.other_fuel.total.energy_consumption,out.other_fuel.total.energy_consumption_intensity,upgrade,weight,metadata_index
147,175406,True,Baseline,3.0,71.0,7.0,68.0,2.0,strip_mall_restaurant10,G0600650,...,26.995714,5.815333e+05,33.230476,1.090850e+05,6.233430,0.000000,0.00000,0,1.947971,35308
173,216469,True,Baseline,6.0,72.0,10.0,67.0,5.0,strip_mall_restaurant10,G0600370,...,26.513889,2.515556e+06,33.540741,5.268879e+05,7.025172,0.000000,0.00000,0,1.947971,39993
225,326202,True,Baseline,3.0,73.0,8.0,67.0,4.0,strip_mall_restaurant20,G0600250,...,32.052000,1.669564e+06,44.521704,4.675021e+05,12.466722,0.000000,0.00000,0,1.947971,52530
231,331621,True,Baseline,10.0,72.0,10.0,68.0,1.0,strip_mall_restaurant10,G0600730,...,21.388889,2.264872e+06,30.198296,6.605504e+05,8.807338,0.000000,0.00000,0,1.947971,53159
234,336229,True,Baseline,7.0,71.0,8.0,67.0,2.0,strip_mall_restaurant20,G0600370,...,29.146984,7.282917e+05,41.616667,2.181673e+05,12.466701,0.000000,0.00000,0,1.947971,53683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336668,242140,True,Baseline,0.0,73.0,10.0,68.0,2.0,strip_mall_restaurant20,G5600150,...,17.695185,3.436806e+05,45.824074,5.862765e+04,7.817019,152322.222133,20.30963,0,1.947971,345449
336695,262444,True,Baseline,999.0,999.0,999.0,999.0,2.0,,G5600070,...,59.157407,1.833028e+05,61.100926,5.829162e+03,1.943054,0.000000,0.00000,0,2.904176,345476
336767,315308,True,Baseline,0.0,72.0,10.0,68.0,1.0,strip_mall_restaurant20,G5600330,...,19.983810,9.468472e+05,54.105556,1.356259e+05,7.750051,461474.999729,26.37000,0,1.947971,345550
336794,334124,True,Baseline,999.0,999.0,999.0,999.0,1.0,,G5600210,...,35.913222,4.712875e+06,62.838333,2.018901e+06,26.918674,0.000000,0.00000,0,2.904176,345577


In [14]:
# Persist the withheld rows. The DataFrame index is written as well, because
# to_csv's `index` option is left at its default.
df_withheld.to_csv(metadata_path.joinpath("AttributeCaps", "comstock_340k_ood_withheld.csv"))