In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from scripts.utils import reduce_mem_usage
from scripts.anomaly import anomaly_detector
from sklearn.model_selection import StratifiedKFold
import h5py
import ghalton
from scipy.stats import rankdata
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import pickle

***

In [2]:
train_data = pd.read_hdf("./data/train_data.h5", "train_data_scaled_weather")

In [3]:
leak = pd.read_hdf("./data/leak_data.h5", "leak_data_scaled_weather")

In [20]:
building_metadata = pd.read_csv("./data/building_metadata.csv")

***
## Mirror datasets

In [21]:
# Per (site_id, meter, primary_use): number of distinct buildings and number of rows.
summary = (
    train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(
        n_buildings=("building_id", lambda ids: len(ids.drop_duplicates())),
        n_rows=("timestamp", lambda stamps: stamps.shape[0]),
    )
    .reset_index()
)

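`find_mirror_buildings` finds mirror buildings between sites (for a given meter): for each building in the source set it picks a not-yet-used building from the target set with the same `primary_use` and the closest `square_feet`. The result is a set of target buildings (at most as many as there are source buildings) that best represents the set of buildings in the source.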

In [22]:
def find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target):
    """Greedily pair each source building with a similar-sized target building.

    Buildings are only paired within the same ``primary_use``. For every
    primary use present among the source buildings, the source buildings are
    visited from largest to smallest ``square_feet``; each one takes the
    still-unused target building whose ``square_feet`` is closest (on a tie the
    first candidate in descending-size order wins). A target building is used
    at most once.

    Parameters
    ----------
    train_data : any
        Unused. Kept so existing callers do not have to change.
    building_metadata : pandas.DataFrame
        Must provide the columns ``building_id``, ``primary_use`` and
        ``square_feet``.
    bld_id_source : list-like
        Building ids that need a mirror.
    bld_id_target : list-like
        Building ids that may be used as mirrors.

    Returns
    -------
    dict
        ``{source_building_id: target_building_id}``. Source buildings for
        which no target was available (or whose ``square_feet`` is missing)
        are absent from the mapping.
    """
    all_selected = dict()
    src_meta = building_metadata[building_metadata.building_id.isin(bld_id_source)]
    tgt_meta = building_metadata[building_metadata.building_id.isin(bld_id_target)]
    for primary_use in src_meta.primary_use.unique():
        blds_src = (src_meta[src_meta.primary_use == primary_use]
                    .sort_values("square_feet", ascending=False))
        blds_tgt = (tgt_meta[tgt_meta.primary_use == primary_use]
                    .sort_values("square_feet", ascending=False))
        tgt_ids = blds_tgt.building_id
        # Candidates still free to be picked, in descending-size order.
        # Targets with unknown size can never be the "closest", so drop them
        # up front instead of failing on them later.
        available = blds_tgt.square_feet.dropna()
        for src_id, src_sqft in zip(blds_src.building_id, blds_src.square_feet):
            if len(available) == 0:
                print(f"no more buildings on target with primary_use: {primary_use}")
                break
            if pd.isna(src_sqft):
                # No size to compare against: leave this building unmatched
                # (the previous implementation raised KeyError(-1) here).
                continue
            # idxmin returns the first minimum, matching the strict "<" scan
            # of the original nested loop.
            selected = (available - src_sqft).abs().idxmin()
            all_selected[src_id] = tgt_ids.loc[selected]
            available = available.drop(selected)
    return all_selected

In [24]:
def find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, sites_target, meter):
    """Find mirrors for the source buildings across several target sites.

    The target sites are tried in the given order. Each site contributes
    mirrors for the source buildings that are still unmatched, and the search
    stops as soon as every source building has a mirror.

    Returns a dict ``{source_building_id: target_building_id}``.
    """
    pending = list(bld_id_source)
    mirrors = {}
    for site_target in sites_target:
        # Candidate buildings: those of this site that have rows for the requested meter.
        site_rows = train_data.query("site_id==@site_target & meter==@meter")
        bld_id_target = site_rows.building_id.unique().tolist()
        found = find_mirror_buildings(train_data, building_metadata, pending, bld_id_target)
        mirrors.update(found)
        pending = [bld_id for bld_id in pending if bld_id not in mirrors]
        if not pending:
            break
    return mirrors

In [30]:
def validation(data):
    """Summarise ``data`` per (meter, primary_use).

    Returns a DataFrame with one row per group holding the number of distinct
    ``building_id`` values (``n_buildings``) and the number of rows (``n_rows``).
    """
    per_group = data.groupby(["meter", "primary_use"]).agg(
        n_buildings=("building_id", lambda ids: len(ids.drop_duplicates())),
        n_rows=("timestamp", lambda stamps: stamps.shape[0]),
    )
    return per_group.reset_index()

***
#### mirror dataset for `site_id == 3`

In [11]:
summary.query("site_id == 3 & meter == 0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
47,3,0,Education,92,782104
48,3,0,Entertainment/public assembly,44,385016
49,3,0,Healthcare,6,52617
50,3,0,Lodging/residential,11,95871
51,3,0,Office,23,200005
52,3,0,Other,4,34857
53,3,0,Parking,1,8776
54,3,0,Public services,86,742136
55,3,0,Religious worship,1,8782
56,3,0,Retail,1,8782


In [33]:
# Source set: every building of site 3 that has meter-0 rows in the training data.
bld_id_source = (
    train_data.query("site_id==3 & meter==0")["building_id"].unique().tolist()
)
# Candidate sites are tried in this order until every source building has a mirror.
mirrors = find_mirror_buildings_onall(
    train_data,
    building_metadata,
    bld_id_source,
    sites_target=[2, 1, 0, 4, 15],
    meter=0,
)

no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more

In [None]:
# Keep only the meter-0 rows of the buildings selected as mirrors, then persist them.
_mirror_filter = "building_id in @mirrors.values() & meter==0"

_train_data = train_data.query(_mirror_filter)
_train_data.to_csv("mirrors/train_data_meter0_site3.csv", index=False)

_leak = leak.query(_mirror_filter)
_leak.to_csv("mirrors/leak_data_meter0_site3.csv", index=False)

In [None]:
validation(_train_data)

In [None]:
validation(_leak)

***
#### mirror dataset for `site_id == 5 & meter == 0`

In [19]:
summary.query("site_id == 5")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
65,5,0,Education,49,427493
66,5,0,Entertainment/public assembly,18,157347
67,5,0,Healthcare,1,8784
68,5,0,Lodging/residential,1,8737
69,5,0,Manufacturing/industrial,3,26329
70,5,0,Office,11,95493
71,5,0,Other,1,8738
72,5,0,Public services,5,40832


***
#### mirror dataset for `site_id == 6 & meter == 0`

In [55]:
summary.query("site_id==6 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
73,6,0,Education,13,113891
74,6,0,Entertainment/public assembly,3,26319
75,6,0,Lodging/residential,11,96438
76,6,0,Office,8,69971
77,6,0,Public services,1,8775


In [56]:
summary.query("site_id==2 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
20,2,0,Education,61,528084
21,2,0,Entertainment/public assembly,21,182672
22,2,0,Food sales and service,2,17563
23,2,0,Healthcare,1,8745
24,2,0,Lodging/residential,12,105359
25,2,0,Office,24,210256
26,2,0,Parking,3,26332
27,2,0,Public services,6,52660
28,2,0,Religious worship,1,8022
29,2,0,Retail,1,8776


In [60]:
bld_id_source = train_data.query("site_id==6 & meter==0").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [66]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site6.csv", index=False)

In [67]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,0,Education,13,111263
1,2,0,Entertainment/public assembly,3,26339
2,2,0,Lodging/residential,11,96576
3,2,0,Office,8,70214
4,2,0,Public services,1,8774


In [68]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,0,Education,13,334255
1,2,0,Entertainment/public assembly,3,78758
2,2,0,Lodging/residential,11,289013
3,2,0,Office,8,203775
4,2,0,Public services,1,21906


***
#### mirror dataset for `site_id = 6 & meter == 1`

In [69]:
summary.query("site_id==6 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
78,6,1,Education,8,59751
79,6,1,Entertainment/public assembly,2,16868
80,6,1,Lodging/residential,4,35099
81,6,1,Office,6,44902
82,6,1,Public services,1,7487


In [70]:
summary.query("site_id==2 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [71]:
bld_id_source = train_data.query("site_id==6 & meter==1").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [72]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site6.csv", index=False)

In [73]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,8,68085
1,2,1,Entertainment/public assembly,2,17566
2,2,1,Lodging/residential,4,35128
3,2,1,Office,6,49313
4,2,1,Public services,1,8781


In [74]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,8,208245
1,2,1,Entertainment/public assembly,2,52606
2,2,1,Lodging/residential,4,105208
3,2,1,Office,6,154433
4,2,1,Public services,1,26301


***
#### mirror dataset for `site_id = 6 & meter == 2`

In [75]:
summary.query("site_id == 6 & meter == 2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
83,6,2,Education,10,80780
84,6,2,Entertainment/public assembly,2,11427
85,6,2,Lodging/residential,6,52544
86,6,2,Office,5,43839


In [76]:
summary.query("site_id == 15 & meter == 2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
211,15,2,Education,29,206114
212,15,2,Entertainment/public assembly,9,65089
213,15,2,Lodging/residential,12,80949
214,15,2,Manufacturing/industrial,1,7468
215,15,2,Office,12,89662
216,15,2,Public services,3,22413
217,15,2,Technology/science,1,7472
218,15,2,Utility,2,14944


In [77]:
bld_id_source = train_data.query("site_id==6 & meter==2").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==15 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [78]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site6.csv", index=False)

In [79]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,10,74052
1,15,2,Entertainment/public assembly,2,14940
2,15,2,Lodging/residential,6,42099
3,15,2,Office,5,37360


In [80]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,7,118635
1,15,2,Entertainment/public assembly,1,16848
2,15,2,Lodging/residential,2,34765
3,15,2,Office,2,34794


***
#### mirror dataset for `site_id = 7 & meter == 0`

In [81]:
summary.query("site_id == 7 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
87,7,0,Education,12,92152


In [82]:
summary.query("site_id == 4 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [83]:
bld_id_source = train_data.query("site_id==7 & meter==0").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==4 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [84]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site7.csv", index=False)

In [85]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,4,0,Education,12,102432


In [86]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,4,0,Education,11,284378


***
#### mirror dataset for `site_id = 7 & meter == 1`

In [87]:
summary.query("site_id == 7 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
88,7,1,Education,15,130956


In [88]:
summary.query("site_id == 2 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [89]:
bld_id_source = train_data.query("site_id==7 & meter==1").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [90]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site7.csv", index=False)

In [91]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,15,131728


In [92]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,15,394528


***
#### mirror dataset for `site_id = 7 & meter == 2` 

In [93]:
summary.query("site_id==7 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
89,7,2,Education,12,104597


In [99]:
summary.query("site_id==15 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
211,15,2,Education,29,206114
212,15,2,Entertainment/public assembly,9,65089
213,15,2,Lodging/residential,12,80949
214,15,2,Manufacturing/industrial,1,7468
215,15,2,Office,12,89662
216,15,2,Public services,3,22413
217,15,2,Technology/science,1,7472
218,15,2,Utility,2,14944


In [100]:
bld_id_source = train_data.query("site_id==7 & meter==2").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==15 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [101]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site7.csv", index=False)

In [102]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,12,86635


In [103]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,10,173955


***
#### mirror dataset for `site_id = 7 & meter == 3` on `site_id == 2 & meter == 3`

In [104]:
summary.query("site_id == 7 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
90,7,3,Education,3,26344


In [105]:
summary.query("site_id == 2 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
41,2,3,Education,31,271944
42,2,3,Entertainment/public assembly,7,61473
43,2,3,Food sales and service,1,8781
44,2,3,Lodging/residential,9,78282
45,2,3,Office,6,52694
46,2,3,Public services,1,8781


In [106]:
bld_id_source = train_data.query("site_id==7 & meter==3").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [107]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site7.csv", index=False)

In [108]:
(_train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,3,26141


In [109]:
(_leak.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,3,78701


***
#### mirror dataset for `site_id = 8 & meter == 0` 

In [221]:
summary.query("site_id==8 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
91,8,0,Entertainment/public assembly,24,172977
92,8,0,Office,7,53827
93,8,0,Other,9,71246
94,8,0,Public services,28,227229
95,8,0,Warehouse/storage,2,16054


In [222]:
summary.query("site_id==0 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,0,0,Education,30,160163
1,0,0,Entertainment/public assembly,5,26695
2,0,0,Lodging/residential,27,145961
3,0,0,Office,24,124345
4,0,0,Other,5,27029
5,0,0,Parking,8,42958
6,0,0,Retail,6,32911


In [223]:
bld_id_source = train_data.query("site_id==8 & meter==0").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==0 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Warehouse/storage


In [224]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [225]:
len(bld_id_source)

53

In [226]:
summary.query("site_id==2 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
20,2,0,Education,61,528084
21,2,0,Entertainment/public assembly,21,182672
22,2,0,Food sales and service,2,17563
23,2,0,Healthcare,1,8745
24,2,0,Lodging/residential,12,105359
25,2,0,Office,24,210256
26,2,0,Parking,3,26332
27,2,0,Public services,6,52660
28,2,0,Religious worship,1,8022
29,2,0,Retail,1,8776


In [227]:
bld_id_target = train_data.query("site_id==2 & meter==0").building_id.unique().tolist()

In [228]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services


In [229]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [230]:
len(bld_id_source)

26

In [231]:
bld_id_target = train_data.query("site_id==15 & meter==0").building_id.unique().tolist()

In [232]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services


In [233]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [234]:
len(bld_id_source)

20

In [235]:
bld_id_target = train_data.query("site_id==1 & meter==0").building_id.unique().tolist()

In [236]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services


In [237]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [238]:
len(bld_id_source)

18

In [239]:
bld_id_target = train_data.query("site_id==4 & meter==0").building_id.unique().tolist()

In [240]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services


In [241]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [242]:
len(bld_id_source)

12

In [243]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site8.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site8.csv", index=False)

In [244]:
(_train_data.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Entertainment/public assembly,24,192371
1,0,Office,7,37838
2,0,Other,5,27029
3,0,Public services,20,165655
4,0,Warehouse/storage,2,17316


In [245]:
(_leak.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Entertainment/public assembly,24,609596
1,0,Office,7,160376
2,0,Other,5,114272
3,0,Public services,18,427948
4,0,Warehouse/storage,2,51592


***
#### mirror dataset for `site_id = 9 & meter == 0` 

In [246]:
summary.query("site_id==9 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
96,9,0,Education,63,527525
97,9,0,Entertainment/public assembly,17,143242
98,9,0,Lodging/residential,19,162302
99,9,0,Office,16,131285
100,9,0,Public services,2,17118
101,9,0,Services,5,42490


In [247]:
summary.query("site_id==2 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
20,2,0,Education,61,528084
21,2,0,Entertainment/public assembly,21,182672
22,2,0,Food sales and service,2,17563
23,2,0,Healthcare,1,8745
24,2,0,Lodging/residential,12,105359
25,2,0,Office,24,210256
26,2,0,Parking,3,26332
27,2,0,Public services,6,52660
28,2,0,Religious worship,1,8022
29,2,0,Retail,1,8776


In [248]:
bld_id_source = train_data.query("site_id==9 & meter==0").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Services


In [249]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [250]:
summary.query("site_id==4 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [251]:
bld_id_target = train_data.query("site_id==4 & meter==0").building_id.unique().tolist()

In [252]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Lodging/residential


In [253]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [254]:
len(bld_id_source)

8

There are no buildings with `primary_use='Services'` in the leak data. I will select 5 buildings from `site_id == 4` with `primary_use == 'Public services'` to do the trick.

In [255]:
summary.query("site_id==4 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [256]:
# First five 'Public services' buildings of site 4 (meter 0), in order of appearance in train_data.
extra_buildings = (
    train_data
    .query("site_id==4 & meter==0 & primary_use=='Public services'")
    ["building_id"]
    .unique()[:5]
    .tolist()
)

In [258]:
# Meter-0 training rows of the buildings selected as mirrors.
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
# Rows of the extra buildings, relabelled as 'Services'. `.assign` builds the
# relabelled frame directly instead of writing into the frame returned by
# `query()`, which is only silent because chained-assignment warnings are
# switched off at the top of the notebook.
_extra_train_data = (
    train_data
    .query("building_id in @extra_buildings & meter==0")
    .assign(primary_use='Services')
)
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter0_site9.csv", index=False)

# Same selection and relabelling for the leak data.
_leak = leak.query("building_id in @mirrors.values() & meter==0")
_extra_leak_data = (
    leak
    .query("building_id in @extra_buildings & meter==0")
    .assign(primary_use='Services')
)
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter0_site9.csv", index=False)

In [259]:
(_train_data.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,63,545603
1,0,Entertainment/public assembly,17,147966
2,0,Lodging/residential,16,135032
3,0,Office,16,140360
4,0,Public services,2,17557
5,0,Services,5,43914


In [260]:
(_leak.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,62,1612141
1,0,Entertainment/public assembly,17,442790
2,0,Lodging/residential,13,341581
3,0,Office,16,414383
4,0,Public services,2,48205
5,0,Services,5,129330


***
#### mirror dataset for `site_id = 9 & meter == 1` 

In [263]:
summary.query("site_id==9 & meter == 1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
102,9,1,Education,55,482335
103,9,1,Entertainment/public assembly,14,122770
104,9,1,Lodging/residential,13,114005
105,9,1,Office,11,96468
106,9,1,Public services,2,17535


In [264]:
summary.query("site_id==2 & meter == 1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [265]:
bld_id_source = train_data.query("site_id==9 & meter==1").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==2 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly


In [266]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [267]:
len(bld_id_source)

9

In [275]:
summary.query("site_id==0 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
7,0,1,Education,12,86849
8,0,1,Lodging/residential,4,26090
9,0,1,Office,6,42548
10,0,1,Other,1,5147
11,0,1,Retail,1,7256


In [276]:
bld_id_target = train_data.query("site_id==0 & meter==1").building_id.unique().tolist()

In [277]:
mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}

no more buildings on target with primary_use: Entertainment/public assembly


In [279]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [280]:
len(bld_id_source)

5

In [281]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site9.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site9.csv", index=False)

In [282]:
(_train_data.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,55,476286
1,1,Entertainment/public assembly,9,79024
2,1,Lodging/residential,13,112379
3,1,Office,11,96602
4,1,Public services,2,17563


In [283]:
(_leak.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,54,1413584
1,1,Entertainment/public assembly,9,236704
2,1,Lodging/residential,13,340139
3,1,Office,11,289322
4,1,Public services,2,52603


***
#### mirror dataset for `site_id == 9 & meter == 2`

In [306]:
summary.query("site_id==9 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
107,9,2,Education,53,465059
108,9,2,Entertainment/public assembly,13,114076
109,9,2,Lodging/residential,13,110559
110,9,2,Office,8,70219
111,9,2,Public services,2,17549


In [307]:
summary.query("site_id==15 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
211,15,2,Education,29,206114
212,15,2,Entertainment/public assembly,9,65089
213,15,2,Lodging/residential,12,80949
214,15,2,Manufacturing/industrial,1,7468
215,15,2,Office,12,89662
216,15,2,Public services,3,22413
217,15,2,Technology/science,1,7472
218,15,2,Utility,2,14944


In [308]:
bld_id_source = train_data.query("site_id==9 & meter==2").building_id.unique().tolist()
bld_id_target = train_data.query("site_id==15 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly


In [309]:
bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]

In [310]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site9.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site9.csv", index=False)

In [311]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,29,206114
1,15,2,Entertainment/public assembly,9,65089
2,15,2,Lodging/residential,12,80949
3,15,2,Office,8,59775
4,15,2,Public services,2,14941


In [312]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,22,374185
1,15,2,Entertainment/public assembly,5,86436
2,15,2,Lodging/residential,6,102819
3,15,2,Office,4,69586


***
#### mirror dataset for `site_id = 10 & meter == 0` 

In [313]:
# Building / row counts per primary_use for site 10, meter 0 (the source).
summary[(summary["site_id"] == 10) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
112,10,0,Education,14,109409
113,10,0,Entertainment/public assembly,4,34570
114,10,0,Lodging/residential,3,26305
115,10,0,Office,5,37169
116,10,0,Other,3,25933
117,10,0,Technology/science,1,2696


In [320]:
# Building / row counts per primary_use for site 4, meter 0 (candidate target).
summary[(summary["site_id"] == 4) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [321]:
# Source: buildings of site 10 with meter-0 readings; target pool: buildings of
# site 4 with meter-0 readings.
is_meter0 = train_data["meter"] == 0
bld_id_source = train_data.loc[(train_data["site_id"] == 10) & is_meter0, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 4) & is_meter0, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Office


In [322]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [324]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

8

In [323]:
# Building / row counts per primary_use for site 2, meter 0 (next candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
20,2,0,Education,61,528084
21,2,0,Entertainment/public assembly,21,182672
22,2,0,Food sales and service,2,17563
23,2,0,Healthcare,1,8745
24,2,0,Lodging/residential,12,105359
25,2,0,Office,24,210256
26,2,0,Parking,3,26332
27,2,0,Public services,6,52660
28,2,0,Religious worship,1,8022
29,2,0,Retail,1,8776


In [325]:
# New target pool: every building of site 2 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [326]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Other


In [327]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [328]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

3

There are no buildings with `primary_use="Other"` in the leak data, so the `Parking` buildings of site 4 are relabelled as `Other` below and used in their place.

In [329]:
# Buildings of site 4 with meter-0 readings whose primary_use is 'Parking'.
is_site4_parking = (
    (train_data["site_id"] == 4)
    & (train_data["meter"] == 0)
    & (train_data["primary_use"] == "Parking")
)
extra_buildings = train_data.loc[is_site4_parking, "building_id"].unique().tolist()

In [332]:
# Build and save the meter-0 mirror datasets for site 10.
#
# The mirror rows come from the buildings in `mirrors.values()`. The rows of
# `extra_buildings` are appended with primary_use overwritten to 'Other'.
# `.assign` relabels a fresh frame rather than writing into the result of
# `.query(...)`, so the cell no longer depends on chained-assignment warnings
# being disabled.
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_extra_train_data = (
    train_data
    .query("building_id in @extra_buildings & meter==0")
    .assign(primary_use='Other')
)
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter0_site10.csv", index=False)

_leak = leak.query("building_id in @mirrors.values() & meter==0")
_extra_leak_data = (
    leak
    .query("building_id in @extra_buildings & meter==0")
    .assign(primary_use='Other')
)
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter0_site10.csv", index=False)

In [333]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# training set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,14,119670
1,0,Entertainment/public assembly,4,32062
2,0,Lodging/residential,3,20890
3,0,Office,5,43750
4,0,Other,3,20589
5,0,Technology/science,1,8775


In [334]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# leak set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,10,262651
1,0,Entertainment/public assembly,4,102007
2,0,Office,5,129106
3,0,Other,2,46657
4,0,Technology/science,1,26304


***
#### mirror dataset for `site_id = 10 & meter == 1`

In [356]:
# Building / row counts per primary_use for site 10, meter 1 (the source).
summary[(summary["site_id"] == 10) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
118,10,1,Education,6,52634
119,10,1,Other,2,17527
120,10,1,Technology/science,1,8774


In [357]:
# Building / row counts per primary_use for site 2, meter 1 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [358]:
# Source: buildings of site 10 with meter-1 readings; target pool: buildings of
# site 2 with meter-1 readings.
is_meter1 = train_data["meter"] == 1
bld_id_source = train_data.loc[(train_data["site_id"] == 10) & is_meter1, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter1, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Technology/science


In [359]:
# Buildings of site 2 with meter-1 readings whose primary_use is
# 'Healthcare' or 'Food sales and service'.
is_site2_substitute = (
    (train_data["site_id"] == 2)
    & (train_data["meter"] == 1)
    & train_data["primary_use"].isin(['Healthcare', 'Food sales and service'])
)
extra_buildings = train_data.loc[is_site2_substitute, "building_id"].unique().tolist()

In [360]:
# Build and save the meter-1 mirror datasets for site 10.
#
# The mirror rows come from the buildings in `mirrors.values()`. The rows of
# `extra_buildings` are appended with primary_use remapped
# (Healthcare -> Technology/science, Food sales and service -> Other).
# `.assign` relabels a fresh frame rather than writing into the result of
# `.query(...)`, so the cell no longer depends on chained-assignment warnings
# being disabled.
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_extra_train_data = (
    train_data
    .query("building_id in @extra_buildings & meter==1")
    .assign(primary_use=lambda rows: rows.primary_use.map(
        {"Healthcare":"Technology/science", "Food sales and service":"Other"}))
)
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter1_site10.csv", index=False)

_leak = leak.query("building_id in @mirrors.values() & meter==1")
_extra_leak_data = (
    leak
    .query("building_id in @extra_buildings & meter==1")
    .assign(primary_use=lambda rows: rows.primary_use.map(
        {"Healthcare":"Technology/science", "Food sales and service":"Other"}))
)
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter1_site10.csv", index=False)

In [361]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,6,52696
1,2,1,Other,2,17564
2,2,1,Technology/science,1,8783


In [362]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,6,157816
1,2,1,Other,2,52604
2,2,1,Technology/science,1,26303


***
#### mirror dataset for `site_id = 10 & meter == 3` 

In [363]:
# Building / row counts per primary_use for site 10, meter 3 (the source).
summary[(summary["site_id"] == 10) & (summary["meter"] == 3)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
121,10,3,Education,9,78350
122,10,3,Entertainment/public assembly,1,8775
123,10,3,Technology/science,1,8774


In [375]:
# Building / row counts per primary_use for site 2, meter 3 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 3)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
41,2,3,Education,31,271944
42,2,3,Entertainment/public assembly,7,61473
43,2,3,Food sales and service,1,8781
44,2,3,Lodging/residential,9,78282
45,2,3,Office,6,52694
46,2,3,Public services,1,8781


In [377]:
# Source: buildings of site 10 with meter-3 readings; target pool: buildings of
# site 2 with meter-3 readings.
is_meter3 = train_data["meter"] == 3
bld_id_source = train_data.loc[(train_data["site_id"] == 10) & is_meter3, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter3, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Technology/science


In [378]:
# Buildings of site 2 with meter-3 readings whose primary_use is 'Public services'.
is_site2_public = (
    (train_data["site_id"] == 2)
    & (train_data["meter"] == 3)
    & (train_data["primary_use"] == 'Public services')
)
extra_buildings = train_data.loc[is_site2_public, "building_id"].unique().tolist()

In [380]:
# Build and save the meter-3 mirror datasets for site 10.
#
# The mirror rows come from the buildings in `mirrors.values()`. The rows of
# `extra_buildings` are appended with primary_use overwritten to
# "Technology/science". `.assign` relabels a fresh frame rather than writing
# into the result of `.query(...)`, so the cell no longer depends on
# chained-assignment warnings being disabled.
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_extra_train_data = (
    train_data
    .query("building_id in @extra_buildings & meter==3")
    .assign(primary_use="Technology/science")
)
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter3_site10.csv", index=False)

_leak = leak.query("building_id in @mirrors.values() & meter==3")
_extra_leak_data = (
    leak
    .query("building_id in @extra_buildings & meter==3")
    .assign(primary_use="Technology/science")
)
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter3_site10.csv", index=False)

In [381]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,9,79023
1,2,3,Entertainment/public assembly,1,8783
2,2,3,Technology/science,1,8781


In [384]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,9,236703
1,2,3,Entertainment/public assembly,1,26303
2,2,3,Technology/science,1,26301


***
#### mirror dataset for `site_id = 11 & meter==0` 

In [386]:
# Building / row counts per primary_use for site 11, meter 0 (the source).
summary[(summary["site_id"] == 11) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
124,11,0,Education,5,43400


In [387]:
# Building / row counts per primary_use for site 4, meter 0 (candidate target).
summary[(summary["site_id"] == 4) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [388]:
# Source: buildings of site 11 with meter-0 readings; target pool: buildings of
# site 4 with meter-0 readings.
is_meter0 = train_data["meter"] == 0
bld_id_source = train_data.loc[(train_data["site_id"] == 11) & is_meter0, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 4) & is_meter0, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [390]:
# Select the meter-0 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 0)]
_train_data.to_csv("mirrors/train_data_meter0_site11.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 0)]
_leak.to_csv("mirrors/leak_data_meter0_site11.csv", index=False)

In [391]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,4,0,Education,5,43583


In [392]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,4,0,Education,5,131198


***
#### mirror dataset for `site_id = 11 & meter==1` 

In [394]:
# Building / row counts per primary_use for site 11, meter 1 (the source).
summary[(summary["site_id"] == 11) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
125,11,1,Education,4,32656


In [395]:
# Building / row counts per primary_use for site 2, meter 1 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [396]:
# Source: buildings of site 11 with meter-1 readings; target pool: buildings of
# site 2 with meter-1 readings.
is_meter1 = train_data["meter"] == 1
bld_id_source = train_data.loc[(train_data["site_id"] == 11) & is_meter1, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter1, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [397]:
# Select the meter-1 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 1)]
_train_data.to_csv("mirrors/train_data_meter1_site11.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 1)]
_leak.to_csv("mirrors/leak_data_meter1_site11.csv", index=False)

In [398]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,4,35127


In [399]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,1,Education,4,105207


***
#### mirror dataset for `site_id = 11 & meter==3` 

In [401]:
# Building / row counts per primary_use for site 11, meter 3 (the source).
summary[(summary["site_id"] == 11) & (summary["meter"] == 3)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
126,11,3,Education,5,43403


In [402]:
# Building / row counts per primary_use for site 2, meter 3 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 3)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
41,2,3,Education,31,271944
42,2,3,Entertainment/public assembly,7,61473
43,2,3,Food sales and service,1,8781
44,2,3,Lodging/residential,9,78282
45,2,3,Office,6,52694
46,2,3,Public services,1,8781


In [403]:
# Source: buildings of site 11 with meter-3 readings; target pool: buildings of
# site 2 with meter-3 readings.
is_meter3 = train_data["meter"] == 3
bld_id_source = train_data.loc[(train_data["site_id"] == 11) & is_meter3, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter3, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

In [406]:
# Select the meter-3 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 3)]
_train_data.to_csv("mirrors/train_data_meter3_site11.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 3)]
_leak.to_csv("mirrors/leak_data_meter3_site11.csv", index=False)

In [407]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,5,43717


In [408]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,2,3,Education,5,131317


***
#### mirror dataset for `site_id = 12 & meter==0` 

In [411]:
# Building / row counts per primary_use for site 12, meter 0 (the source).
summary[(summary["site_id"] == 12) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
127,12,0,Education,20,174519
128,12,0,Entertainment/public assembly,2,17283
129,12,0,Office,9,79034
130,12,0,Public services,1,8784
131,12,0,Retail,3,26090
132,12,0,Technology/science,1,8779


In [415]:
# Building / row counts per primary_use for site 0, meter 0 (candidate target).
summary[(summary["site_id"] == 0) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,0,0,Education,30,160163
1,0,0,Entertainment/public assembly,5,26695
2,0,0,Lodging/residential,27,145961
3,0,0,Office,24,124345
4,0,0,Other,5,27029
5,0,0,Parking,8,42958
6,0,0,Retail,6,32911


In [416]:
# Source: buildings of site 12 with meter-0 readings; target pool: buildings of
# site 0 with meter-0 readings.
is_meter0 = train_data["meter"] == 0
bld_id_source = train_data.loc[(train_data["site_id"] == 12) & is_meter0, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 0) & is_meter0, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Technology/science
no more buildings on target with primary_use: Public services


In [417]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [418]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

2

In [419]:
# Building / row counts per primary_use for site 4, meter 0 (next candidate target).
summary[(summary["site_id"] == 4) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [420]:
# New target pool: every building of site 4 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 4) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [422]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

In [424]:
# Select the meter-0 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 0)]
_train_data.to_csv("mirrors/train_data_meter0_site12.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 0)]
_leak.to_csv("mirrors/leak_data_meter0_site12.csv", index=False)

In [425]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# training set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,20,106115
1,0,Entertainment/public assembly,2,10806
2,0,Office,9,48651
3,0,Public services,1,8783
4,0,Retail,3,16216
5,0,Technology/science,1,8769


In [426]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# leak set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,20,451718
1,0,Entertainment/public assembly,2,45816
2,0,Office,9,202669
3,0,Public services,1,26304
4,0,Retail,3,68750
5,0,Technology/science,1,26304


***
#### mirror dataset for `site_id = 13 & meter==0` 

In [430]:
# Building / row counts per primary_use for site 13, meter 0 (the source).
summary[(summary["site_id"] == 13) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
133,13,0,Education,23,196465
134,13,0,Entertainment/public assembly,6,52528
135,13,0,Food sales and service,1,8784
136,13,0,Healthcare,3,26321
137,13,0,Lodging/residential,10,87537
138,13,0,Manufacturing/industrial,5,43919
139,13,0,Office,70,599934
140,13,0,Other,3,26338
141,13,0,Parking,7,61286
142,13,0,Public services,5,43911


In [433]:
# Building / row counts per primary_use for site 2, meter 0 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
20,2,0,Education,61,528084
21,2,0,Entertainment/public assembly,21,182672
22,2,0,Food sales and service,2,17563
23,2,0,Healthcare,1,8745
24,2,0,Lodging/residential,12,105359
25,2,0,Office,24,210256
26,2,0,Parking,3,26332
27,2,0,Public services,6,52660
28,2,0,Religious worship,1,8022
29,2,0,Retail,1,8776


In [434]:
# Source: buildings of site 13 with meter-0 readings; target pool: buildings of
# site 2 with meter-0 readings.
is_meter0 = train_data["meter"] == 0
bld_id_source = train_data.loc[(train_data["site_id"] == 13) & is_meter0, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter0, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Technology/science
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare


In [435]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [436]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

66

In [437]:
# Building / row counts per primary_use for site 4, meter 0 (next candidate target).
summary[(summary["site_id"] == 4) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [438]:
# New target pool: every building of site 4 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 4) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [439]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare


In [440]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [441]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

62

In [443]:
# Building / row counts per primary_use for site 0, meter 0 (next candidate target).
summary[(summary["site_id"] == 0) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,0,0,Education,30,160163
1,0,0,Entertainment/public assembly,5,26695
2,0,0,Lodging/residential,27,145961
3,0,0,Office,24,124345
4,0,0,Other,5,27029
5,0,0,Parking,8,42958
6,0,0,Retail,6,32911


In [444]:
# New target pool: every building of site 0 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 0) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [445]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare


In [446]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [447]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

34

In [448]:
# Building / row counts per primary_use for site 1, meter 0 (next candidate target).
summary[(summary["site_id"] == 1) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
12,1,0,Education,22,193244
13,1,0,Entertainment/public assembly,1,8784
14,1,0,Lodging/residential,10,87840
15,1,0,Office,16,140542
16,1,0,Public services,2,17568


In [449]:
# New target pool: every building of site 1 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 1) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [450]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare


In [451]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [452]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

18

In [455]:
# Building / row counts per primary_use for site 15, meter 0 (next candidate target).
summary[(summary["site_id"] == 15) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
193,15,0,Education,41,296715
194,15,0,Entertainment/public assembly,15,102254
195,15,0,Lodging/residential,28,207928
196,15,0,Manufacturing/industrial,2,14944
197,15,0,Office,18,132215
198,15,0,Public services,6,44825
199,15,0,Religious worship,1,7471
200,15,0,Technology/science,1,7472
201,15,0,Utility,2,14943


In [456]:
# New target pool: every building of site 15 with meter-0 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 15) & (train_data["meter"] == 0), "building_id"].unique().tolist()

In [457]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare


In [458]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [459]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

10

In [460]:
# Select the meter-0 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 0)]
_train_data.to_csv("mirrors/train_data_meter0_site13.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 0)]
_leak.to_csv("mirrors/leak_data_meter0_site13.csv", index=False)

In [461]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# training set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,23,201707
1,0,Entertainment/public assembly,6,51584
2,0,Food sales and service,1,8782
3,0,Healthcare,1,8745
4,0,Lodging/residential,10,87795
5,0,Manufacturing/industrial,2,14944
6,0,Office,70,519887
7,0,Other,3,16217
8,0,Parking,7,52327
9,0,Public services,5,43894



***
#### mirror dataset for `site_id = 13 & meter==1` 

In [463]:
# Building / row counts per primary_use for site 13, meter 1 (the source).
summary[(summary["site_id"] == 13) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
146,13,1,Education,18,158021
147,13,1,Entertainment/public assembly,5,43920
148,13,1,Healthcare,3,26352
149,13,1,Lodging/residential,2,17568
150,13,1,Office,46,404049
151,13,1,Parking,1,8781
152,13,1,Public services,5,43916


In [464]:
# Building / row counts per primary_use for site 2, meter 1 (candidate target).
summary[(summary["site_id"] == 2) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
32,2,1,Education,52,454484
33,2,1,Entertainment/public assembly,9,79024
34,2,1,Food sales and service,2,17564
35,2,1,Healthcare,1,8783
36,2,1,Lodging/residential,12,105382
37,2,1,Office,17,145916
38,2,1,Public services,4,35126
39,2,1,Retail,1,8783
40,2,1,Utility,1,8783


In [465]:
# Source: buildings of site 13 with meter-1 readings; target pool: buildings of
# site 2 with meter-1 readings.
is_meter1 = train_data["meter"] == 1
bld_id_source = train_data.loc[(train_data["site_id"] == 13) & is_meter1, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 2) & is_meter1, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Parking


In [466]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [467]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

33

In [482]:
# Building / row counts per primary_use for site 0, meter 1 (next candidate target).
summary[(summary["site_id"] == 0) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
7,0,1,Education,12,86849
8,0,1,Lodging/residential,4,26090
9,0,1,Office,6,42548
10,0,1,Other,1,5147
11,0,1,Retail,1,7256


In [484]:
# New target pool: every building of site 0 with meter-1 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 0) & (train_data["meter"] == 1), "building_id"].unique().tolist()

In [485]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Healthcare


In [486]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [487]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

27

In [488]:
# Building / row counts per primary_use for site 15, meter 1 (next candidate target).
summary[(summary["site_id"] == 15) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
202,15,1,Education,23,168142
203,15,1,Entertainment/public assembly,7,47931
204,15,1,Healthcare,1,7332
205,15,1,Lodging/residential,13,89054
206,15,1,Manufacturing/industrial,1,6677
207,15,1,Office,16,116739
208,15,1,Public services,2,14417
209,15,1,Religious worship,1,7327
210,15,1,Technology/science,1,7332


In [489]:
# New target pool: every building of site 15 with meter-1 readings in the training data.
bld_id_target = train_data.loc[(train_data["site_id"] == 15) & (train_data["meter"] == 1), "building_id"].unique().tolist()

In [490]:
# Match the current `bld_id_source` against the current `bld_id_target` and
# fold the new pairs into a copy of `mirrors` (newer pairs win on key clashes).
mirrors = dict(mirrors)
mirrors.update(find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target))

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Healthcare


In [491]:
# Drop the source buildings that already appear as keys of `mirrors`.
bld_id_source = [src_id for src_id in bld_id_source if src_id not in mirrors]

In [492]:
# How many source buildings are left in `bld_id_source`.
len(bld_id_source)

9

In [493]:
# Select the meter-1 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 1)]
_train_data.to_csv("mirrors/train_data_meter1_site13.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 1)]
_leak.to_csv("mirrors/leak_data_meter1_site13.csv", index=False)

In [494]:
# Distinct buildings and total rows per (meter, primary_use) in the mirror
# training set. Built-in reducers replace the per-group lambdas; "nunique"
# equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,18,155905
1,1,Entertainment/public assembly,5,43896
2,1,Healthcare,2,16115
3,1,Lodging/residential,2,17564
4,1,Office,39,305203
5,1,Public services,5,42457


***
#### mirror dataset for `site_id = 13 & meter==2` **

In [497]:
# Building / row counts per primary_use for site 13, meter 2 (the source).
summary[(summary["site_id"] == 13) & (summary["meter"] == 2)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
153,13,2,Education,17,149202
154,13,2,Entertainment/public assembly,5,43761
155,13,2,Food sales and service,1,8784
156,13,2,Healthcare,3,26187
157,13,2,Lodging/residential,8,70272
158,13,2,Manufacturing/industrial,3,26352
159,13,2,Office,41,360098
160,13,2,Other,1,8784
161,13,2,Parking,2,17568
162,13,2,Public services,4,35136


In [500]:
# Building / row counts per primary_use for site 15, meter 2 (candidate target).
summary[(summary["site_id"] == 15) & (summary["meter"] == 2)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
211,15,2,Education,29,206114
212,15,2,Entertainment/public assembly,9,65089
213,15,2,Lodging/residential,12,80949
214,15,2,Manufacturing/industrial,1,7468
215,15,2,Office,12,89662
216,15,2,Public services,3,22413
217,15,2,Technology/science,1,7472
218,15,2,Utility,2,14944


In [508]:
# Source: buildings of site 13 with meter-2 readings; target pool: buildings of
# site 15 with meter-2 readings.
is_meter2 = train_data["meter"] == 2
bld_id_source = train_data.loc[(train_data["site_id"] == 13) & is_meter2, "building_id"].unique().tolist()
bld_id_target = train_data.loc[(train_data["site_id"] == 15) & is_meter2, "building_id"].unique().tolist()
mirrors = find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)

no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Food sales and service
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Other


In [509]:
# Select the meter-2 rows of the mirror buildings (the values of `mirrors`)
# from the training and leak frames and write each selection to CSV.
mirror_ids = list(mirrors.values())

_train_data = train_data[train_data["building_id"].isin(mirror_ids) & (train_data["meter"] == 2)]
_train_data.to_csv("mirrors/train_data_meter2_site13.csv", index=False)

_leak = leak[leak["building_id"].isin(mirror_ids) & (leak["meter"] == 2)]
_leak.to_csv("mirrors/leak_data_meter2_site13.csv", index=False)

In [510]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror training set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_train_data
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,17,116477
1,15,2,Entertainment/public assembly,5,35210
2,15,2,Lodging/residential,8,55689
3,15,2,Manufacturing/industrial,1,7468
4,15,2,Office,12,89662
5,15,2,Public services,3,22413


In [512]:
# Distinct buildings and total rows per (site_id, meter, primary_use) in the
# mirror leak set. Built-in reducers replace the per-group lambdas;
# "nunique" equals len(x.unique()) as long as building_id has no missing values.
(_leak
    .groupby(["site_id", "meter", "primary_use"])
    .agg(n_buildings=("building_id", "nunique"),
         n_rows=("timestamp", "size"))
    .reset_index())

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
0,15,2,Education,13,226133
1,15,2,Entertainment/public assembly,2,34794
2,15,2,Lodging/residential,4,69558
3,15,2,Manufacturing/industrial,1,17395
4,15,2,Office,6,104381


***
#### mirror dataset for `site_id = 14 & meter==0`

In [514]:
# Building / row counts per primary_use for site 14, meter 0.
summary[(summary["site_id"] == 14) & (summary["meter"] == 0)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
165,14,0,Education,26,212273
166,14,0,Entertainment/public assembly,10,84836
167,14,0,Food sales and service,2,16820
168,14,0,Healthcare,10,81820
169,14,0,Lodging/residential,9,77275
170,14,0,Office,38,317928
171,14,0,Public services,7,59278


***
#### mirror dataset for `site_id = 14 & meter==1`

In [515]:
# Building / row counts per primary_use for site 14, meter 1.
summary[(summary["site_id"] == 14) & (summary["meter"] == 1)]

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
172,14,1,Education,23,197297
173,14,1,Entertainment/public assembly,7,61150
174,14,1,Food sales and service,2,17568
175,14,1,Healthcare,8,68857
176,14,1,Lodging/residential,8,70272
177,14,1,Office,31,266477
178,14,1,Public services,7,61396


***
#### mirror dataset for `site_id = 14 & meter==2`