In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from scripts.utils import reduce_mem_usage
from scripts.anomaly import anomaly_detector
from sklearn.model_selection import StratifiedKFold
import h5py
import ghalton
from scipy.stats import rankdata
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import pickle

In [2]:
def validation(data):
    return (data.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

***
# Source data

In [3]:
train_data = pd.read_hdf("./data/train_data.h5", "train_data_scaled_weather")

In [4]:
leak = pd.read_hdf("./data/leak_data.h5", "leak_data_scaled_weather")

In [5]:
building_metadata = pd.read_csv("./data/building_metadata.csv")

***
#### `site_id == 0`

In [8]:
train_data.query("site_id == 0").meter.unique()

array([0, 1])

In [9]:
_train_data = train_data.query("site_id==0 & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site0.csv", index=False)

_leak =  leak.query("site_id==0 & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site0.csv", index=False)

In [10]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,30,160163
1,0,Entertainment/public assembly,5,26695
2,0,Lodging/residential,27,145961
3,0,Office,24,124345
4,0,Other,5,27029
5,0,Parking,8,42958
6,0,Retail,6,32911


In [11]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,30,671637
1,0,Entertainment/public assembly,5,114219
2,0,Lodging/residential,27,618944
3,0,Office,24,528984
4,0,Other,5,114272
5,0,Parking,8,178790
6,0,Retail,6,137963


In [12]:
_train_data = train_data.query("site_id==0 & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site0.csv", index=False)

_leak =  leak.query("site_id==0 & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site0.csv", index=False)

In [13]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,12,86849
1,1,Lodging/residential,4,26090
2,1,Office,6,42548
3,1,Other,1,5147
4,1,Retail,1,7256


In [14]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,12,297089
1,1,Lodging/residential,4,96533
2,1,Office,6,147668
3,1,Other,1,22667
4,1,Retail,1,24776


***
#### `site_id == 1`

In [15]:
train_data.query("site_id == 1").meter.unique()

array([0, 3])

In [16]:
_train_data = train_data.query("site_id==1 & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site1.csv", index=False)

_leak =  leak.query("site_id==1 & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site1.csv", index=False)

In [17]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,22,193244
1,0,Entertainment/public assembly,1,8784
2,0,Lodging/residential,10,87840
3,0,Office,16,140542
4,0,Public services,2,17568


In [18]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,22,565063
1,0,Entertainment/public assembly,1,25500
2,0,Lodging/residential,10,262327
3,0,Office,15,393964
4,0,Public services,2,52608


In [19]:
_train_data = train_data.query("site_id==1 & meter==3")
_leak =  leak.query("site_id==1 & meter==3")

In [20]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,8,70245
1,3,Office,2,17567
2,3,Public services,2,17563


In [21]:
validation(_leak)

Unnamed: 0,index,n_buildings,n_rows


***
#### `site_id == 2`

In [22]:
train_data.query("site_id == 2").meter.unique()

array([0, 1, 3])

In [23]:
_train_data = train_data.query("site_id==2 & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site2.csv", index=False)

_leak =  leak.query("site_id==2 & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site2.csv", index=False)

In [24]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,61,528084
1,0,Entertainment/public assembly,21,182672
2,0,Food sales and service,2,17563
3,0,Healthcare,1,8745
4,0,Lodging/residential,12,105359
5,0,Office,24,210256
6,0,Parking,3,26332
7,0,Public services,6,52660
8,0,Religious worship,1,8022
9,0,Retail,1,8776


In [25]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,60,1559533
1,0,Entertainment/public assembly,21,545999
2,0,Food sales and service,2,52562
3,0,Healthcare,1,26261
4,0,Lodging/residential,12,315313
5,0,Office,24,620730
6,0,Parking,3,78841
7,0,Public services,6,152085
8,0,Religious worship,1,25094
9,0,Retail,1,26267


In [26]:
_train_data = train_data.query("site_id==2 & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site2.csv", index=False)

_leak =  leak.query("site_id==2 & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site2.csv", index=False)

In [27]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,52,454484
1,1,Entertainment/public assembly,9,79024
2,1,Food sales and service,2,17564
3,1,Healthcare,1,8783
4,1,Lodging/residential,12,105382
5,1,Office,17,145916
6,1,Public services,4,35126
7,1,Retail,1,8783
8,1,Utility,1,8783


In [28]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,51,1339222
1,1,Entertainment/public assembly,9,236704
2,1,Food sales and service,2,52604
3,1,Healthcare,1,26303
4,1,Lodging/residential,12,315622
5,1,Office,17,443756
6,1,Public services,4,105206
7,1,Retail,1,26303
8,1,Utility,1,26303


In [238]:
_train_data = train_data.query("site_id==2 & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site2.csv", index=False)

_leak =  leak.query("site_id==2 & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site2.csv", index=False)

In [239]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,31,271944
1,3,Entertainment/public assembly,7,61473
2,3,Food sales and service,1,8781
3,3,Lodging/residential,9,78282
4,3,Office,6,52694
5,3,Public services,1,8781


In [240]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,31,815064
1,3,Entertainment/public assembly,7,184113
2,3,Food sales and service,1,26301
3,3,Lodging/residential,9,235962
4,3,Office,6,157814
5,3,Public services,1,26301


***
#### `site_id == 4`

In [32]:
train_data.query("site_id == 4").meter.unique()

array([0])

In [33]:
_train_data = train_data.query("site_id==4 & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site4.csv", index=False)

_leak =  leak.query("site_id==4 & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site4.csv", index=False)

In [34]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,66,553524
1,0,Entertainment/public assembly,9,62185
2,0,Lodging/residential,4,29673
3,0,Parking,3,20589
4,0,Public services,6,50602
5,0,Technology/science,2,17544
6,0,Utility,1,8750


In [35]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,55,1424649
1,0,Entertainment/public assembly,7,175428
2,0,Lodging/residential,1,26268
3,0,Parking,2,46657
4,0,Public services,6,153684
5,0,Technology/science,2,52608
6,0,Utility,1,26267


***
#### `site_id == 15`

In [36]:
train_data.query("site_id == 15").meter.unique()

array([3, 0, 1, 2])

In [37]:
_train_data = train_data.query("site_id==15 & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site15.csv", index=False)

_leak =  leak.query("site_id==15 & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site15.csv", index=False)

In [38]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,41,296715
1,0,Entertainment/public assembly,15,102254
2,0,Lodging/residential,28,207928
3,0,Manufacturing/industrial,2,14944
4,0,Office,18,132215
5,0,Public services,6,44825
6,0,Religious worship,1,7471
7,0,Technology/science,1,7472
8,0,Utility,2,14943


In [39]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,21,329001
1,0,Entertainment/public assembly,8,138978
2,0,Lodging/residential,16,270350
3,0,Office,9,154109
4,0,Public services,4,69571
5,0,Religious worship,1,17398
6,0,Technology/science,1,17395
7,0,Utility,1,15796


In [40]:
_train_data = train_data.query("site_id==15 & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site15.csv", index=False)

_leak =  leak.query("site_id==15 & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site15.csv", index=False)

In [41]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,23,168142
1,1,Entertainment/public assembly,7,47931
2,1,Healthcare,1,7332
3,1,Lodging/residential,13,89054
4,1,Manufacturing/industrial,1,6677
5,1,Office,16,116739
6,1,Public services,2,14417
7,1,Religious worship,1,7327
8,1,Technology/science,1,7332


In [42]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,18,301942
1,1,Entertainment/public assembly,4,67026
2,1,Healthcare,1,16886
3,1,Lodging/residential,8,134147
4,1,Office,10,165071
5,1,Public services,2,33710


In [43]:
_train_data = train_data.query("site_id==15 & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site15.csv", index=False)

_leak =  leak.query("site_id==15 & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site15.csv", index=False)

In [44]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,29,206114
1,2,Entertainment/public assembly,9,65089
2,2,Lodging/residential,12,80949
3,2,Manufacturing/industrial,1,7468
4,2,Office,12,89662
5,2,Public services,3,22413
6,2,Technology/science,1,7472
7,2,Utility,2,14944


In [45]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,22,374185
1,2,Entertainment/public assembly,5,86436
2,2,Lodging/residential,6,102819
3,2,Manufacturing/industrial,1,17395
4,2,Office,6,104381
5,2,Technology/science,1,17398
6,2,Utility,2,25068


In [46]:
_train_data = train_data.query("site_id==15 & meter==3")
_leak =  leak.query("site_id==15 & meter==3")

In [47]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,1,7332
1,3,Entertainment/public assembly,1,7465


In [48]:
validation(_leak)

Unnamed: 0,index,n_buildings,n_rows


***
## Mirrors datasets

In [49]:
summary = (train_data.groupby(["site_id","meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

function to find mirror buildings between sites (for a given meter). Finds a set of #buildings_source buildings in target that best represent the set of buildings in source 

In [50]:
def find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target):
    all_selected = dict()
    primary_uses = building_metadata.query("building_id in @bld_id_source").primary_use.unique()
    for primary_use in primary_uses:
        blds_src = building_metadata.query("building_id in @bld_id_source & primary_use==@primary_use").sort_values("square_feet", ascending=False)
        blds_tgt = building_metadata.query("building_id in @bld_id_target & primary_use==@primary_use").sort_values("square_feet", ascending=False)
        for i,rowx in blds_src.iterrows():
            if len(blds_tgt)==0:
                print(f"no more buildings on target with primary_use: {primary_use}")
                break
            dist = np.inf
            selected = -1
            for j,rowy in blds_tgt.iterrows():
                if np.abs(rowx.square_feet - rowy.square_feet) < dist:
                    dist = np.abs(rowx.square_feet - rowy.square_feet)
                    selected = j
            all_selected[rowx.building_id] = blds_tgt.loc[selected].building_id
            blds_tgt.drop(selected, axis=0, inplace=True)
    return all_selected

In [51]:
def find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, sites_target, meter):
    bld_id_source = list(bld_id_source)
    mirrors = dict()
    for site_target in sites_target:
        bld_id_target = train_data.query("site_id==@site_target & meter==@meter").building_id.unique().tolist()
        mirrors = {**mirrors, **find_mirror_buildings(train_data, building_metadata, bld_id_source, bld_id_target)}
        bld_id_source = [bld_id for bld_id in bld_id_source if bld_id not in mirrors.keys()]
        if len(bld_id_source) == 0: break
    return mirrors

In [52]:
def validation(data):
    return (data.groupby(["meter","primary_use"]).agg(
      n_buildings=("building_id", lambda x: len(x.unique())),
      n_rows=("timestamp", lambda x: len(x)))
      .reset_index())

***
#### mirror dataset for `site_id == 1 & meter==3`

In [53]:
summary.query("site_id == 1 & meter == 3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
17,1,3,Education,8,70245
18,1,3,Office,2,17567
19,1,3,Public services,2,17563


In [54]:
bld_id_source = train_data.query("site_id==1 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=3)

no more buildings on target with primary_use: Public services


In [55]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site1.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site1.csv", index=False)

In [56]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,8,70063
1,3,Office,2,17563
2,3,Public services,1,8781


In [57]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,8,210223
1,3,Office,2,52603
2,3,Public services,1,26301


***
#### mirror dataset for `site_id == 15 & meter == 3` 

In [58]:
summary.query("site_id == 15 & meter == 3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
219,15,3,Education,1,7332
220,15,3,Entertainment/public assembly,1,7465


In [59]:
bld_id_source = train_data.query("site_id==15 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=3)

In [60]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site15.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site15.csv", index=False)

In [61]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,1,8782
1,3,Entertainment/public assembly,1,8782


In [62]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,1,26302
1,3,Entertainment/public assembly,1,26302


***
#### mirror dataset for `site_id == 3 & meter==0`

In [63]:
summary.query("site_id == 3 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
47,3,0,Education,92,782104
48,3,0,Entertainment/public assembly,44,385016
49,3,0,Healthcare,6,52617
50,3,0,Lodging/residential,11,95871
51,3,0,Office,23,200005
52,3,0,Other,4,34857
53,3,0,Parking,1,8776
54,3,0,Public services,86,742136
55,3,0,Religious worship,1,8782
56,3,0,Retail,1,8782


In [64]:
bld_id_source = train_data.query("site_id==3 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,1,0,4,15], meter=0)

no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more

In [65]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site3.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site3.csv", index=False)

In [66]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,92,769971
1,0,Entertainment/public assembly,44,335713
2,0,Healthcare,1,8745
3,0,Lodging/residential,11,96576
4,0,Office,23,201480
5,0,Other,4,21623
6,0,Parking,1,8782
7,0,Public services,20,165655
8,0,Religious worship,1,8022
9,0,Retail,1,8776


In [67]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,91,2329151
1,0,Entertainment/public assembly,37,913332
2,0,Healthcare,1,26261
3,0,Lodging/residential,11,289013
4,0,Office,23,596240
5,0,Other,4,91615
6,0,Parking,1,26300
7,0,Public services,18,427948
8,0,Religious worship,1,25094
9,0,Retail,1,26267


***
#### mirror dataset for `site_id == 5 & meter == 0`

In [71]:
summary.query("site_id==5 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
65,5,0,Education,49,427493
66,5,0,Entertainment/public assembly,18,157347
67,5,0,Healthcare,1,8784
68,5,0,Lodging/residential,1,8737
69,5,0,Manufacturing/industrial,3,26329
70,5,0,Office,11,95493
71,5,0,Other,1,8738
72,5,0,Public services,5,40832


In [72]:
bld_id_source = train_data.query("site_id==5 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2, 15, 0], meter=0)

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Manufacturing/industrial


In [73]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site5.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site5.csv", index=False)

In [74]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,49,422718
1,0,Entertainment/public assembly,18,156906
2,0,Healthcare,1,8745
3,0,Lodging/residential,1,8781
4,0,Manufacturing/industrial,2,14944
5,0,Office,11,96570
6,0,Other,1,5406
7,0,Public services,5,43886


In [75]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,48,1246388
1,0,Entertainment/public assembly,18,469124
2,0,Healthcare,1,26261
3,0,Lodging/residential,1,26300
4,0,Office,11,283742
5,0,Other,1,22911
6,0,Public services,5,130179


***
#### mirror dataset for `site_id == 14 & meter == 0`

In [77]:
summary.query("site_id==14 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
165,14,0,Education,26,212273
166,14,0,Entertainment/public assembly,10,84836
167,14,0,Food sales and service,2,16820
168,14,0,Healthcare,10,81820
169,14,0,Lodging/residential,9,77275
170,14,0,Office,38,317928
171,14,0,Public services,7,59278


In [78]:
bld_id_source = train_data.query("site_id==14 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2, 0, 1, 4, 15], meter=0)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Healthcare


In [79]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site14.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site14.csv", index=False)

In [80]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,26,225480
1,0,Entertainment/public assembly,10,87205
2,0,Food sales and service,2,17563
3,0,Healthcare,1,8745
4,0,Lodging/residential,9,79019
5,0,Office,38,285933
6,0,Public services,7,61444


In [81]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,26,676782
1,0,Entertainment/public assembly,10,260023
2,0,Food sales and service,2,52562
3,0,Healthcare,1,26261
4,0,Lodging/residential,9,236441
5,0,Office,38,930226
6,0,Public services,7,178389


***
#### mirror dataset for `site_id == 14 & meter == 1`

In [82]:
summary.query("site_id==14 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
172,14,1,Education,23,197297
173,14,1,Entertainment/public assembly,7,61150
174,14,1,Food sales and service,2,17568
175,14,1,Healthcare,8,68857
176,14,1,Lodging/residential,8,70272
177,14,1,Office,31,266477
178,14,1,Public services,7,61396


In [83]:
bld_id_source = train_data.query("site_id==14 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2, 0, 15], meter=1)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare


In [84]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site14.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site14.csv", index=False)

In [85]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,23,201978
1,1,Entertainment/public assembly,7,61461
2,1,Food sales and service,2,17564
3,1,Healthcare,2,16115
4,1,Lodging/residential,8,70253
5,1,Office,31,246600
6,1,Public services,6,49543


In [86]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,23,604938
1,1,Entertainment/public assembly,7,184101
2,1,Food sales and service,2,52604
3,1,Healthcare,2,43189
4,1,Lodging/residential,8,210413
5,1,Office,29,690759
6,1,Public services,6,138916


***
#### mirror dataset for `site_id == 14 & meter == 2`

In [87]:
summary.query("site_id==14 & meter == 2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
179,14,2,Education,15,130767
180,14,2,Entertainment/public assembly,1,8418
181,14,2,Food sales and service,1,8784
182,14,2,Healthcare,6,51326
183,14,2,Lodging/residential,6,49324
184,14,2,Office,13,114054
185,14,2,Public services,1,8784


In [88]:
bld_id_source = train_data.query("site_id==14 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[15], meter=2)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Food sales and service


In [89]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site14.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site14.csv", index=False)

In [90]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,15,102189
1,2,Entertainment/public assembly,1,7460
2,2,Lodging/residential,6,38821
3,2,Office,12,89662
4,2,Public services,1,7472


In [91]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,12,204756
1,2,Entertainment/public assembly,1,17398
2,2,Lodging/residential,2,34784
3,2,Office,6,104381


***
#### mirror dataset for `site_id == 14 & meter == 3`

In [92]:
summary.query("site_id==14 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
186,14,3,Education,13,110986
187,14,3,Entertainment/public assembly,7,61475
188,14,3,Food sales and service,1,8783
189,14,3,Healthcare,3,25345
190,14,3,Lodging/residential,2,17568
191,14,3,Office,25,219429
192,14,3,Public services,6,52678


In [93]:
bld_id_source = train_data.query("site_id==14 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,1,15], meter=3)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Healthcare


In [94]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site14.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site14.csv", index=False)

In [95]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,13,114116
1,3,Entertainment/public assembly,7,61473
2,3,Food sales and service,1,8781
3,3,Lodging/residential,2,17026
4,3,Office,8,70261
5,3,Public services,3,26344


In [96]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,13,341876
1,3,Entertainment/public assembly,7,184113
2,3,Food sales and service,1,26301
3,3,Lodging/residential,2,52066
4,3,Office,6,157814
5,3,Public services,1,26301


***
#### mirror dataset for `site_id == 6 & meter == 0`

In [100]:
summary.query("site_id==6 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
73,6,0,Education,13,113891
74,6,0,Entertainment/public assembly,3,26319
75,6,0,Lodging/residential,11,96438
76,6,0,Office,8,69971
77,6,0,Public services,1,8775


In [101]:
bld_id_source = train_data.query("site_id==6 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=0)

In [102]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site6.csv", index=False)

In [103]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,13,111263
1,0,Entertainment/public assembly,3,26339
2,0,Lodging/residential,11,96576
3,0,Office,8,70214
4,0,Public services,1,8774


In [104]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,13,334255
1,0,Entertainment/public assembly,3,78758
2,0,Lodging/residential,11,289013
3,0,Office,8,203775
4,0,Public services,1,21906


***
#### mirror dataset for `site_id = 6 & meter == 1`

In [105]:
summary.query("site_id==6 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
78,6,1,Education,8,59751
79,6,1,Entertainment/public assembly,2,16868
80,6,1,Lodging/residential,4,35099
81,6,1,Office,6,44902
82,6,1,Public services,1,7487


In [107]:
bld_id_source = train_data.query("site_id==6 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=1)

In [108]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site6.csv", index=False)

In [109]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,8,68085
1,1,Entertainment/public assembly,2,17566
2,1,Lodging/residential,4,35128
3,1,Office,6,49313
4,1,Public services,1,8781


In [110]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,8,208245
1,1,Entertainment/public assembly,2,52606
2,1,Lodging/residential,4,105208
3,1,Office,6,154433
4,1,Public services,1,26301


***
#### mirror dataset for `site_id = 6 & meter == 2`

In [111]:
summary.query("site_id == 6 & meter == 2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
83,6,2,Education,10,80780
84,6,2,Entertainment/public assembly,2,11427
85,6,2,Lodging/residential,6,52544
86,6,2,Office,5,43839


In [112]:
bld_id_source = train_data.query("site_id==6 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[15], meter=2)

In [113]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site6.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site6.csv", index=False)

In [114]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,10,74052
1,2,Entertainment/public assembly,2,14940
2,2,Lodging/residential,6,42099
3,2,Office,5,37360


In [115]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,7,118635
1,2,Entertainment/public assembly,1,16848
2,2,Lodging/residential,2,34765
3,2,Office,2,34794


***
#### mirror dataset for `site_id = 7 & meter == 0`

In [116]:
summary.query("site_id == 7 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
87,7,0,Education,12,92152


In [117]:
bld_id_source = train_data.query("site_id==7 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[4], meter=0)

In [118]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site7.csv", index=False)

In [119]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,12,102432


In [120]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,11,284378


***
#### mirror dataset for `site_id = 7 & meter == 1`

In [121]:
summary.query("site_id == 7 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
88,7,1,Education,15,130956


In [123]:
bld_id_source = train_data.query("site_id==7 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=1)

In [124]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site7.csv", index=False)

In [125]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,15,131728


In [126]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,15,394528


***
#### mirror dataset for `site_id = 7 & meter == 2` 

In [127]:
summary.query("site_id==7 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
89,7,2,Education,12,104597


In [128]:
bld_id_source = train_data.query("site_id==7 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[15], meter=2)

In [129]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site7.csv", index=False)

In [130]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,12,86635


In [131]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,10,173955


***
#### mirror dataset for `site_id = 7 & meter == 3` on `site_id == 2 & meter == 3`

In [132]:
summary.query("site_id == 7 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
90,7,3,Education,3,26344


In [133]:
bld_id_source = train_data.query("site_id==7 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=3)

In [134]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site7.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site7.csv", index=False)

In [135]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,3,26141


In [136]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,3,78701


***
#### mirror dataset for `site_id = 8 & meter == 0` 

In [137]:
summary.query("site_id==8 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
91,8,0,Entertainment/public assembly,24,172977
92,8,0,Office,7,53827
93,8,0,Other,9,71246
94,8,0,Public services,28,227229
95,8,0,Warehouse/storage,2,16054


In [138]:
bld_id_source = train_data.query("site_id==8 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[0,2,15,1,4], meter=0)

no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Public services


In [139]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site8.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site8.csv", index=False)

In [140]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Entertainment/public assembly,24,192371
1,0,Office,7,37838
2,0,Other,5,27029
3,0,Public services,20,165655
4,0,Warehouse/storage,2,17316


In [141]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Entertainment/public assembly,24,609596
1,0,Office,7,160376
2,0,Other,5,114272
3,0,Public services,18,427948
4,0,Warehouse/storage,2,51592


***
#### mirror dataset for `site_id = 9 & meter == 0` 

In [142]:
summary.query("site_id==9 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
96,9,0,Education,63,527525
97,9,0,Entertainment/public assembly,17,143242
98,9,0,Lodging/residential,19,162302
99,9,0,Office,16,131285
100,9,0,Public services,2,17118
101,9,0,Services,5,42490


In [143]:
bld_id_source = train_data.query("site_id==9 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,4], meter=0)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Lodging/residential


there are not buildings with primary_use='Services' in leak data. I will select 5 buildings from `site==4` & `primary_use="Public services"` to do the trick

In [144]:
summary.query("site_id==4 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
58,4,0,Education,66,553524
59,4,0,Entertainment/public assembly,9,62185
60,4,0,Lodging/residential,4,29673
61,4,0,Parking,3,20589
62,4,0,Public services,6,50602
63,4,0,Technology/science,2,17544
64,4,0,Utility,1,8750


In [145]:
extra_buildings = train_data.query("site_id==4 & meter==0 & primary_use=='Public services'").building_id.unique()[:5].tolist()

In [146]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_extra_train_data = train_data.query("building_id in @extra_buildings & meter==0")
_extra_train_data["primary_use"] = 'Services'
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter0_site9.csv", index=False)

_leak = leak.query("building_id in @mirrors.values() & meter==0")
_extra_leak_data = leak.query("building_id in @extra_buildings & meter==0")
_extra_leak_data["primary_use"] = 'Services'
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter0_site9.csv", index=False)

In [147]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,63,545603
1,0,Entertainment/public assembly,17,147966
2,0,Lodging/residential,16,135032
3,0,Office,16,140360
4,0,Public services,2,17557
5,0,Services,5,43914


In [148]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,62,1612141
1,0,Entertainment/public assembly,17,442790
2,0,Lodging/residential,13,341581
3,0,Office,16,414383
4,0,Public services,2,48205
5,0,Services,5,129330


***
#### mirror dataset for `site_id = 9 & meter == 1` 

In [149]:
summary.query("site_id==9 & meter == 1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
102,9,1,Education,55,482335
103,9,1,Entertainment/public assembly,14,122770
104,9,1,Lodging/residential,13,114005
105,9,1,Office,11,96468
106,9,1,Public services,2,17535


In [150]:
bld_id_source = train_data.query("site_id==9 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,0], meter=1)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly
no more buildings on target with primary_use: Entertainment/public assembly


In [151]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site9.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site9.csv", index=False)

In [152]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,55,476286
1,1,Entertainment/public assembly,9,79024
2,1,Lodging/residential,13,112379
3,1,Office,11,96602
4,1,Public services,2,17563


In [154]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,54,1413584
1,1,Entertainment/public assembly,9,236704
2,1,Lodging/residential,13,340139
3,1,Office,11,289322
4,1,Public services,2,52603


***
#### mirror dataset for `site_id = 9 & meter == 2`  **

In [155]:
summary.query("site_id==9 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
107,9,2,Education,53,465059
108,9,2,Entertainment/public assembly,13,114076
109,9,2,Lodging/residential,13,110559
110,9,2,Office,8,70219
111,9,2,Public services,2,17549


In [157]:
bld_id_source = train_data.query("site_id==9 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[15], meter=2)

no more buildings on target with primary_use: Lodging/residential
no more buildings on target with primary_use: Education
no more buildings on target with primary_use: Entertainment/public assembly


In [158]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site9.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site9.csv", index=False)

In [159]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,29,206114
1,2,Entertainment/public assembly,9,65089
2,2,Lodging/residential,12,80949
3,2,Office,8,59775
4,2,Public services,2,14941


In [160]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,22,374185
1,2,Entertainment/public assembly,5,86436
2,2,Lodging/residential,6,102819
3,2,Office,4,69586


***
#### mirror dataset for `site_id = 10 & meter == 0` 

In [163]:
summary.query("site_id==10 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
112,10,0,Education,14,109409
113,10,0,Entertainment/public assembly,4,34570
114,10,0,Lodging/residential,3,26305
115,10,0,Office,5,37169
116,10,0,Other,3,25933
117,10,0,Technology/science,1,2696


In [164]:
bld_id_source = train_data.query("site_id==10 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[4,2], meter=0)

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Other


there are no buildings with `primary_use="Other"` in leak data.

In [165]:
extra_buildings = train_data.query("site_id==4 & meter==0 & primary_use=='Parking'").building_id.unique().tolist()

In [166]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_extra_train_data = train_data.query("building_id in @extra_buildings & meter==0")
_extra_train_data["primary_use"] = 'Other'
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter0_site10.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_extra_leak_data = leak.query("building_id in @extra_buildings & meter==0")
_extra_leak_data["primary_use"] = 'Other'
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter0_site10.csv", index=False)

In [167]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,14,119670
1,0,Entertainment/public assembly,4,32062
2,0,Lodging/residential,3,20890
3,0,Office,5,43750
4,0,Other,3,20589
5,0,Technology/science,1,8775


In [169]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,10,262651
1,0,Entertainment/public assembly,4,102007
2,0,Office,5,129106
3,0,Other,2,46657
4,0,Technology/science,1,26304


***
#### mirror dataset for `site_id = 10 & meter == 1`

In [171]:
summary.query("site_id==10 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
118,10,1,Education,6,52634
119,10,1,Other,2,17527
120,10,1,Technology/science,1,8774


In [172]:
bld_id_source = train_data.query("site_id==10 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=1)

no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Technology/science


In [173]:
extra_buildings  = train_data.query("site_id==2 & meter==1 & primary_use in ('Healthcare', 'Food sales and service')").building_id.unique().tolist()

In [174]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_extra_train_data = train_data.query("building_id in @extra_buildings & meter==1")
_extra_train_data["primary_use"] = _extra_train_data.primary_use.map({"Healthcare":"Technology/science", "Food sales and service":"Other"})
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter1_site10.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_extra_leak_data = leak.query("building_id in @extra_buildings & meter==1")
_extra_leak_data["primary_use"] = _extra_leak_data.primary_use.map({"Healthcare":"Technology/science", "Food sales and service":"Other"})
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter1_site10.csv", index=False)

In [175]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,6,52696
1,1,Other,2,17564
2,1,Technology/science,1,8783


In [177]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,6,157816
1,1,Other,2,52604
2,1,Technology/science,1,26303


***
#### mirror dataset for `site_id = 10 & meter == 3` 

In [179]:
summary.query("site_id == 10 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
121,10,3,Education,9,78350
122,10,3,Entertainment/public assembly,1,8775
123,10,3,Technology/science,1,8774


In [180]:
bld_id_source = train_data.query("site_id==10 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=3)

no more buildings on target with primary_use: Technology/science


In [181]:
extra_buildings  = train_data.query("site_id==2 & meter==3 & primary_use=='Public services'").building_id.unique().tolist()

In [182]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_extra_train_data = train_data.query("building_id in @extra_buildings & meter==3")
_extra_train_data["primary_use"] = "Technology/science"
_train_data = pd.concat([_train_data, _extra_train_data])
_train_data.to_csv("mirrors/train_data_meter3_site10.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_extra_leak_data = leak.query("building_id in @extra_buildings & meter==3")
_extra_leak_data["primary_use"] = "Technology/science"
_leak = pd.concat([_leak, _extra_leak_data])
_leak.to_csv("mirrors/leak_data_meter3_site10.csv", index=False)

In [183]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,9,79023
1,3,Entertainment/public assembly,1,8783
2,3,Technology/science,1,8781


In [185]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,9,236703
1,3,Entertainment/public assembly,1,26303
2,3,Technology/science,1,26301


***
#### mirror dataset for `site_id = 11 & meter==0` 

In [187]:
summary.query("site_id==11 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
124,11,0,Education,5,43400


In [188]:
bld_id_source = train_data.query("site_id==11 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[4], meter=0)

In [189]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site11.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site11.csv", index=False)

In [190]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,5,43583


In [191]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,5,131198


***
#### mirror dataset for `site_id = 11 & meter==1` 

In [394]:
summary.query("site_id==11 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
125,11,1,Education,4,32656


In [192]:
bld_id_source = train_data.query("site_id==11 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=1)

In [193]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site11.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site11.csv", index=False)

In [194]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,4,35127


In [196]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,4,105207


***
#### mirror dataset for `site_id = 11 & meter==3` 

In [197]:
summary.query("site_id==11 & meter==3")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
126,11,3,Education,5,43403


In [198]:
bld_id_source = train_data.query("site_id==11 & meter==3").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2], meter=3)

In [199]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==3")
_train_data.to_csv("mirrors/train_data_meter3_site11.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==3")
_leak.to_csv("mirrors/leak_data_meter3_site11.csv", index=False)

In [200]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,5,43717


In [201]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,3,Education,5,131317


***
#### mirror dataset for `site_id = 12 & meter==0` 

In [202]:
summary.query("site_id==12 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
127,12,0,Education,20,174519
128,12,0,Entertainment/public assembly,2,17283
129,12,0,Office,9,79034
130,12,0,Public services,1,8784
131,12,0,Retail,3,26090
132,12,0,Technology/science,1,8779


In [203]:
bld_id_source = train_data.query("site_id==12 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[0,4], meter=0)

no more buildings on target with primary_use: Technology/science
no more buildings on target with primary_use: Public services


In [204]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site12.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site12.csv", index=False)

In [205]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,20,106115
1,0,Entertainment/public assembly,2,10806
2,0,Office,9,48651
3,0,Public services,1,8783
4,0,Retail,3,16216
5,0,Technology/science,1,8769


In [206]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,20,451718
1,0,Entertainment/public assembly,2,45816
2,0,Office,9,202669
3,0,Public services,1,26304
4,0,Retail,3,68750
5,0,Technology/science,1,26304


***
#### mirror dataset for `site_id = 13 & meter==0` 

In [207]:
summary.query("site_id==13 & meter==0")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
133,13,0,Education,23,196465
134,13,0,Entertainment/public assembly,6,52528
135,13,0,Food sales and service,1,8784
136,13,0,Healthcare,3,26321
137,13,0,Lodging/residential,10,87537
138,13,0,Manufacturing/industrial,5,43919
139,13,0,Office,70,599934
140,13,0,Other,3,26338
141,13,0,Parking,7,61286
142,13,0,Public services,5,43911


In [208]:
bld_id_source = train_data.query("site_id==13 & meter==0").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,4,0,1,15], meter=0)

no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Technology/science
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Other
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Manufacturing/industrial

In [209]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==0")
_train_data.to_csv("mirrors/train_data_meter0_site13.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==0")
_leak.to_csv("mirrors/leak_data_meter0_site13.csv", index=False)

In [210]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,23,201707
1,0,Entertainment/public assembly,6,51584
2,0,Food sales and service,1,8782
3,0,Healthcare,1,8745
4,0,Lodging/residential,10,87795
5,0,Manufacturing/industrial,2,14944
6,0,Office,70,519887
7,0,Other,3,16217
8,0,Parking,7,52327
9,0,Public services,5,43894


In [212]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,0,Education,23,601182
1,0,Entertainment/public assembly,6,154117
2,0,Food sales and service,1,26270
3,0,Healthcare,1,26261
4,0,Lodging/residential,10,262713
5,0,Office,66,1595853
6,0,Other,3,68702
7,0,Parking,6,148409
8,0,Public services,5,125828
9,0,Technology/science,1,26304



***
#### mirror dataset for `site_id = 13 & meter==1` 

In [213]:
summary.query("site_id==13 & meter==1")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
146,13,1,Education,18,158021
147,13,1,Entertainment/public assembly,5,43920
148,13,1,Healthcare,3,26352
149,13,1,Lodging/residential,2,17568
150,13,1,Office,46,404049
151,13,1,Parking,1,8781
152,13,1,Public services,5,43916


In [214]:
bld_id_source = train_data.query("site_id==13 & meter==1").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[2,0,15], meter=1)

no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Healthcare


In [215]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==1")
_train_data.to_csv("mirrors/train_data_meter1_site13.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==1")
_leak.to_csv("mirrors/leak_data_meter1_site13.csv", index=False)

In [216]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,18,155905
1,1,Entertainment/public assembly,5,43896
2,1,Healthcare,2,16115
3,1,Lodging/residential,2,17564
4,1,Office,39,305203
5,1,Public services,5,42457


In [217]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,1,Education,18,471265
1,1,Entertainment/public assembly,5,131496
2,1,Healthcare,2,43189
3,1,Lodging/residential,2,52604
4,1,Office,33,756495
5,1,Public services,5,122052


***
#### mirror dataset for `site_id = 13 & meter==2` **

In [218]:
summary.query("site_id==13 & meter==2")

Unnamed: 0,site_id,meter,primary_use,n_buildings,n_rows
153,13,2,Education,17,149202
154,13,2,Entertainment/public assembly,5,43761
155,13,2,Food sales and service,1,8784
156,13,2,Healthcare,3,26187
157,13,2,Lodging/residential,8,70272
158,13,2,Manufacturing/industrial,3,26352
159,13,2,Office,41,360098
160,13,2,Other,1,8784
161,13,2,Parking,2,17568
162,13,2,Public services,4,35136


In [222]:
bld_id_source = train_data.query("site_id==13 & meter==2").building_id.unique().tolist()
mirrors = find_mirror_buildings_onall(train_data, building_metadata, bld_id_source, 
                                      sites_target=[15], meter=2)

no more buildings on target with primary_use: Healthcare
no more buildings on target with primary_use: Office
no more buildings on target with primary_use: Public services
no more buildings on target with primary_use: Food sales and service
no more buildings on target with primary_use: Parking
no more buildings on target with primary_use: Services
no more buildings on target with primary_use: Manufacturing/industrial
no more buildings on target with primary_use: Warehouse/storage
no more buildings on target with primary_use: Other


In [223]:
_train_data = train_data.query("building_id in @mirrors.values() & meter==2")
_train_data.to_csv("mirrors/train_data_meter2_site13.csv", index=False)

_leak =  leak.query("building_id in @mirrors.values() & meter==2")
_leak.to_csv("mirrors/leak_data_meter2_site13.csv", index=False)

In [224]:
validation(_train_data)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,17,116477
1,2,Entertainment/public assembly,5,35210
2,2,Lodging/residential,8,55689
3,2,Manufacturing/industrial,1,7468
4,2,Office,12,89662
5,2,Public services,3,22413


In [225]:
validation(_leak)

Unnamed: 0,meter,primary_use,n_buildings,n_rows
0,2,Education,13,226133
1,2,Entertainment/public assembly,2,34794
2,2,Lodging/residential,4,69558
3,2,Manufacturing/industrial,1,17395
4,2,Office,6,104381


***

small verification that all mirror files exist

In [241]:
import os

In [242]:
for i,row in train_data.loc[:, ["site_id","meter"]].drop_duplicates().iterrows():
    if not os.path.exists(f"mirrors/leak_data_meter{row.meter}_site{row.site_id}.csv"):
        print(f"mirrors/leak_data_meter{row.meter}_site{row.site_id}.csv")
    if not os.path.exists(f"mirrors/train_data_meter{row.meter}_site{row.site_id}.csv"):
        print(f"mirrors/train_data_meter{row.meter}_site{row.site_id}.csv")

***