In [5]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from scripts.utils import reduce_mem_usage
from scripts.anomaly import anomaly_detector
from sklearn.model_selection import StratifiedKFold
import h5py
import ghalton
from scipy.stats import rankdata
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import pickle
from sklearn.preprocessing import RobustScaler

NO_WEATHER = False

***

function to perform correction of weather data to localtime

In [6]:
# Hours subtracted from each site's weather timestamps to express them in
# local time. Keys are site_id values, values are whole-hour offsets.
sites_time_corrections = {0:4, 1:0, 2:7, 3:4, 4:7, 5:0, 6:4, 7:4, 8:4,
                          9:5, 10:7, 11:4, 12:0, 13:5, 14:4, 15:4}

def set_localtime(data, time_corrections=None):
    """Shift every site's weather timestamps to local time on a shared hourly grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Weather records with at least ``site_id`` and ``timestamp`` columns.
    time_corrections : dict, optional
        Mapping ``site_id -> hours to subtract``. Defaults to the module-level
        ``sites_time_corrections`` table.

    Returns
    -------
    pandas.DataFrame
        One row per (site, hour) for every site listed in the corrections
        table, covering the hourly range between the minimum and maximum
        timestamp of ``data``. Hours with no shifted observation hold NaN in
        the weather columns; shifted observations that fall outside the range
        are discarded by the reindex. ``timestamp`` is the first column.
    """
    if time_corrections is None:
        time_corrections = sites_time_corrections

    # The grid is built from the *unshifted* timestamps, so every site ends up
    # on the same index. A Timedelta step is used instead of the "H" alias,
    # which recent pandas versions deprecate.
    tidx = pd.date_range(data.timestamp.min(), data.timestamp.max(),
                         freq=pd.Timedelta(hours=1))

    all_ts = []
    for site_id, hdiff in time_corrections.items():
        site_rows = data.loc[data.site_id == site_id]
        ts = (site_rows
              .assign(timestamp=lambda x: x.timestamp - pd.offsets.Hour(hdiff))
              .set_index("timestamp")
              .reindex(tidx)
              # reindex introduces NaN in site_id for the new rows; restore it
              .assign(site_id=site_id))
        all_ts.append(ts)
    return pd.concat(all_ts).reset_index().rename({"index": "timestamp"}, axis=1)

function to fill missing values in weather dataframes

In [7]:
def weather_imputation(weather_data):
    """Fill missing weather values by time interpolation within each site.

    Parameters
    ----------
    weather_data : pandas.DataFrame
        Weather records with ``timestamp`` and ``site_id`` columns; the
        remaining columns are interpolated.

    Returns
    -------
    pandas.DataFrame
        Same columns with ``timestamp`` first. Rows are grouped per site in
        order of first appearance. Gaps are filled in both directions, so
        leading and trailing NaNs are filled too; a column that is entirely
        NaN for a site stays NaN.
    """
    indexed = weather_data.set_index("timestamp")
    # Interpolate each site explicitly instead of groupby(...).apply(...):
    # newer pandas prepends the group key to the result index, which makes the
    # following reset_index() fail because `site_id` is already a column.
    filled = [site_rows.interpolate(method="time", limit_direction="both")
              for _, site_rows in indexed.groupby("site_id", sort=False)]
    if not filled:
        # nothing to interpolate (empty input)
        return indexed.reset_index()
    return pd.concat(filled).reset_index()

***
## train data

In [8]:
train = pd.read_csv("data/train.csv", parse_dates=["timestamp"])

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
building_id      int64
meter            int64
timestamp        datetime64[ns]
meter_reading    float64
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 616.9 MB


***

In [10]:
building_metadata = pd.read_csv("data/building_metadata.csv")

In [11]:
building_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
site_id        1449 non-null int64
building_id    1449 non-null int64
primary_use    1449 non-null object
square_feet    1449 non-null int64
year_built     675 non-null float64
floor_count    355 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 68.0+ KB


In [12]:
print(f"Missing values in year_built: {np.sum(building_metadata.year_built.isna())}")

Missing values in year_built: 774


In [13]:
print(f"Missing values in floor_count: {np.sum(building_metadata.floor_count.isna())}")

Missing values in floor_count: 1094


In [14]:
building_metadata.drop(["year_built","floor_count"], axis=1, inplace=True)

***

In [15]:
weather_train = pd.read_csv("data/weather_train.csv", parse_dates=["timestamp"])

In [16]:
weather_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
site_id               139773 non-null int64
timestamp             139773 non-null datetime64[ns]
air_temperature       139718 non-null float64
cloud_coverage        70600 non-null float64
dew_temperature       139660 non-null float64
precip_depth_1_hr     89484 non-null float64
sea_level_pressure    129155 non-null float64
wind_direction        133505 non-null float64
wind_speed            139469 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 9.6 MB


In [17]:
weather_train = set_localtime(weather_train)

***

In [18]:
100*weather_train.isna().sum()/weather_train.shape[0]

timestamp              0.000000
site_id                0.000000
air_temperature        0.628273
cloud_coverage        49.787967
dew_temperature        0.669541
precip_depth_1_hr     36.359432
sea_level_pressure     8.141223
wind_direction         5.047530
wind_speed             0.805442
dtype: float64

In [19]:
weather_train.groupby('site_id').apply(lambda group: group.isna().sum())

Unnamed: 0_level_0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,7,3833,7,4,89,254,4
1,0,0,22,7083,22,8784,73,24,21
2,0,0,9,2361,9,73,52,599,13
3,0,0,12,3648,14,44,166,161,12
4,0,0,7,4237,9,1324,80,112,7
5,0,0,31,6059,31,8784,8784,324,32
6,0,0,15,2995,15,5,179,784,44
7,0,0,174,8784,197,8053,182,174,174
8,0,0,7,3833,7,4,89,254,4
9,0,0,13,3462,15,15,247,2565,105


In [20]:
weather_train.groupby('site_id').apply(lambda group: 100*group.isna().sum()/group.shape[0])

Unnamed: 0_level_0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.0,0.0,0.07969,43.636157,0.07969,0.045537,1.013206,2.891621,0.045537
1,0.0,0.0,0.250455,80.635246,0.250455,100.0,0.831056,0.273224,0.239071
2,0.0,0.0,0.102459,26.878415,0.102459,0.831056,0.591985,6.819217,0.147996
3,0.0,0.0,0.136612,41.530055,0.159381,0.500911,1.8898,1.832878,0.136612
4,0.0,0.0,0.07969,48.235428,0.102459,15.07286,0.910747,1.275046,0.07969
5,0.0,0.0,0.352914,68.977687,0.352914,100.0,100.0,3.688525,0.364299
6,0.0,0.0,0.170765,34.096084,0.170765,0.056922,2.037796,8.925319,0.500911
7,0.0,0.0,1.980874,100.0,2.242714,91.678051,2.071949,1.980874,1.980874
8,0.0,0.0,0.07969,43.636157,0.07969,0.045537,1.013206,2.891621,0.045537
9,0.0,0.0,0.147996,39.412568,0.170765,0.170765,2.811931,29.20082,1.195355


In [21]:
weather_train.drop(["cloud_coverage", "precip_depth_1_hr"], axis=1, inplace=True)

In [22]:
weather_train = weather_imputation(weather_train)

In [23]:
weather_train.groupby('site_id').apply(lambda group: group.isna().sum())

Unnamed: 0_level_0,timestamp,site_id,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,0,0,8784,0,0
6,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0


In [24]:
weather_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140544 entries, 0 to 140543
Data columns (total 7 columns):
timestamp             140544 non-null datetime64[ns]
site_id               140544 non-null int64
air_temperature       140544 non-null float64
dew_temperature       140544 non-null float64
sea_level_pressure    131760 non-null float64
wind_direction        140544 non-null float64
wind_speed            140544 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 7.5 MB


***
weather feature normalization

In [None]:
# Weather columns rescaled per site (wind_direction is not in the list).
features = ["air_temperature", "dew_temperature", "sea_level_pressure", "wind_speed"]

# One RobustScaler is fitted per site and the scaled values overwrite the
# originals in place, so re-running this cell rescales already-scaled data.
# NOTE(review): this cell has no execution count while the matching test-set
# cell does; confirm train and test weather are scaled consistently.
for site_id in weather_train.site_id.unique():
    site_rows = weather_train.query("site_id == @site_id")
    scaled = RobustScaler().fit_transform(site_rows.loc[:, features].values)
    weather_train.loc[site_rows.index, features] = scaled

***
merges the datasets

In [25]:
# Attach building metadata to every reading; weather is joined on
# (timestamp, site_id) unless the weather-free variant was requested.
train_data = pd.merge(train, building_metadata, how="left", on=["building_id"])
if not NO_WEATHER:
    train_data = train_data.merge(weather_train, how="left", on=["timestamp","site_id"])

***
removes anomalous data for `site_id=0`

In [26]:
# For every (building_id, meter) series of site 0, look only at the readings
# before the cutoff. When the 75th percentile of those early readings is
# exactly 0, all of that series' early rows are dropped.
# The early slice is taken once and the quantile is computed per group, rather
# than re-querying the whole site frame twice for each series.
early = train_data.query("site_id == 0 & timestamp < '2016-05-20 18:00:00'")
early_q75 = (early
             .groupby(["building_id", "meter"])
             .meter_reading
             .transform(lambda readings: readings.quantile(0.75)))
delete_idx = early.index[(early_q75 == 0).values]

train_data = train_data.drop(delete_idx, axis=0).reset_index(drop=True)

In [27]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19871167 entries, 0 to 19871166
Data columns (total 12 columns):
building_id           int64
meter                 int64
timestamp             datetime64[ns]
meter_reading         float64
site_id               int64
primary_use           object
square_feet           int64
air_temperature       float64
dew_temperature       float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
dtypes: datetime64[ns](1), float64(6), int64(4), object(1)
memory usage: 1.8+ GB


***
target correction for `site_id = 0`

* kBTU -> kWh (* 0.2931)

In [28]:
# Rescale meter 0 readings of site 0 by 0.2931 (kBTU -> kWh).
# Not idempotent: running this cell twice applies the factor twice.
site0_meter0 = train_data.query("site_id == 0 & meter == 0").index
train_data.loc[site0_meter0, "meter_reading"] *= 0.2931

***
removes abnormal behavior for `building_id=363` before `2016-07-25`

In [29]:
# Discard meter 0 rows of building 363 up to and including 2016-07-24 23:00,
# then renumber the index so it stays contiguous.
early_363 = train_data.query("building_id == 363 & meter == 0 & timestamp <= '2016-07-24 23:00:00'")
train_data = train_data.drop(early_363.index, axis=0).reset_index(drop=True)

In [30]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19866224 entries, 0 to 19866223
Data columns (total 12 columns):
building_id           int64
meter                 int64
timestamp             datetime64[ns]
meter_reading         float64
site_id               int64
primary_use           object
square_feet           int64
air_temperature       float64
dew_temperature       float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
dtypes: datetime64[ns](1), float64(6), int64(4), object(1)
memory usage: 1.8+ GB


***
removes rows with `meter_reading=0` and `meter=0` 

In [31]:
# Meter 0 rows with a reading of exactly 0 are treated as outliers and removed.
zero_rows = train_data.query("meter==0 and meter_reading==0").index
print(f"Number of outliers: {len(zero_rows)}")
train_data = train_data.drop(zero_rows, axis=0).reset_index(drop=True)

Number of outliers: 187520


*** 
target correction and cleaning for `building==1099 & meter==2`. The correction factor scales the meter_reading of building 1099 to a range of values similar to that of buildings in the same site with the same primary_use (education).

In [32]:
# The replacement value is the median of *all* November readings of this
# series, taken before any value is overwritten (so the spikes count too).
november_median = train_data.query("building_id==1099 & meter==2 & timestamp.dt.month==11").meter_reading.median()
# November readings above 1e5 are the ones overwritten with that median.
spike_rows = train_data.query("building_id==1099 & meter==2 & timestamp.dt.month==11 & meter_reading > 1e5").index
train_data.loc[spike_rows, "meter_reading"] = november_median

In [35]:
# Constant multiplier applied to every meter 2 reading of building 1099.
# NOTE(review): 2.190470e7 is presumably the series maximum before scaling
# (the describe() output below shows a maximum of 50000) - confirm.
# Not idempotent: re-running the cell shrinks the readings again.
correction_factor = 5e4/2.190470e7
idx = train_data.query("building_id==1099 & meter==2").index
train_data.loc[idx, "meter_reading"] *= correction_factor

In [40]:
train_data.loc[idx, "meter_reading"].describe()

count     8784.000000
mean      8512.997910
std      14202.913680
min          0.328697
25%          3.670445
50%         15.503522
75%      14254.161207
max      50000.000000
Name: meter_reading, dtype: float64

***
### outlier removal

In [None]:
# Run the anomaly detector on every (building_id, meter) series and collect
# the train_data row labels of the readings it flags.
ts_uid_values = train_data.loc[:, ["building_id","meter"]].drop_duplicates()
outliers_indexes = list()

for _,row in ts_uid_values.iterrows():
    ts = train_data.query("building_id == @row.building_id & meter == @row.meter")
    # reset_index() renumbers the series 0..n-1 and keeps the original
    # train_data labels in the "index" column
    ts = ts.reset_index()
    # NOTE(review): anomaly_detector is assumed to return (position, value)
    # pairs relative to the series it receives - confirm in scripts.anomaly
    outliers = anomaly_detector(ts.meter_reading, window_size=48, sigma=5)
    if len(outliers) > 0:
        print(f"{len(outliers)} found for building_id=={row.building_id} & meter={row.meter}")
    outlier_positions = [idx for idx,_ in outliers]
    # Map positions back to train_data labels. Reading `.index` here would
    # only return the per-series positions again.
    outliers_indexes.append(ts.loc[outlier_positions, "index"].values)

***

In [29]:
train_data = reduce_mem_usage(train_data)

Mem. usage decreased to 713.15 Mb (60.4% reduction)


In [30]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19678704 entries, 0 to 19678703
Data columns (total 12 columns):
building_id           int16
meter                 int8
timestamp             datetime64[ns]
meter_reading         float32
site_id               int8
primary_use           object
square_feet           int32
air_temperature       float16
dew_temperature       float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtypes: datetime64[ns](1), float16(5), float32(1), int16(1), int32(1), int8(2), object(1)
memory usage: 713.1+ MB


In [30]:
# Persist the prepared training frame; the weather-free variant is written
# to its own file, both under the same HDF key.
train_h5_path = "data/train_data_nw.h5" if NO_WEATHER else "data/train_data.h5"
train_data.to_hdf(train_h5_path, key="train_data")

***
## test data

In [31]:
test = pd.read_csv("data/test.csv", parse_dates=["timestamp"])

In [32]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
row_id         int64
building_id    int64
meter          int64
timestamp      datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.2 GB


***

In [33]:
weather_test = pd.read_csv("data/weather_test.csv", parse_dates=["timestamp"])

In [34]:
weather_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277243 entries, 0 to 277242
Data columns (total 9 columns):
site_id               277243 non-null int64
timestamp             277243 non-null datetime64[ns]
air_temperature       277139 non-null float64
cloud_coverage        136795 non-null float64
dew_temperature       276916 non-null float64
precip_depth_1_hr     181655 non-null float64
sea_level_pressure    255978 non-null float64
wind_direction        264873 non-null float64
wind_speed            276783 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 19.0 MB


In [35]:
weather_test = set_localtime(weather_test)

In [36]:
weather_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280320 entries, 0 to 280319
Data columns (total 9 columns):
timestamp             280320 non-null datetime64[ns]
site_id               280320 non-null int64
air_temperature       277082 non-null float64
cloud_coverage        136771 non-null float64
dew_temperature       276859 non-null float64
precip_depth_1_hr     181603 non-null float64
sea_level_pressure    255921 non-null float64
wind_direction        264816 non-null float64
wind_speed            276726 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 19.2 MB


***

In [37]:
100*weather_test.isna().sum()/weather_test.shape[0]

timestamp              0.000000
site_id                0.000000
air_temperature        1.155108
cloud_coverage        51.208975
dew_temperature        1.234660
precip_depth_1_hr     35.215825
sea_level_pressure     8.703981
wind_direction         5.530822
wind_speed             1.282106
dtype: float64

In [38]:
weather_test.groupby('site_id').apply(lambda group: 100*group.isna().sum()/group.shape[0])

Unnamed: 0_level_0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.0,0.0,0.022831,43.076484,0.022831,0.188356,1.78653,2.745434,0.034247
1,0.0,0.0,1.455479,79.372146,1.455479,100.0,1.700913,1.358447,1.329909
2,0.0,0.0,0.039954,31.090183,0.05137,0.15411,0.228311,6.849315,0.091324
3,0.0,0.0,0.05137,44.840183,0.057078,0.285388,2.186073,1.478311,0.085616
4,0.0,0.0,0.057078,45.890411,0.074201,1.666667,0.736301,1.221461,0.057078
5,0.0,0.0,1.592466,66.347032,1.598174,100.0,100.0,4.646119,1.598174
6,0.0,0.0,0.131279,37.01484,0.131279,0.359589,2.328767,8.167808,0.399543
7,0.0,0.0,5.428082,100.0,5.719178,89.765982,5.667808,5.428082,5.428082
8,0.0,0.0,0.022831,43.076484,0.022831,0.188356,1.78653,2.745434,0.034247
9,0.0,0.0,0.553653,44.223744,1.19863,0.605023,3.441781,31.073059,1.689498


In [39]:
weather_test.drop(["cloud_coverage", "precip_depth_1_hr"], axis=1, inplace=True)

In [40]:
weather_test = weather_imputation(weather_test)

In [41]:
weather_test.groupby('site_id').apply(lambda group: 100*group.isna().sum()/group.shape[0])

Unnamed: 0_level_0,timestamp,site_id,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,100.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***

In [42]:
# Weather columns rescaled per site (wind_direction is not in the list).
features = ["air_temperature", "dew_temperature", "sea_level_pressure", "wind_speed"]

# A fresh RobustScaler is fitted on each site's *test* rows and the scaled
# values overwrite the originals in place.
# NOTE(review): the scalers are fitted on test data, independently of the
# training weather - confirm this is intended.
for site_id in weather_test.site_id.unique():
    site_rows = weather_test.query("site_id == @site_id")
    scaled = RobustScaler().fit_transform(site_rows.loc[:, features].values)
    weather_test.loc[site_rows.index, features] = scaled

  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  overwrite_input=overwrite_input, interpolation=interpolation


***

In [43]:
# Attach building metadata to every test row; weather is joined on
# (timestamp, site_id) unless the weather-free variant was requested.
test_data = pd.merge(test, building_metadata, how="left", on=["building_id"])
if not NO_WEATHER:
    test_data = test_data.merge(weather_test, how="left", on=["timestamp","site_id"])

In [44]:
test_data = reduce_mem_usage(test_data)

Mem. usage decreased to 1829.23 Mb (55.8% reduction)


In [45]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 12 columns):
row_id                int32
building_id           int16
meter                 int8
timestamp             datetime64[ns]
site_id               int8
primary_use           object
square_feet           int32
air_temperature       float16
dew_temperature       float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtypes: datetime64[ns](1), float16(5), int16(1), int32(2), int8(2), object(1)
memory usage: 1.8+ GB


In [45]:
# Persist the prepared test frame; the weather-free variant is written to
# its own file, both under the same HDF key.
test_h5_path = "data/test_data_nw.h5" if NO_WEATHER else "data/test_data.h5"
test_data.to_hdf(test_h5_path, key="test_data")

verify that row_id column is sorted in `test_data`

In [46]:
np.all(test_data.row_id.diff(1).fillna(1) == 1)

True

***
## Leakage data

In [46]:
# Load the leaked readings, drop exact duplicate rows and store the meter
# id as an integer.
leak = pd.read_feather("data/leak.feather").drop_duplicates()
leak["meter"] = leak["meter"].astype(int)

In [47]:
# kBTU -> kWh correction for the leaked readings of site 0.
# It is restricted to meter 0 so it matches the correction applied to the
# training data, which only rescales `site_id == 0 & meter == 0`. Converting
# every meter here would put the other site-0 meters on a different scale
# than in train_data.
# Not idempotent: re-running the cell applies the factor again.
buildings_site0 = building_metadata.query("site_id == 0").building_id.unique()
idx = leak.query("building_id in @buildings_site0 & meter == 0").index
leak.loc[idx, "meter_reading"] = 0.2931*leak.loc[idx, "meter_reading"]

In [48]:
# Meter 0 rows with a reading of exactly 0 are treated as outliers and removed.
zero_rows = leak.query("meter==0 and meter_reading==0").index
print(f"Number of outliers: {len(zero_rows)}")
leak = leak.drop(zero_rows, axis=0).reset_index(drop=True)

Number of outliers: 500473


In [49]:
leak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16115767 entries, 0 to 16115766
Data columns (total 4 columns):
building_id      int64
meter            int64
meter_reading    float64
timestamp        datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 491.8 MB


In [50]:
# Stack the train and test weather so leaked rows from either period can be matched.
# NOTE(review): the two frames are only comparable if both went through the
# same per-site scaling step - confirm before relying on the weather columns.
weather_all = pd.concat([weather_train, weather_test])

# `leak` is overwritten with its merged version, so re-running this cell
# merges the metadata a second time.
leak = pd.merge(leak, building_metadata, how="left", on=["building_id"])
if not NO_WEATHER:
    leak = leak.merge(weather_all, how="left", on=["timestamp","site_id"])

In [51]:
# The key name is fixed and does not depend on NO_WEATHER.
# NOTE(review): the key says "scaled_weather" - confirm the weather columns
# really were scaled when this file was written.
leak.to_hdf("data/leak_data.h5", key="leak_data_scaled_weather")

***
## Validation schemes 

### custom validation data - by week - with Halton sampling

In [30]:
n_folds = 4
# Only the series identifiers and the timestamp are needed to build the folds.
_train_data = train_data[["building_id", "meter", "timestamp"]]
# One row per distinct (building_id, meter) series.
ts_uid_values = _train_data[["building_id", "meter"]].drop_duplicates()

In [31]:
# valid_indexes[k] collects, series by series, the row labels assigned to fold k.
valid_indexes = [[] for _ in range(n_folds)]
# One generator is shared by all series, so the sequence continues from one
# series to the next.
generator = ghalton.Halton(1)

for _,row in tqdm(ts_uid_values.iterrows()):
    ts = (_train_data
          .query("building_id == @row.building_id & meter == @row.meter")
          .assign(week=lambda frame: frame.timestamp.dt.week))
    weeks = ts.week.unique()

    # One draw per week; ranking the draws turns them into positions
    # 0..n_weeks-1 (a permutation as long as the draws are distinct).
    draws = np.asarray(generator.get(len(weeks)))[:,0]
    week_positions = rankdata(draws).astype(int)-1

    # Contiguous chunks of the ranked positions define the folds.
    for fold_rows, weeks_idx in zip(valid_indexes, np.array_split(week_positions, n_folds)):
        weeks_by_fold = weeks[weeks_idx]
        fold_rows.append(ts.query("week in @weeks_by_fold").index.values)

2380it [03:03, 12.95it/s]


In [33]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
with h5py.File("data/valid_sm_custom_4fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

***
### custom validation data - by week - with shuffle

In [45]:
# The fold assignment is reproducible only if the loop cell runs right after
# this seed is set.
np.random.seed(23)
n_folds = 4
# Only the series identifiers and the timestamp are needed to build the folds.
_train_data = train_data[["building_id", "meter", "timestamp"]]
# One row per distinct (building_id, meter) series.
ts_uid_values = _train_data[["building_id", "meter"]].drop_duplicates()

In [46]:
# valid_indexes[k] collects, series by series, the row labels assigned to fold k.
valid_indexes = [[] for _ in range(n_folds)]

for _,row in tqdm(ts_uid_values.iterrows()):
    ts = (_train_data
          .query("building_id == @row.building_id & meter == @row.meter")
          .assign(week=lambda frame: frame.timestamp.dt.week))
    weeks = ts.week.unique()
    # First shuffle decides which weeks share a chunk, second shuffle decides
    # which fold each chunk goes to.
    np.random.shuffle(weeks)
    weeks_split = np.array_split(weeks, n_folds)
    np.random.shuffle(weeks_split)

    for fold_rows, weeks_by_fold in zip(valid_indexes, weeks_split):
        fold_rows.append(ts.query("week in @weeks_by_fold").index.values)

2380it [04:49,  8.22it/s]


In [47]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
with h5py.File("data/valid_sm_custom1_4fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

***
### custom validation data - by day of year - with shuffle

In [86]:
# The fold assignment is reproducible only if the loop cell runs right after
# this seed is set.
np.random.seed(23)
n_folds = 4
# Only the series identifiers and the timestamp are needed to build the folds.
_train_data = train_data[["building_id", "meter", "timestamp"]]
# One row per distinct (building_id, meter) series.
ts_uid_values = _train_data[["building_id", "meter"]].drop_duplicates()

In [87]:
# valid_indexes[k] collects, series by series, the row labels assigned to fold k.
valid_indexes = [[] for _ in range(n_folds)]

for _,row in tqdm(ts_uid_values.iterrows()):
    ts = (_train_data
          .query("building_id == @row.building_id & meter == @row.meter")
          .assign(dayofyear=lambda frame: frame.timestamp.dt.dayofyear))
    days = ts.dayofyear.unique()
    # First shuffle decides which days share a chunk, second shuffle decides
    # which fold each chunk goes to.
    np.random.shuffle(days)
    days_split = np.array_split(days, n_folds)
    np.random.shuffle(days_split)

    for fold_rows, days_by_fold in zip(valid_indexes, days_split):
        fold_rows.append(ts.query("dayofyear in @days_by_fold").index.values)

2380it [03:49, 10.35it/s]


In [88]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
with h5py.File("data/valid_sm_custom2_4fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

***
### custom validation data - by day of year - stratified by month - with shuffle

In [46]:
# The fold assignment is reproducible only if the loop cell runs right after
# this seed is set.
np.random.seed(19)
n_folds = 3
# Only the series identifiers and the timestamp are needed to build the folds.
_train_data = train_data[["building_id", "meter", "timestamp"]]
# One row per distinct (building_id, meter) series.
ts_uid_values = _train_data[["building_id", "meter"]].drop_duplicates()

In [82]:
# valid_indexes[k] collects the row labels assigned to fold k, one array per
# (series, month) combination.
valid_indexes = [[] for _ in range(n_folds)]

for _,row in tqdm(ts_uid_values.iterrows()):
    ts = (_train_data
          .query("building_id == @row.building_id & meter == @row.meter")
          .assign(dayofyear=lambda frame: frame.timestamp.dt.dayofyear,
                  month=lambda frame: frame.timestamp.dt.month))

    # Days are split within each month, so every fold gets days from every
    # month the series covers.
    for month in ts.month.unique():
        ts_cut = ts.query("month == @month")
        days = ts_cut.dayofyear.unique()
        np.random.shuffle(days)
        days_split = np.array_split(days, n_folds)
        np.random.shuffle(days_split)

        for fold_rows, days_by_fold in zip(valid_indexes, days_split):
            fold_rows.append(ts_cut.query("dayofyear in @days_by_fold").index.values)

2380it [08:46,  4.52it/s]


In [83]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
# NOTE(review): other cells in this notebook write to the same
# n_folds-dependent path with mode "w", which replaces an existing file.
with h5py.File(f"data/valid_sm_custom_{n_folds}fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

***
### custom validation data - stratified by month - splitting by dayofyear with shuffle

In [47]:
# The fold assignment is reproducible only if the loop cell runs right after
# this seed is set.
np.random.seed(19)
n_folds = 3
# Work on a copy of the training frame extended with the calendar fields
# used for the split; train_data itself is left unchanged.
_train_data = train_data.assign(year_day=train_data.timestamp.dt.dayofyear,
                                month=train_data.timestamp.dt.month)

In [48]:
# valid_indexes[k] collects the row labels assigned to fold k, one array per month.
valid_indexes = [[] for _ in range(n_folds)]

# The days are split across the whole dataset, not per series: all rows of
# a given day land in the same fold.
for month in _train_data.month.unique():
    month_rows = _train_data.query("month == @month")
    days = month_rows.year_day.unique()
    # First shuffle decides which days share a chunk, second shuffle decides
    # which fold each chunk goes to.
    np.random.shuffle(days)
    days_split = np.array_split(days, n_folds)
    np.random.shuffle(days_split)

    for fold_rows, days_by_fold in zip(valid_indexes, days_split):
        fold_rows.append(month_rows.query("year_day in @days_by_fold").index.values)

In [49]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
# NOTE(review): other cells in this notebook write to the same
# n_folds-dependent path with mode "w", which replaces an existing file.
with h5py.File(f"data/valid_sm_custom_{n_folds}fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

In [50]:
# Collapse each of the three folds from a list of arrays into one flat array.
# Must run only once per fold build: the folds are replaced in place.
for fold in range(3):
    valid_indexes[fold] = np.concatenate(valid_indexes[fold])

In [51]:
set(valid_indexes[1]) & set(valid_indexes[2])

set()

In [52]:
len(valid_indexes[0]) + len(valid_indexes[1]) + len(valid_indexes[2])

19866224

In [53]:
len(train_data)

19866224

***
### custom validation data - stratified by quarter - sampling by week with shuffle

In [136]:
# The fold assignment is reproducible only if the loop cell runs right after
# this seed is set.
np.random.seed(2)
n_folds = 3
# Work on a copy of the training frame extended with the calendar fields
# used for the split; train_data itself is left unchanged.
_train_data = train_data.assign(quarter=train_data.timestamp.dt.quarter,
                                week=train_data.timestamp.dt.week)

In [137]:
# valid_indexes[k] collects the row labels assigned to fold k, one array per quarter.
valid_indexes = [[] for _ in range(n_folds)]

# Weeks are split within each quarter, across the whole dataset: all rows of
# a given (quarter, week) land in the same fold.
for quarter in _train_data.quarter.unique():
    quarter_rows = _train_data.query("quarter == @quarter")
    weeks = quarter_rows.week.unique()
    # First shuffle decides which weeks share a chunk, second shuffle decides
    # which fold each chunk goes to.
    np.random.shuffle(weeks)
    weeks_split = np.array_split(weeks, n_folds)
    np.random.shuffle(weeks_split)

    for fold_rows, weeks_by_fold in zip(valid_indexes, weeks_split):
        fold_rows.append(quarter_rows.query("week in @weeks_by_fold").index.values)

In [138]:
# Write one dataset per fold ("fold0", "fold1", ...) holding that fold's row
# labels. The context manager closes the file even if a write fails.
# NOTE(review): other cells in this notebook write to the same
# n_folds-dependent path with mode "w", which replaces an existing file.
with h5py.File(f"data/valid_sm_custom_{n_folds}fold.h5", "w") as h5f:
    for i,valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=np.concatenate(valid_index))

In [139]:
# Collapse each of the three folds from a list of arrays into one flat array.
# Must run only once per fold build: the folds are replaced in place.
for fold in range(3):
    valid_indexes[fold] = np.concatenate(valid_indexes[fold])

In [140]:
len(valid_indexes[0])

6577467

In [141]:
len(valid_indexes[1])

6293506

In [142]:
len(valid_indexes[2])

6995251

***
## target scalers

In [27]:
# One row per distinct (building_id, meter) series.
ts_uid_values = train_data[["building_id", "meter"]].drop_duplicates()
# Accumulates (building_id, meter, center, scale) for every series.
scaling_values = []

for _,row in tqdm(ts_uid_values.iterrows()):
    ts = train_data.query("building_id == @row.building_id & meter == @row.meter")
    # RobustScaler expects a 2-D array, hence the single-column reshape.
    readings = ts.meter_reading.values.reshape((-1,1))
    scaler = RobustScaler(with_centering=True, with_scaling=True).fit(readings)
    scaling_values.append((row.building_id, row.meter, scaler.center_[0], scaler.scale_[0]))

2380it [05:03,  7.83it/s]


In [29]:
# Gather the per-series scaling parameters into one table (one row per
# building_id/meter pair) and write it to CSV without the index column.
robust_scaler = pd.DataFrame(scaling_values, columns=["building_id", "meter", "center", "scale"])
robust_scaler.to_csv("data/robust_scaler.csv", index=False)

***