In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from scripts.utils import reduce_mem_usage
from sklearn.model_selection import StratifiedKFold
import h5py
import ghalton
from scipy.stats import rankdata

In [2]:
# Raw meter readings; `timestamp` is parsed to datetime64 at load time.
train = pd.read_csv(
    "data/train.csv",
    parse_dates=["timestamp"],
)

In [3]:
# Show column dtypes, row count and memory footprint of the raw readings.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
building_id      int64
meter            int64
timestamp        datetime64[ns]
meter_reading    float64
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 616.9 MB


In [4]:
# Per-building static attributes (site, primary use, square footage, ...).
building_metadata = pd.read_csv(
    "data/building_metadata.csv",
)

In [5]:
# Show dtypes and non-null counts to spot sparsely populated metadata columns.
building_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
site_id        1449 non-null int64
building_id    1449 non-null int64
primary_use    1449 non-null object
square_feet    1449 non-null int64
year_built     675 non-null float64
floor_count    355 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 68.0+ KB


In [6]:
# Count the buildings that have no construction year recorded.
print(f"Missing values in year_built: {building_metadata['year_built'].isna().sum()}")

Missing values in year_built: 774


In [7]:
# Count the buildings that have no floor count recorded.
print(f"Missing values in floor_count: {building_metadata['floor_count'].isna().sum()}")

Missing values in floor_count: 1094


In [8]:
# Both columns are missing for more than half of the buildings (see the counts
# printed above), so they are dropped rather than imputed.
building_metadata = building_metadata.drop(columns=["year_built", "floor_count"])

In [9]:
# Per-site weather observations for the training period.
weather_train = pd.read_csv(
    "data/weather_train.csv",
    parse_dates=["timestamp"],
)

In [10]:
# Show dtypes and non-null counts of the weather columns.
weather_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
site_id               139773 non-null int64
timestamp             139773 non-null datetime64[ns]
air_temperature       139718 non-null float64
cloud_coverage        70600 non-null float64
dew_temperature       139660 non-null float64
precip_depth_1_hr     89484 non-null float64
sea_level_pressure    129155 non-null float64
wind_direction        133505 non-null float64
wind_speed            139469 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 9.6 MB


In [11]:
# Count the weather rows that have no cloud coverage reading.
print(f"Missing values in cloud_coverage: {weather_train['cloud_coverage'].isna().sum()}")

Missing values in cloud_coverage: 69173


In [12]:
# Roughly half of the cloud_coverage values are missing (see the count printed
# above), so the column is dropped.
weather_train = weather_train.drop(columns=["cloud_coverage"])

***
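Merges the meter readings with the building metadata (on `building_id`) and then with the weather data (on `timestamp` and `site_id`).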

In [13]:
# Attach building attributes and then the site weather to every meter reading.
# Both joins are left joins, so every reading is kept even when no metadata or
# weather row matches. `validate="many_to_one"` makes pandas raise a MergeError
# if the right-hand table has duplicated keys, instead of silently multiplying
# reading rows.
train_data = (
    train
    .merge(building_metadata, how="left", on=["building_id"], validate="many_to_one")
    .merge(weather_train, how="left", on=["timestamp", "site_id"], validate="many_to_one")
)

***
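Removes anomalous behavior for `site_id=0`: for every (`building_id`, `meter`) series whose readings before `2016-05-20 18:00:00` have a 75th percentile equal to 0, the rows before that timestamp are dropped.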

In [14]:
# Site 0 anomaly filter.
# For each (building_id, meter) series of site 0, look at the readings strictly
# before the cutoff; if their 75th percentile is exactly 0, that early segment
# is dropped. The per-series quantile is computed in one groupby pass instead
# of one full-frame query per series.
site0_cutoff = pd.Timestamp("2016-05-20 18:00:00")
train_data_cut = train_data.query("site_id == 0")
site0_early = train_data_cut.loc[train_data_cut["timestamp"] < site0_cutoff]

# Broadcast each series' early 75th percentile back onto its own rows.
early_q75 = (
    site0_early
    .groupby(["building_id", "meter"])["meter_reading"]
    .transform(lambda readings: readings.quantile(0.75))
)

# Index labels of the early rows that belong to a flagged series. These are
# labels of `train_data`, since `train_data_cut` keeps the original index.
delete_idx = site0_early.index[(early_q75 == 0).values]

train_data = train_data.drop(index=delete_idx).reset_index(drop=True)

In [15]:
# Check the row count and columns after the site 0 filter.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19871167 entries, 0 to 19871166
Data columns (total 13 columns):
building_id           int64
meter                 int64
timestamp             datetime64[ns]
meter_reading         float64
site_id               int64
primary_use           object
square_feet           int64
air_temperature       float64
dew_temperature       float64
precip_depth_1_hr     float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
dtypes: datetime64[ns](1), float64(7), int64(4), object(1)
memory usage: 1.9+ GB


***
Removes abnormal behavior for `building_id=363` (meter 0) before `2016-07-25`.

In [16]:
# Select the meter-0 readings of building 363 up to and including
# 2016-07-24 23:00, then drop them and renumber the rows.
is_b363_early = (
    (train_data["building_id"] == 363)
    & (train_data["meter"] == 0)
    & (train_data["timestamp"] <= pd.Timestamp("2016-07-24 23:00:00"))
)
idx = train_data.index[is_b363_early.values]
train_data = train_data.drop(index=idx).reset_index(drop=True)

In [17]:
# Check the row count after removing the building 363 segment.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19866224 entries, 0 to 19866223
Data columns (total 13 columns):
building_id           int64
meter                 int64
timestamp             datetime64[ns]
meter_reading         float64
site_id               int64
primary_use           object
square_feet           int64
air_temperature       float64
dew_temperature       float64
precip_depth_1_hr     float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
dtypes: datetime64[ns](1), float64(7), int64(4), object(1)
memory usage: 1.9+ GB


***

In [18]:
# Shrink the frame with the project helper from scripts.utils. The dtypes
# listed by the next cell show numeric columns downcast
# (e.g. float64 -> float16/float32, int64 -> int8/int16/int32).
# NOTE(review): float16 carries only about 3 significant decimal digits;
# confirm that is acceptable for columns such as sea_level_pressure.
train_data = reduce_mem_usage(train_data)

Mem. usage decreased to 757.84 Mb (61.5% reduction)


In [19]:
# Check the downcast dtypes and the new memory footprint.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19866224 entries, 0 to 19866223
Data columns (total 13 columns):
building_id           int16
meter                 int8
timestamp             datetime64[ns]
meter_reading         float32
site_id               int8
primary_use           object
square_feet           int32
air_temperature       float16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtypes: datetime64[ns](1), float16(6), float32(1), int16(1), int32(1), int8(2), object(1)
memory usage: 757.8+ MB


In [20]:
# Persist the cleaned training frame to HDF5 under the key "train_data".
train_data.to_hdf("data/train_data.h5", key="train_data")

***

In [20]:
# Rows to predict; `timestamp` is parsed to datetime64 at load time.
test = pd.read_csv(
    "data/test.csv",
    parse_dates=["timestamp"],
)

In [21]:
# Show column dtypes, row count and memory footprint of the test rows.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 4 columns):
row_id         int64
building_id    int64
meter          int64
timestamp      datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.2 GB


In [22]:
# Per-site weather observations for the test period.
weather_test = pd.read_csv(
    "data/weather_test.csv",
    parse_dates=["timestamp"],
)

In [23]:
# Keep the test weather columns aligned with the training weather, where
# cloud_coverage was dropped.
weather_test = weather_test.drop(columns=["cloud_coverage"])

***

In [24]:
# Attach building attributes and then the site weather to every test row, the
# same way the training frame is built. `validate="many_to_one"` makes pandas
# raise a MergeError if the right-hand table has duplicated keys; a silent row
# duplication here would break the one-row-per-row_id layout of the test set.
test_data = (
    test
    .merge(building_metadata, how="left", on=["building_id"], validate="many_to_one")
    .merge(weather_test, how="left", on=["timestamp", "site_id"], validate="many_to_one")
)

In [25]:
# Shrink the test frame with the same project helper (scripts.utils) that is
# applied to the training frame; the resulting dtypes are listed by the next cell.
test_data = reduce_mem_usage(test_data)

Mem. usage decreased to 1908.76 Mb (57.1% reduction)


In [26]:
# Check the downcast dtypes and the memory footprint of the merged test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 13 columns):
row_id                int32
building_id           int16
meter                 int8
timestamp             datetime64[ns]
site_id               int8
primary_use           object
square_feet           int32
air_temperature       float16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtypes: datetime64[ns](1), float16(6), int16(1), int32(2), int8(2), object(1)
memory usage: 1.9+ GB


In [27]:
# Persist the merged test frame to HDF5 under the key "test_data".
test_data.to_hdf("data/test_data.h5", key="test_data")

***
### validation data

In [29]:
# Four shuffled folds stratified on building_id, so each fold holds a similar
# share of every building's rows. Only the validation half of each
# (train, valid) pair is kept; the indexes are positional row numbers.
splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=23)
valid_indexes = [
    fold[1]
    for fold in splitter.split(train_data, train_data["building_id"])
]

In [40]:
# Store each fold's validation indexes as a dataset named fold0, fold1, ...
# The context manager closes the file even if a write fails part-way.
with h5py.File("data/valid_sm_skfold_4fold_shuffle.h5", "w") as h5f:
    for i, valid_index in enumerate(valid_indexes):
        h5f.create_dataset(f'fold{i}', data=valid_index)

***
### custom validation data

In [20]:
from tqdm import tqdm

In [23]:
# Number of validation folds for the week-based split.
n_folds = 4
# Only the series identifiers and the timestamp are needed to build the folds.
_train_data = train_data[["building_id", "meter", "timestamp"]]
# One row per distinct (building_id, meter) time series, in order of first appearance.
ts_uid_values = _train_data[["building_id", "meter"]].drop_duplicates()

In [None]:
# Week-based validation folds.
# For every (building_id, meter) series, the weeks present in the series are
# shuffled with a quasi-random Halton sequence and split into `n_folds` groups;
# fold i validates on the rows whose week falls in group i.
# `valid_indexes[i]` collects one array of row labels per series.
valid_indexes = [[] for _ in range(n_folds)]
# A single generator is shared by all series, so each series draws the next
# points of the sequence and gets its own week permutation.
generator = ghalton.Halton(1)

# One groupby pass replaces a full-frame query per series. sort=False keeps the
# series in order of first appearance (the order of `ts_uid_values`), so the
# Halton points are consumed in the same order as before.
series_groups = _train_data.groupby(["building_id", "meter"], sort=False)

for _, ts in tqdm(series_groups, total=series_groups.ngroups):
    # Kept as a standalone Series so no column is assigned onto a group slice.
    # NOTE(review): `.dt.week` is deprecated in pandas >= 1.1 and removed in 2.0;
    # on newer pandas use `ts["timestamp"].dt.isocalendar().week`.
    week_of_year = ts["timestamp"].dt.week
    weeks = week_of_year.unique()
    n_weeks = len(weeks)

    # Ranking the Halton points gives a permutation of 0..n_weeks-1.
    sequence = np.asarray(generator.get(n_weeks))[:, 0]
    idx = rankdata(sequence).astype(int) - 1

    for i, weeks_idx in enumerate(np.array_split(idx, n_folds)):
        weeks_by_fold = weeks[weeks_idx]
        in_fold = week_of_year.isin(weeks_by_fold).values
        valid_indexes[i].append(ts.index.values[in_fold])

13it [00:02,  4.82it/s]

***