## Divide and Conquer
This notebook is to explore features and optimize models for each site_id. The idea is to resolve some data discrepancies that are present by dividing the data rather than cleaning.   

Note that this is just another approach, need not necessarily be better or worse, but probably can add some value to ensembles irrespective of its CV or public LB scores.

In [198]:
import gc
import os

import lightgbm as lgb
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

from tqdm.notebook import tqdm
import datetime
import warnings
warnings.filterwarnings('ignore')

path_data = "./ashrae-energy-prediction/"
path_train = path_data + "train.csv"
path_test = path_data + "test.csv"
path_building = path_data + "building_metadata.csv"
path_weather_train = path_data + "weather_train.csv"
path_weather_test = path_data + "weather_test.csv"

myfavouritenumber = 38
seed = myfavouritenumber

## Preparing data
There are two files with features that need to be merged with the data. One is building metadata that has information on the buildings and the other is weather data that has information on the weather.

In [4]:
## Memory Optimization

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
def target_encoding()

In [40]:
df_train.groupby(['meter', 'primary_use'])['meter_reading'].agg(['count', 'mean', 'median']).reset_index()

Unnamed: 0,meter,primary_use,count,mean,median
0,0,Education,4499129,250.736974,97.7212
1,0,Entertainment/public assembly,1494423,117.497227,32.2
2,0,Food sales and service,43826,94.794985,72.0
3,0,Healthcare,184345,297.179204,164.771
4,0,Lodging/residential,1137714,117.234521,66.05
5,0,Manufacturing/industrial,85216,90.659198,52.059
6,0,Office,2230893,163.042765,68.0
7,0,Other,195256,74.837913,13.2
8,0,Parking,160375,68.409131,54.01
9,0,Public services,1320045,98.712629,36.74


In [249]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

building = pd.read_csv(path_building)
le = LabelEncoder()
# primary_useのLabelEncoding→あとでTargetEncodingに
building.primary_use = le.fit_transform(building.primary_use)
#building['age'] = building['year_built'].max() - building['year_built'] + 1

weather_train = pd.read_csv(path_weather_train)
weather_test = pd.read_csv(path_weather_test)

# これらの特徴量は使わない
weather_train.drop(["sea_level_pressure", "wind_direction", "wind_speed"], axis=1, inplace=True)
weather_test.drop(["sea_level_pressure", "wind_direction", "wind_speed"], axis=1, inplace=True)

# site_idでgroup_by、interpolation補間
weather_train = weather_train.groupby("site_id").apply(lambda group: group.interpolate(limit_direction="both"))
weather_test = weather_test.groupby("site_id").apply(lambda group: group.interpolate(limit_direction="both"))

# trainとbuilding_metadataをmerge
df_train = df_train.merge(building, on="building_id")
# さらにweather_trainをmerge
df_train = df_train.merge(weather_train, on=["site_id", "timestamp"], how="left")
# 多分外れ値除去
df_train = df_train[~((df_train.site_id==0) & (df_train.meter==0) & (df_train.building_id <= 104) & (df_train.timestamp < "2016-05-21"))]
df_train.reset_index(drop=True, inplace=True)
# datetime型に変換
df_train.timestamp = pd.to_datetime(df_train.timestamp, format='%Y-%m-%d %H:%M:%S')
# targetをlog変換
df_train["log_meter_reading"] = np.log1p(df_train.meter_reading)

# testとbuilding_metadataをmerge
df_test = df_test.merge(building, on="building_id")
# さらにweather_testをmerge
df_test = df_test.merge(weather_test, on=["site_id", "timestamp"], how="left")
df_test.reset_index(drop=True, inplace=True)
# datetime型に
df_test.timestamp = pd.to_datetime(df_test.timestamp, format='%Y-%m-%d %H:%M:%S')

del building, le
gc.collect()

0

In [250]:
df_train = reduce_mem_usage(df_train, use_float16=True)
df_test = reduce_mem_usage(df_test, use_float16=True)

# weatherのほうもdatetime型に変換
weather_train.timestamp = pd.to_datetime(weather_train.timestamp, format='%Y-%m-%d %H:%M:%S')
weather_test.timestamp = pd.to_datetime(weather_test.timestamp, format='%Y-%m-%d %H:%M:%S')
weather_train = reduce_mem_usage(weather_train, use_float16=True)
weather_test = reduce_mem_usage(weather_test, use_float16=True)

Memory usage of dataframe is 2122.08 MB
Memory usage after optimization is: 663.15 MB
Decreased by 68.7%
Memory usage of dataframe is 4135.66 MB
Memory usage after optimization is: 1312.28 MB
Decreased by 68.3%
Memory usage of dataframe is 6.40 MB
Memory usage after optimization is: 2.27 MB
Decreased by 64.6%
Memory usage of dataframe is 12.69 MB
Memory usage after optimization is: 4.49 MB
Decreased by 64.6%


## Feature Engineering: Time
Creating time-based features.

In [251]:
# hourと曜日 (weekday) と月を追加
df_train["hour"] = df_train.timestamp.dt.hour
df_train["weekday"] = df_train.timestamp.dt.weekday
#df_train["month"] = df_train.timestamp.dt.month

df_test["hour"] = df_test.timestamp.dt.hour
df_test["weekday"] = df_test.timestamp.dt.weekday
#df_test["month"] = df_test.timestamp.dt.month

## Feature Engineering: Aggregation
Creating aggregate features for buildings at various levels.

In [252]:
# 建物、meterごとにmeter_readingの平均値と中央値を追加
df_building_meter = df_train.groupby(["building_id", "meter"]).agg(mean_building_meter=("log_meter_reading", "mean"),
                                                             median_building_meter=("log_meter_reading", "median")).reset_index()

df_train = df_train.merge(df_building_meter, on=["building_id", "meter"])
df_test = df_test.merge(df_building_meter, on=["building_id", "meter"])

# 建物、meter、時間ごとにmeter_readingの平均値と中央値を追加
df_building_meter_hour = df_train.groupby(["building_id", "meter", "hour"]).agg(mean_building_meter=("log_meter_reading", "mean"),
                                                                                median_building_meter=("log_meter_reading", "median")).reset_index()

df_train = df_train.merge(df_building_meter_hour, on=["building_id", "meter", "hour"])
df_test = df_test.merge(df_building_meter_hour, on=["building_id", "meter", "hour"])

## Feature Engineering: Lags
Creating lag-based features. These are statistics of available features looking back in time by fixed intervals.   
These features are created in the weather data itself and then merged with the train and test data.

In [216]:
def create_lag_features(df, window):
    """
    Creating lag-based features looking back in time.
    """
    # 気温、雲の割合、湿度、降水量/hについて
    feature_cols = ["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"]
    # site_idでgroupby
    df_site = df.groupby("site_id")
    
    # windowごとに（欠損値なしで）まとめる
    df_rolled = df_site[feature_cols].rolling(window=window, min_periods=0)
    
    # windowごとの平均値、中央値、最小値、最大値、標準偏差、歪度（分布の非対称性）
    df_mean = df_rolled.mean().reset_index().astype(np.float16)
    df_median = df_rolled.median().reset_index().astype(np.float16)
    df_min = df_rolled.min().reset_index().astype(np.float16)
    df_max = df_rolled.max().reset_index().astype(np.float16)
    df_std = df_rolled.std().reset_index().astype(np.float16)
    df_skew = df_rolled.skew().reset_index().astype(np.float16)
    
    # 各特徴量について、上記の値を追加
    for feature in feature_cols:
        df[f"{feature}_mean_lag{window}"] = df_mean[feature]
        df[f"{feature}_median_lag{window}"] = df_median[feature]
        df[f"{feature}_min_lag{window}"] = df_min[feature]
        df[f"{feature}_max_lag{window}"] = df_max[feature]
        df[f"{feature}_std_lag{window}"] = df_std[feature]
        df[f"{feature}_skew_lag{window}"] = df_std[feature]
        
    return df

## Features
Creating and selecting all the features.

In [253]:
weather_train = create_lag_features(weather_train, 18)
# rolling前の数値は除去
weather_train.drop(["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"], axis=1, inplace=True)

df_train = df_train.merge(weather_train, on=["site_id", "timestamp"], how="left")

del weather_train
gc.collect()

38

In [254]:
# categorical変数
categorical_features = [
    "building_id",
    "primary_use",
    "meter",
    "weekday",
    "hour"
]

all_features = [col for col in df_train.columns if col not in ["timestamp", "site_id", "meter_reading", "log_meter_reading"]]

## KFold Cross Validation with LGBM
Since the test data is out of time and longer than train data, creating a reliable validation strategy is going to be a major challenge. Just using a simple KFold CV here.

The folds are applied to each site individually, thus building 16 sites x 3 folds = 48 models in total.

In [258]:
cv = 5
models = {}
cv_scores = {"(site_id, meter)": [], "cv_score": []}

for site_id in tqdm(range(16), desc="site_id"):
    for meter in range(4):
        print(cv, "fold CV for site_id:", site_id, "for meter:", meter)
        kf = TimeSeriesSplit(n_splits=cv)
        # modelを16×4個(site_id×meter分)作る
        #models[(site_id, meter)] = []

        # あるsite_id(0~15)、meter(0~3)のデータセットについて
        X_train_site = df_train[(df_train.site_id==site_id) & (df_train.meter==meter)].reset_index(drop=True)
        if X_train_site.shape[0] == 0:
            continue
        y_train_site = X_train_site.log_meter_reading
        y_pred_train_site = np.zeros(X_train_site.shape[0])
    
        score = 0

        for fold, (train_index, valid_index) in enumerate(kf.split(X_train_site, y_train_site)):
            X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
            y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

            dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
            dvalid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=categorical_features)

            watchlist = [dtrain, dvalid]

            params = {"objective": "regression",
                      "num_leaves": 41,
                      "learning_rate": 0.049,
                      "bagging_freq": 5,
                      "bagging_fraction": 0.51,
                      "feature_fraction": 0.81,
                      "metric": "rmse"
                      }

            model_lgb = lgb.train(params, train_set=dtrain, num_boost_round=1500, valid_sets=watchlist, verbose_eval=100, early_stopping_rounds=20)
            #models[(site_id, meter)].append(model_lgb)

            y_pred_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
            y_pred_train_site[valid_index] = y_pred_valid

            rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
            print("Site Id:", site_id, ", Meter:", meter, ", Fold:", fold+1, ", RMSE:", rmse)
            score += rmse / cv
        
            gc.collect()
        
        model_all = lgb.LGBMRegressor(**params, num_boost_round=int(np.floor(model_lgb.best_iteration * 1.25)))
        model_all.fit(X_train_site.loc[:, all_features], y_train_site, categorical_feature=categorical_features)
        models[(site_id, meter)] = model_all
        
        cv_scores["(site_id, meter)"].append((site_id, meter))
        cv_scores["cv_score"].append(score)
        
        print("\nSite Id:", site_id, ", Meter:", meter, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

5 fold CV for site_id: 0 for meter: 0
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[59]	training's rmse: 0.205353	valid_1's rmse: 0.305764
Site Id: 0 , Meter: 0 , Fold: 1 , RMSE: 0.3057636874674805
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.185798	valid_1's rmse: 0.47988
Early stopping, best iteration is:
[135]	training's rmse: 0.174737	valid_1's rmse: 0.476438
Site Id: 0 , Meter: 0 , Fold: 2 , RMSE: 0.4764972490205646
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[55]	training's rmse: 0.250755	valid_1's rmse: 0.525424
Site Id: 0 , Meter: 0 , Fold: 3 , RMSE: 0.5254235646056596
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.282266	valid_1's rmse: 0.292107
Early stopping, best iteration is:
[114]	training's rmse: 0.27542	valid_1's rmse: 0.291734
Site Id: 0 , Meter: 0 , Fold: 4 , RMSE: 0.2912720438468358
Trai

[70]	training's rmse: 0.855302	valid_1's rmse: 1.30725
Site Id: 2 , Meter: 1 , Fold: 5 , RMSE: 1.22777309803395

Site Id: 2 , Meter: 1 , CV RMSE: 2.3092541099784385 

5 fold CV for site_id: 2 for meter: 2
5 fold CV for site_id: 2 for meter: 3
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.548327	valid_1's rmse: 0.885618
Early stopping, best iteration is:
[107]	training's rmse: 0.542304	valid_1's rmse: 0.884732
Site Id: 2 , Meter: 3 , Fold: 1 , RMSE: 0.9246840884059377
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.605004	valid_1's rmse: 0.80957
[200]	training's rmse: 0.549381	valid_1's rmse: 0.797302
Early stopping, best iteration is:
[267]	training's rmse: 0.523309	valid_1's rmse: 0.793026
Site Id: 2 , Meter: 3 , Fold: 2 , RMSE: 0.811821660054076
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.614596	valid_1's rmse: 1.04733
Early stopping, best iteration is:
[145]	traini

[170]	training's rmse: 0.147732	valid_1's rmse: 0.439747
Site Id: 6 , Meter: 0 , Fold: 2 , RMSE: 0.4387275297575619
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.197498	valid_1's rmse: 0.208703
Early stopping, best iteration is:
[131]	training's rmse: 0.187379	valid_1's rmse: 0.200431
Site Id: 6 , Meter: 0 , Fold: 3 , RMSE: 0.19917454854276964
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.186777	valid_1's rmse: 0.235237
[200]	training's rmse: 0.165779	valid_1's rmse: 0.227859
[300]	training's rmse: 0.153622	valid_1's rmse: 0.227096
Early stopping, best iteration is:
[298]	training's rmse: 0.153767	valid_1's rmse: 0.227065
Site Id: 6 , Meter: 0 , Fold: 4 , RMSE: 0.23061204287954445
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.180015	valid_1's rmse: 0.250552
[200]	training's rmse: 0.159062	valid_1's rmse: 0.245559
[300]	training's rmse: 0.147547	valid_1's rmse: 0.24418

[200]	training's rmse: 0.932641	valid_1's rmse: 0.980887
[300]	training's rmse: 0.867846	valid_1's rmse: 0.969369
[400]	training's rmse: 0.82196	valid_1's rmse: 0.963653
Early stopping, best iteration is:
[385]	training's rmse: 0.829892	valid_1's rmse: 0.963069
Site Id: 7 , Meter: 2 , Fold: 5 , RMSE: 1.0219807192715125

Site Id: 7 , Meter: 2 , CV RMSE: 2.5161641781278368 

5 fold CV for site_id: 7 for meter: 3
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.926485	valid_1's rmse: 1.74055
Early stopping, best iteration is:
[100]	training's rmse: 0.926485	valid_1's rmse: 1.74055
Site Id: 7 , Meter: 3 , Fold: 1 , RMSE: 1.740546648921632
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[60]	training's rmse: 1.20506	valid_1's rmse: 3.06103
Site Id: 7 , Meter: 3 , Fold: 2 , RMSE: 3.0610339118152563
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.24135	valid_1's rmse: 1.7

[250]	training's rmse: 0.861887	valid_1's rmse: 1.21106
Site Id: 9 , Meter: 2 , Fold: 1 , RMSE: 1.2015778230773384
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.01035	valid_1's rmse: 1.36232
[200]	training's rmse: 0.937633	valid_1's rmse: 1.33223
Early stopping, best iteration is:
[218]	training's rmse: 0.928929	valid_1's rmse: 1.3306
Site Id: 9 , Meter: 2 , Fold: 2 , RMSE: 1.3103625525282467
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.06996	valid_1's rmse: 1.37005
[200]	training's rmse: 0.993173	valid_1's rmse: 1.34148
[300]	training's rmse: 0.956719	valid_1's rmse: 1.33617
Early stopping, best iteration is:
[351]	training's rmse: 0.944997	valid_1's rmse: 1.33455
Site Id: 9 , Meter: 2 , Fold: 3 , RMSE: 1.3270670990728157
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.07612	valid_1's rmse: 1.25261
Early stopping, best iteration is:
[95]	training's rmse: 1.08214	valid

[500]	training's rmse: 0.198374	valid_1's rmse: 0.489722
[600]	training's rmse: 0.185518	valid_1's rmse: 0.478101
[700]	training's rmse: 0.174799	valid_1's rmse: 0.470773
[800]	training's rmse: 0.167462	valid_1's rmse: 0.462489
Early stopping, best iteration is:
[814]	training's rmse: 0.16592	valid_1's rmse: 0.462479
Site Id: 11 , Meter: 1 , Fold: 2 , RMSE: 0.4624789203321367
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[33]	training's rmse: 0.714146	valid_1's rmse: 1.47487
Site Id: 11 , Meter: 1 , Fold: 3 , RMSE: 1.4748749629736597
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.488317	valid_1's rmse: 1.05228
Early stopping, best iteration is:
[101]	training's rmse: 0.48656	valid_1's rmse: 1.05005
Site Id: 11 , Meter: 1 , Fold: 4 , RMSE: 1.0349950249901307
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.554751	valid_1's rmse: 0.956391
Early stopping, best iter

Site Id: 13 , Meter: 1 , CV RMSE: 2.0659140687150677 

5 fold CV for site_id: 13 for meter: 2
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.06407	valid_1's rmse: 1.80798
[200]	training's rmse: 0.941768	valid_1's rmse: 1.80048
Early stopping, best iteration is:
[204]	training's rmse: 0.938925	valid_1's rmse: 1.80022
Site Id: 13 , Meter: 2 , Fold: 1 , RMSE: 1.7966483914952045
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.17322	valid_1's rmse: 1.54888
Early stopping, best iteration is:
[157]	training's rmse: 1.10579	valid_1's rmse: 1.54183
Site Id: 13 , Meter: 2 , Fold: 2 , RMSE: 1.6960838251309072
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 1.15012	valid_1's rmse: 1.51897
Early stopping, best iteration is:
[169]	training's rmse: 1.0774	valid_1's rmse: 1.51041
Site Id: 13 , Meter: 2 , Fold: 3 , RMSE: 1.5227447130377427
Training until validation scores don't improve for 2

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[69]	training's rmse: 0.276767	valid_1's rmse: 0.179061
Site Id: 15 , Meter: 0 , Fold: 2 , RMSE: 0.1795350727672336
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.225107	valid_1's rmse: 0.302003
Early stopping, best iteration is:
[179]	training's rmse: 0.19533	valid_1's rmse: 0.298417
Site Id: 15 , Meter: 0 , Fold: 3 , RMSE: 0.29798374852354637
Training until validation scores don't improve for 20 rounds
[100]	training's rmse: 0.240324	valid_1's rmse: 0.300227
[200]	training's rmse: 0.203353	valid_1's rmse: 0.291576
[300]	training's rmse: 0.184526	valid_1's rmse: 0.288553
[400]	training's rmse: 0.174471	valid_1's rmse: 0.28757
[500]	training's rmse: 0.166949	valid_1's rmse: 0.286906
Early stopping, best iteration is:
[492]	training's rmse: 0.167564	valid_1's rmse: 0.286838
Site Id: 15 , Meter: 0 , Fold: 4 , RMSE: 0.28887365511180213
Training until valid

In [259]:
pd.DataFrame.from_dict(cv_scores)

Unnamed: 0,"(site_id, meter)",cv_score
0,"(0, 0)",0.410399
1,"(0, 1)",1.623438
2,"(1, 0)",0.479225
3,"(1, 3)",1.943374
4,"(2, 0)",0.479664
5,"(2, 1)",1.154631
6,"(2, 3)",0.956113
7,"(3, 0)",0.408615
8,"(4, 0)",0.260476
9,"(5, 0)",0.67451


In [199]:
pd.DataFrame.from_dict(cv_scores)

Unnamed: 0,"(site_id, meter)",cv_score
0,"(0, 0)",0.410399
1,"(0, 1)",1.623438
2,"(1, 0)",0.479225
3,"(1, 3)",1.943374
4,"(2, 0)",0.479664
5,"(2, 1)",1.154631
6,"(2, 3)",0.956113
7,"(3, 0)",0.408615
8,"(4, 0)",0.260476
9,"(5, 0)",0.675664


In [200]:
del df_train, X_train_site, y_train_site, X_train, y_train, dtrain, X_valid, y_valid, dvalid, y_pred_train_site, y_pred_valid, rmse, score, cv_scores
gc.collect()

85

## Scoring on test data
The test data for each site is scored individually using the 3 models, one from each fold. The final prediction is the average of the 3 models.

In [221]:
weather_test = create_lag_features(weather_test, 18)
weather_test.drop(["air_temperature", "cloud_coverage", "dew_temperature", "precip_depth_1_hr"], axis=1, inplace=True)

In [264]:
df_test_sites = []

for site_id in tqdm(range(16), desc="site_id"):
    for meter in range(4):
        print("Preparing test data for (site_id, meter)", (site_id, meter))

        X_test_site = df_test[(df_test.site_id==site_id) & (df_test.meter==meter)]
        if X_test_site.shape[0] == 0:
            continue
        weather_test_site = weather_test[weather_test.site_id==site_id]
    
        X_test_site = X_test_site.merge(weather_test_site, on=["site_id", "timestamp"], how="left")
    
        row_ids_site = X_test_site.row_id

        X_test_site = X_test_site.loc[:, all_features]
        y_pred_test_site = np.zeros(X_test_site.shape[0])

        print("Scoring for site_id", site_id, "for meter", meter)    
        """
        for fold in range(cv):
            model_lgb = models[(site_id, meter)][fold]
            y_pred_test_site += model_lgb.predict(X_test_site, num_iteration=model_lgb.best_iteration) / cv
            gc.collect()
        """
        model_lgb = models[(site_id, meter)]
        y_pred_test_site = model_lgb.predict(X_test_site)
        
        df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
        df_test_sites.append(df_test_site)
    
        print("Scoring for site_id", site_id, "for meter", meter, "completed\n")
        gc.collect()

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

Preparing test data for (site_id, meter) (0, 0)
Scoring for site_id 0 for meter 0
Scoring for site_id 0 for meter 0 completed

Preparing test data for (site_id, meter) (0, 1)
Scoring for site_id 0 for meter 1
Scoring for site_id 0 for meter 1 completed

Preparing test data for (site_id, meter) (0, 2)
Preparing test data for (site_id, meter) (0, 3)
Preparing test data for (site_id, meter) (1, 0)
Scoring for site_id 1 for meter 0
Scoring for site_id 1 for meter 0 completed

Preparing test data for (site_id, meter) (1, 1)
Preparing test data for (site_id, meter) (1, 2)
Preparing test data for (site_id, meter) (1, 3)
Scoring for site_id 1 for meter 3
Scoring for site_id 1 for meter 3 completed

Preparing test data for (site_id, meter) (2, 0)
Scoring for site_id 2 for meter 0
Scoring for site_id 2 for meter 0 completed

Preparing test data for (site_id, meter) (2, 1)
Scoring for site_id 2 for meter 1
Scoring for site_id 2 for meter 1 completed

Preparing test data for (site_id, meter) (2, 2

## Submission
Preparing final file for submission.

In [235]:
today = str(datetime.date.today())

In [265]:
submit = pd.concat(df_test_sites)
submit.meter_reading = np.clip(np.expm1(submit.meter_reading), 0, a_max=None)
submit = submit.sort_values('row_id')
submit.to_csv(f"submission_noleak_{today}_2.csv", index=False)

In [209]:
## adding leak
leak0 = pd.read_csv("./ashrae-leak-data/site0.csv")
leak1 = pd.read_csv("./ashrae-leak-data/site1.csv")
leak2 = pd.read_csv("./ashrae-leak-data/site2.csv")
leak4 = pd.read_csv("./ashrae-leak-data/site4.csv")
leak15 = pd.read_csv("./ashrae-leak-data/site15.csv")

leak = pd.concat([leak0, leak1, leak2, leak4, leak15])

del leak0, leak1, leak2, leak4, leak15
gc.collect()

test = pd.read_csv(path_test)
test = test[test.building_id.isin(leak.building_id.unique())]

leak = leak.merge(test, on=["building_id", "meter", "timestamp"])

del test
gc.collect()

submit = submit.merge(leak[["row_id", "meter_reading_scraped"]], on=["row_id"], how="left")
submit.loc[submit.meter_reading_scraped.notnull(), "meter_reading"] = submit.loc[submit.meter_reading_scraped.notnull(), "meter_reading_scraped"] 
submit.drop(["meter_reading_scraped"], axis=1, inplace=True)

submit.to_csv(f"submission_{today}.csv", index=False)