# Random forest with Entity Embeddings: Training set

This kernel prepares the ASHRAE Energy Prediction dataset for training a Random Forest. It performs the same preprocessing as in [this kernel](https://www.kaggle.com/michelezoccali/ashrae-energy-prediction-single-lgbm), while substituting categorical features with the corresponding embedding vectors previously learned by a NN in https://www.kaggle.com/michelezoccali/ashrae-with-fast-ai-part-2.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import os
import datetime
import gc

In [2]:
# Root directory of the competition input files; the CSV loaders further down build their paths from it.
path = '../input/ashrae-energy-prediction'

# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/ashrae-energy-prediction/sample_submission.csv
../input/ashrae-energy-prediction/building_metadata.csv
../input/ashrae-energy-prediction/weather_train.csv
../input/ashrae-energy-prediction/weather_test.csv
../input/ashrae-energy-prediction/train.csv
../input/ashrae-energy-prediction/test.csv


# Utilities

Memory reduction adapted from [this kernel](https://www.kaggle.com/purist1024/ashrae-simple-data-cleanup-lb-1-08-no-leaks/notebook).

In [3]:
def reduce_mem(df):
    """Return a copy of ``df`` with every column stored in a smaller dtype.

    Columns are handled according to the name of their dtype:

    * ``datetime*`` columns are left untouched;
    * ``object`` columns are replaced by their category codes, downcast to
      the smallest unsigned integer type that fits;
    * ``int*`` / ``uint*`` columns are downcast to unsigned when their
      minimum is non-negative, otherwise to the smallest signed integer;
    * everything else is passed through ``pd.to_numeric`` with
      ``downcast='float'``.

    The input frame itself is not modified.
    """
    out = df.copy()
    for name in out.columns:
        series = out[name]
        kind = series.dtype.name

        if kind.startswith("datetime"):
            continue

        if kind == "object":  # only object feature has low cardinality
            codes = series.astype("category").cat.codes
            out[name] = pd.to_numeric(codes, downcast="unsigned")
        elif kind.startswith(("int", "uint")):
            if series.min() >= 0:
                out[name] = pd.to_numeric(series, downcast="unsigned")
            else:
                out[name] = pd.to_numeric(series, downcast='integer')
        else:
            out[name] = pd.to_numeric(series, downcast='float')
    return out

Routine to add lag features to weather dataset, adapted from [this kernel](https://www.kaggle.com/corochann/ashrae-training-lgbm-by-meter-type/notebook).

In [4]:
def add_lag_features(weather_df, window=3):
    """Append per-site rolling min/mean/max weather features to ``weather_df``.

    For each of ``air_temperature``, ``cloud_coverage``, ``dew_temperature``
    and ``precip_depth_1_hr`` a rolling window of ``window`` consecutive rows
    is evaluated separately inside every ``site_id`` group. Three float16
    columns per feature are added, named ``{col}_min_lag{window}``,
    ``{col}_mean_lag{window}`` and ``{col}_max_lag{window}``.

    Rows are windowed in the order in which they appear in ``weather_df``;
    no sorting by timestamp is done here, so the caller is expected to pass
    rows that are already chronologically ordered within each site.

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Weather table containing ``site_id`` and the four feature columns.
        Its index labels must be unique so the rolled values can be aligned
        back onto the rows they were computed from.
    window : int, default 3
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``weather_df`` object, modified in place.
    """
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr']
    rolled = weather_df.groupby('site_id')[cols].rolling(window=window, min_periods=0)

    def _aligned(stat_df):
        # The grouped rolling result is indexed by (site_id, original label).
        # Dropping only the group level keeps the original row labels, so the
        # column assignment below lands on the rows the values came from even
        # when the frame is not sorted by site_id or has a non-default index.
        return stat_df.reset_index(level=0, drop=True).astype(np.float16)

    lag_mean = _aligned(rolled.mean())
    lag_max = _aligned(rolled.max())
    lag_min = _aligned(rolled.min())

    for col in cols:
        weather_df[f'{col}_min_lag{window}'] = lag_min[col]
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
        weather_df[f'{col}_max_lag{window}'] = lag_max[col]
    return weather_df

# Load data

In [5]:
def load_data(source='train'):
    """Read ``train.csv`` or ``test.csv`` from ``path`` and shrink its dtypes.

    ``source`` selects the file and must be ``'train'`` or ``'test'``. The
    ``timestamp`` column is parsed as datetimes, and the frame is passed
    through ``reduce_mem`` before being returned.
    """
    assert source in ['train','test']
    csv_file = f'{path}/{source}.csv'
    frame = pd.read_csv(csv_file, parse_dates=['timestamp'])
    return reduce_mem(frame)

def load_building():
    """Read ``building_metadata.csv`` from ``path`` and shrink its dtypes.

    Missing values in every column are replaced by ``-1`` before the frame
    is handed to ``reduce_mem``.
    """
    buildings = pd.read_csv(f'{path}/building_metadata.csv')
    buildings = buildings.fillna(-1)
    return reduce_mem(buildings)

def load_weather(source='train', fix_timezone=True, impute=True, add_lag=True):
    """Load a weather CSV and optionally shift, impute and enrich it.

    Parameters
    ----------
    source : {'train', 'test'}
        Selects ``weather_train.csv`` or ``weather_test.csv`` under ``path``.
    fix_timezone : bool, default True
        Subtract a fixed per-site number of hours from ``timestamp``
        (presumably to move the readings to each site's local time;
        confirm against the data description).
    impute : bool, default True
        Re-index every site onto a complete hourly grid (2016 for train,
        2017-2018 for test), linearly interpolate each column in both
        directions, and fill whatever is still missing with that column's
        median over all sites.
    add_lag : bool, default True
        Append the rolling features produced by ``add_lag_features`` with a
        window of 3.

    Returns
    -------
    pandas.DataFrame
        The processed weather table after ``reduce_mem``.
    """
    assert source in ['train','test']
    df = pd.read_csv(f'{path}/weather_{source}.csv', parse_dates=['timestamp'])
    if fix_timezone:
        # One hour offset per site; the list position is the site_id.
        offsets = [5,0,9,6,8,0,6,6,5,7,8,6,0,7,6,6]
        offset_map = dict(enumerate(offsets))
        df.timestamp = df.timestamp - pd.to_timedelta(df.site_id.map(offset_map), unit='h')
    if impute:
        # The target hourly grid and the global column medians do not depend
        # on the site, so they are computed once instead of once per site
        # (and once per site and column) inside the loop.
        # NOTE(review): recent pandas releases deprecate the 'H' alias in
        # favour of 'h'; it is kept so older pandas versions keep working.
        if source == 'train':
            new_idx = pd.date_range(start='2016-1-1', end='2016-12-31-23', freq='H')
        else:
            new_idx = pd.date_range(start='2017-1-1', end='2018-12-31-23', freq='H')
        value_cols = [c for c in df.columns if c not in ('site_id', 'timestamp')]
        medians = {c: df[c].median() for c in value_cols}

        site_dfs = []
        for site in df.site_id.unique():
            site_df = df[df.site_id == site].set_index('timestamp').reindex(new_idx)
            # Rows introduced by the reindex have no site_id yet.
            site_df['site_id'] = site
            for col in value_cols:
                site_df[col] = site_df[col].interpolate(limit_direction='both', method='linear')
                # Only a column with no usable values for this site still has gaps here.
                site_df[col] = site_df[col].fillna(medians[col])
            site_dfs.append(site_df)
        df = pd.concat(site_dfs)
        df['timestamp'] = df.index
        df = df.reset_index(drop=True)

    if add_lag:
        df = add_lag_features(df, window=3)

    return reduce_mem(df)

def merged_dfs(source='train', fix_timezone=True, impute=True, add_lag=True):
    """Build the modelling table by joining meter, building and weather data.

    Meter readings are left-joined with the building metadata on
    ``building_id`` and then with the weather table on
    ``site_id`` and ``timestamp``. The three weather flags are forwarded
    unchanged to ``load_weather``.

    For ``source='train'`` the result is a tuple ``(X, y)`` where ``y`` is
    ``log1p`` of ``meter_reading`` and ``X`` is everything else. For
    ``source='test'`` the merged frame itself is returned.
    """
    meters = load_data(source=source)
    df = meters.merge(load_building(), on='building_id', how='left')

    weather = load_weather(
        source=source,
        fix_timezone=fix_timezone,
        impute=impute,
        add_lag=add_lag,
    )
    df = df.merge(weather, on=['site_id','timestamp'], how='left')

    if source == 'test':
        return df
    if source == 'train':
        features = df.drop('meter_reading', axis=1)
        target = np.log1p(df.meter_reading)  # log-transform of target
        return features, target

In [6]:
%%time
# add_lag=False skips the rolling weather features; the timezone shift and
# the imputation keep their default of True.
X_train, y_train = merged_dfs(add_lag=False)
X_train.head()

CPU times: user 21.3 s, sys: 4.39 s, total: 25.7 s
Wall time: 31 s


Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0,0,7432,2008.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
1,1,0,2016-01-01,0,0,2720,2004.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
2,2,0,2016-01-01,0,0,5376,1991.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
3,3,0,2016-01-01,0,0,23685,2002.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
4,4,0,2016-01-01,0,0,116607,1975.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0


# Outlier removal and basic FE

Let us remove the first 141 days of electrical meter readings at site 0, which are mostly zero or contain anomalous spikes. This is the type of outlier which causes the most trouble and is comparatively easier to remove. We also extract some basic temporal features.

In [7]:
def _delete_bad_sitezero(X, y):
    """Drop the meter-0 rows of site 0 that are not later than 2016-05-20.

    A row is kept when at least one of these holds: its timestamp is after
    ``'2016-05-20'``, it does not belong to site 0, or its meter is not 0.
    ``y`` is re-indexed to the surviving rows of ``X``. Both are returned
    with a fresh 0..n-1 index; the inputs are not modified.
    """
    after_cutoff = X.timestamp > '2016-05-20'
    other_site = X.site_id != 0
    other_meter = X.meter != 0
    keep = after_cutoff | other_site | other_meter

    X_kept = X[keep]
    y_kept = y.reindex_like(X_kept)
    return X_kept.reset_index(drop=True), y_kept.reset_index(drop=True)

def _extract_temporal(X):
    """Add ``hour`` and ``weekday`` columns derived from ``X.timestamp``.

    The two columns are written onto ``X`` itself; the value returned is the
    memory-reduced copy produced by ``reduce_mem``.
    """
    stamp = X.timestamp.dt
    X['hour'] = stamp.hour
    X['weekday'] = stamp.weekday
    # month and year cause overfit, could try other (holiday, business, etc.)
    reduced = reduce_mem(X)
    return reduced

In [8]:
# preprocessing
# Drop the unreliable early meter-0 readings of site 0, then derive the
# hour and weekday columns from the timestamp.
X_train, y_train = _delete_bad_sitezero(X_train, y_train)
X_train = _extract_temporal(X_train)

# remove timestamp and other unimportant features
# (hour and weekday have already been extracted from timestamp above)
to_drop = ['timestamp','sea_level_pressure','wind_direction','wind_speed']
X_train.drop(to_drop, axis=1, inplace=True)

# Ask the garbage collector to release the intermediate frames.
gc.collect()

57

In [9]:
# Show column dtypes and the total memory footprint of the preprocessed features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19869886 entries, 0 to 19869885
Data columns (total 13 columns):
 #   Column             Dtype  
---  ------             -----  
 0   building_id        uint16 
 1   meter              uint8  
 2   site_id            uint8  
 3   primary_use        uint8  
 4   square_feet        uint32 
 5   year_built         float32
 6   floor_count        float32
 7   air_temperature    float32
 8   cloud_coverage     float32
 9   dew_temperature    float32
 10  precip_depth_1_hr  float32
 11  hour               uint8  
 12  weekday            uint8  
dtypes: float32(6), uint16(1), uint32(1), uint8(5)
memory usage: 663.2 MB


Save target and training set without embeddings.

In [10]:
# Wrap the target Series in a single-column DataFrame so it can be written with DataFrame.to_feather below.
y_train = y_train.to_frame()

In [11]:
# Persist the embedding-free features and the log-transformed target in Feather format.
X_train.to_feather('X_train.feather')
y_train.to_feather('y_train.feather')

# Concatenate embedding vectors

Now let us load the categorical embeddings learned with a NN [here](https://www.kaggle.com/michelezoccali/ashrae-with-fast-ai-part-2). 

This step is more easily performed on the GPU, where the original model was trained. It is sometimes necessary to perform this step on the CPU, however, so let us see here the simple changes required.

In [12]:
import pickle
from fastai.tabular.all import *

# Unpickler subclass that lets a model pickled with CUDA tensors be loaded on a CPU-only machine
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that remaps torch storages to the CPU while loading.

    When the pickle stream asks for ``torch.storage._load_from_bytes``, a
    replacement is returned that calls ``torch.load`` with
    ``map_location='cpu'``. Every other lookup is delegated to the standard
    ``pickle.Unpickler``.
    """

    def find_class(self, module, name):
        """Resolve ``module.name``, substituting the CPU-mapping storage loader."""
        is_storage_loader = module == 'torch.storage' and name == '_load_from_bytes'
        if not is_storage_loader:
            return super().find_class(module, name)

        def _load_on_cpu(raw_bytes):
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return _load_on_cpu

# NOTE(security): unpickling can execute code stored in the file, so only load pickles from a trusted source.
with open('../input/ashrae-with-fast-ai-part-2/tabular_nn.pickle', mode='rb') as f:
    # A plain pickle.load(f) is replaced by the CPU-remapping unpickler defined above.
    learn = CPU_Unpickler(f).load()

In [13]:
# Categorical columns to replace; the position in this list is also the
# index of the matching embedding layer in ``learn.embeds``.
cat_features = ['meter','site_id','primary_use','hour','weekday']

def add_embeds(learn, x):
    """Replace each categorical column of ``x`` by its learned embedding vector.

    For the i-th name in ``cat_features`` the column values are fed through
    ``learn.embeds[i]``. The column is then dropped and the resulting
    vector components are appended as ``{name}_0 .. {name}_{d-1}``.

    Parameters
    ----------
    learn : object exposing ``embeds``
        Trained model (or learner) whose ``embeds[i]`` is callable on an
        int64 tensor and returns a 2-D tensor of embedding vectors.
    x : pandas.DataFrame
        Frame containing every column listed in ``cat_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``x`` itself is not modified.

    NOTE(review): the raw column values are used directly as embedding
    indices. If the network was trained on codes shifted by a preprocessing
    step (for example one that reserves index 0 for missing values), the
    lookup here would be off by one. Confirm against the training kernel.
    """
    x = x.copy()
    # Inference only: disabling autograd avoids recording a graph for every
    # row and guarantees the result can be converted to a NumPy array.
    with torch.no_grad():
        for i, cat in enumerate(cat_features):
            emb = learn.embeds[i]
            vec = tensor(x[cat], dtype=torch.int64) # this is on cpu
            # Hand pandas a plain ndarray rather than a torch tensor.
            emb_data = emb(vec).detach().cpu().numpy()
            emb_names = [f'{cat}_{j}' for j in range(emb_data.shape[1])]

            emb_df = pd.DataFrame(emb_data, index=x.index, columns=emb_names)
            x = x.drop(columns=cat)
            x = x.join(emb_df)
    return x

In [14]:
# Swap the categorical columns for their embedding vectors, then ask the garbage collector to reclaim the old frame.
X_train = add_embeds(learn, X_train)
gc.collect()

40

In [15]:
# Show the columns and dtypes again now that the embedding components have been added.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19869886 entries, 0 to 19869885
Data columns (total 43 columns):
 #   Column             Dtype  
---  ------             -----  
 0   building_id        uint16 
 1   square_feet        uint32 
 2   year_built         float32
 3   floor_count        float32
 4   air_temperature    float32
 5   cloud_coverage     float32
 6   dew_temperature    float32
 7   precip_depth_1_hr  float32
 8   meter_0            float32
 9   meter_1            float32
 10  meter_2            float32
 11  meter_3            float32
 12  site_id_0          float32
 13  site_id_1          float32
 14  site_id_2          float32
 15  site_id_3          float32
 16  site_id_4          float32
 17  site_id_5          float32
 18  site_id_6          float32
 19  site_id_7          float32
 20  primary_use_0      float32
 21  primary_use_1      float32
 22  primary_use_2      float32
 23  primary_use_3      float32
 24  primary_use_4      float32
 25  primary_use_5   

Now save the DataFrame with the embeddings.

In [16]:
# Persist the embedding-augmented features in Feather format.
X_train.to_feather('X_embeds.feather')

Now on to modeling. Take a look at [Part 2](https://www.kaggle.com/michelezoccali/lgbm-with-entity-embeddings-part-2).