In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from mlens.ensemble import SuperLearner
from mlens.utils import pickle_save, pickle_load
from load_data_utils import *

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
[MLENS] backend: threading


# IMPORT DATA FUNCTIONS

In [21]:
train = load_train_data()

inital load (20216100, 4)
after building df load (20216100, 9)
after weather load (20216100, 18)


In [2]:
categorical = ["site_id", 
               "building_id", 
               "primary_use",
               'floor_count',
               "hour", 
               "day", 
               "weekend", 
               "month", 
               "meter"]

numerical = ["square_feet_log",
             "year_built",
            "air_temperature",
             "cloud_coverage",]

label_cols = ['primary_use', 'floor_count']

train_x, train_y, train_dict = prep_train_data(categorical, numerical, label_cols)

inital load (20216100, 4)
after building df load (20216100, 9)
after weather load (20216100, 18)
Memory usage of properties dataframe is : 3007.613754272461  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  meter_reading
dtype before:  float64
min for this col:  0.0
max for this col:  21904700.0
dtype after:  float32
******************************
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  primary_use
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
***************************

In [13]:
train_x.head()

Unnamed: 0,site_id,building_id,primary_use,floor_count,hour,day,weekend,month,meter,square_feet_log,year_built,air_temperature,cloud_coverage
0,0,0,0,3,0,1,4,1,0,0.406656,0.423529,0.712062,0.023529
1,0,1,0,3,0,1,4,1,0,0.281582,0.407843,0.712062,0.023529
2,0,2,0,3,0,1,4,1,0,0.366359,0.356863,0.712062,0.023529
3,0,3,0,3,0,1,4,1,0,0.550878,0.4,0.712062,0.023529
4,0,4,0,3,0,1,4,1,0,0.749218,0.294118,0.712062,0.023529


In [4]:
test_x = prep_test_data(categorical, numerical, train_dict)

inital load (41697600, 4)
after building df load (41697600, 9)
after weather load (41697600, 18)
Memory usage of properties dataframe is : 6203.485107421875  MB
******************************
Column:  row_id
dtype before:  int64
min for this col:  0
max for this col:  41697599
dtype after:  uint32
******************************
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  primary_use
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
**********

In [14]:
LGBMRegressor(feature_fraction=0.7,
                                num_leaves=990, 
                                learning_rate=0.1, 
                                n_estimators=700, 
                                subsample_for_bin=200000,  
                                subsample=0.2, 
                                reg_alpha=0.1, 
                                reg_lambda=0.1, 
                                )

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.7, importance_type='split', learning_rate=0.1,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=700, n_jobs=-1, num_leaves=990,
              objective=None, random_state=None, reg_alpha=0.1, reg_lambda=0.1,
              silent=True, subsample=0.2, subsample_for_bin=200000,
              subsample_freq=0)

In [15]:
XGBRegressor(max_depth=7,
                                   learning_rate=0.1,
                                   n_estimators=1200,
                                   gamma=0.1, 
                                   subsample=0.7, 
                                   reg_alpha=0.1, 
                                   reg_lambda=0.1, 
                                   num_parallel_tree=-1
                                  )

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.1,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=1200,
             n_jobs=1, nthread=None, num_parallel_tree=-1,
             objective='reg:linear', random_state=0, reg_alpha=0.1,
             reg_lambda=0.1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.7, verbosity=1)

In [13]:
#Import preprocessed data
train_df = pd.read_csv('./notebooks/data/train_fe.csv')

# SUPER LEARNER FUNCTIONS

In [10]:
def get_models():
    models = list()
    models.append(LinearRegression())
    models.append(ElasticNet())
    models.append(RandomForestRegressor(n_estimators=1000))
    models.append(KNeighborsRegressor(n_neighbors=3))
    models.append(LGBMRegressor(feature_fraction=0.7,
                                num_leaves=990, 
                                learning_rate=0.1, 
                                n_estimators=700, 
                                subsample_for_bin=200000,  
                                subsample=0.2, 
                                reg_alpha=0.1, 
                                reg_lambda=0.1, 
                                ))
    return models

In [11]:
def rmsle(yreal, yhat):
    return np.sqrt(np.mean(np.square(np.log1p(yreal) - np.log1p(yhat))))

In [12]:
def get_super_learner(X):
    ensemble = SuperLearner(scorer=rmsle, 
                            folds=3, 
                            shuffle=True,
                            sample_size=len(X),
                            verbose=2
                           )
    models_set_1 = get_models()
    models_set_2 = get_models()
    models_set_3 = get_models()
    ensemble.add(models_set_1)
    ensemble.add(models_set_2)
    ensemble.add(models_set_3)
    ensemble.add_meta(LGBMRegressor(feature_fraction=0.7,
                                num_leaves=990, 
                                learning_rate=0.1, 
                                n_estimators=1800, 
                                subsample_for_bin=200000,  
                                subsample=0.2, 
                                reg_alpha=0.2, 
                                reg_lambda=0.2, 
                                ))
    return ensemble

In [21]:
xgb_test = XGBRegressor(max_depth=7,
               learning_rate=0.1,
               n_estimators=1200,
               gamma=0.1, 
               subsample=0.7, 
               reg_alpha=0.1, 
               reg_lambda=0.1, 
               num_parallel_tree=500)

In [None]:
xgb_test.fit(train_df, target)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




# Training Runs

In [14]:
target = train_df['meter_reading']
del train_df['meter_reading']

In [15]:
train_x, val_x, train_y, val_y = train_test_split(train_df, target, test_size=0.6)

In [8]:
target.value_counts()

0.0000        1531231
20.0000         23363
2.9307          23181
36.6000         22154
8.7921          21787
               ...   
12186.6000          1
12197.9000          1
36.9656             1
4.7889              1
96.4177             1
Name: meter_reading, Length: 1682937, dtype: int64

In [None]:
ensemble = get_super_learner(train_x.values)
ensemble.fit(train_x.values, train_y.values)
print(ensemble.data)


Fitting 4 layers
Processing layer-1             

  
  
  
  
  
  
  


In [20]:
yhat= ensemble.predict(val_x)
print('Super Learner: RMSLE %.5f' % (rmsle(val_y, yhat)))


Predicting 2 layers
Processing layer-1             done | 00:23:26
Processing layer-2             done | 00:00:00
Predict complete                    | 00:23:27
Super Learner: RMSE 0.17556


In [25]:
pickle_save(ensemble, 'sl_model_v1.pkl')

In [10]:
ensemble_v1=pickle_load('sl_model_v1.pkl')

In [11]:
predictions = ensemble_v1.predict(test_x)


Predicting 2 layers




Processing layer-1             done | 00:52:34
Processing layer-2             done | 00:00:00
Predict complete                    | 00:52:36


In [13]:
predictions.shape

(41697600,)

In [16]:
def make_submission(predicitons):
    submission = pd.read_csv('/Users/ns/code/nicholasjhana/ashrae-energy-prediction/data/sample_submission.csv')

    submission['meter_reading'] = predictions
    submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
    submission.to_csv('submission_sl_v2.csv', index=False)
    return submission

In [17]:
submission = make_submission(predictions)

In [60]:
def gen_fib():
    initial_fibs = [1, 1]
    fibs = []
    for x in range(1,20):
        if x==1:
            fibs.append(initial_fibs[x])
            fibs.append(initial_fibs[x])
            new_fib = initial_fibs[x] * 2
        else:
            new_fib = fibs[x] + fibs[x-1]
        fibs.append(new_fib)
        yield fibs



In [56]:
gen_fib()

[1,
 1,
 2,
 3,
 5,
 8,
 13,
 21,
 34,
 55,
 89,
 144,
 233,
 377,
 610,
 987,
 1597,
 2584,
 4181,
 6765,
 10946]

In [63]:
next(gen_fib())

[1, 1, 2]