In [1]:
import pandas as pd
# Read the train and test tables from the shared data folder.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# pop() removes the column from the frame and returns it as a Series,
# so the identifiers are kept but no longer act as a feature.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

# Separate the target from the feature columns.
SalePrice = train.pop('SalePrice')

### preprocessing steps:

In [2]:
# Partition the feature names in one pass: object-dtype columns are treated as
# categorical, every other dtype as numerical.
categorical, numerical = [], []
for column, column_dtype in train.dtypes.items():
    if column_dtype == 'O':
        categorical.append(column)
    else:
        numerical.append(column)

### preprocessing steps as done with elasticnet

Perhaps we should have a look at the [documentation](https://lightgbm.readthedocs.io/en/latest/Parameters.html).

In [3]:
import lightgbm
# Hyper-parameters of the LightGBM regressor used with the one-hot encoded
# preprocessing; collected in a dict so they can be read and tweaked in one place.
lgbm_params = dict(
    boosting_type='gbdt',  # an alias for the boosting_type is 'gbrt' = gradient boosted regression trees
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=500,
    subsample_for_bin=20000,
    objective='regression',
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    n_jobs=-1,
    silent=False,
    importance_type='split',
    scale_pos_weight=1.0,
    reg_alpha=0.05,   # L1
    reg_lambda=0.05,  # L2
)
model = lightgbm.LGBMRegressor(**lgbm_params)

In [4]:
import sys
import os
import numpy as np
sys.path.append(os.path.abspath('../scripts'))
from interactions_transformer import InteractionsTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Box-Cox needs strictly positive values, so the numerical columns are shifted
# by one.  The shift is applied only while some column still contains values
# <= 0, which keeps this cell idempotent: re-running it (or running another
# cell that applies the same shift) does not move the data a second time.
if (train[numerical].min() <= 0).any():
    train[numerical] += 1

# Numerical branch: fill missing values with the column mean, then Box-Cox.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', PowerTransformer(method='box-cox'))])

# Categorical branch: dense one-hot encoding; categories unseen during fit are
# encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical),
        ])

# Despite its name, `clf` holds the array of per-fold scores (negated MSE on
# the log1p(SalePrice) scale), not an estimator.
clf = cross_val_score(
        make_pipeline(
            preprocessor,
            InteractionsTransformer(),
            model
            ),
         train, np.log1p(SalePrice), scoring = 'neg_mean_squared_error'
)

starting feature importance
starting feature importance


  x = um.multiply(x, x, out=x)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21186
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 286
[LightGBM] [Info] Start training from score 12.021409


starting feature importance


  x = um.multiply(x, x, out=x)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22552
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 296
[LightGBM] [Info] Start training from score 12.023288


starting feature importance


  x = um.multiply(x, x, out=x)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19135
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 299
[LightGBM] [Info] Start training from score 12.020737


starting feature importance


  x = um.multiply(x, x, out=x)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21693
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 296
[LightGBM] [Info] Start training from score 12.032956


starting feature importance


  x = um.multiply(x, x, out=x)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19413
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 284
[LightGBM] [Info] Start training from score 12.021897


In [None]:
# `clf` holds the negated per-fold MSE values returned by cross_val_score above:
# flip the sign, take the root per fold and average -> mean RMSE on the
# log1p(SalePrice) scale.
np.mean(np.sqrt(-clf))

### preprocessing steps in the lightgbm-way

In [None]:
import lightgbm
# Same hyper-parameters as the first LightGBM model, plus the list of
# categorical column names.
# NOTE(review): the scikit-learn wrapper documents `categorical_feature` as an
# argument of fit(); here it is forwarded as a constructor keyword - confirm it
# has the intended effect with the installed LightGBM version.
lgbm_native_params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=500,
    subsample_for_bin=20000,
    objective='regression',
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    n_jobs=-1,
    silent=False,
    importance_type='split',
    scale_pos_weight=1.0,
    reg_alpha=0.05,   # L1
    reg_lambda=0.05,  # L2
    categorical_feature=categorical,
)
model = lightgbm.LGBMRegressor(**lgbm_native_params)

In [None]:
numeric_features, categorical_features = numerical, categorical

# Cast every categorical column to the pandas 'category' dtype in one go; the
# model then receives these columns directly, without one-hot encoding.
train[categorical_features] = train[categorical_features].astype('category')

# The pipeline consists of the model only - no preprocessing step.
lgbm_only = make_pipeline(model)
clf = cross_val_score(
    lgbm_only,
    train,
    np.log1p(SalePrice),
    scoring='neg_mean_squared_error',
)

In [None]:
# Mean of the per-fold RMSEs (on the log1p(SalePrice) scale) for the model that
# was fed the 'category' columns directly; `clf` is the negated-MSE array from
# cross_val_score above.
np.mean(np.sqrt(-clf))

In [None]:
# Grid search over tree depth, number of trees and learning rate for the
# LightGBM model (5-fold CV, negated MSE on the log1p(SalePrice) scale).
#
# LightGBM only performs bagging when the bagging frequency is non-zero.  The
# model is built with subsample_freq=0, so subsample=0.8 alone would be
# silently ignored; subsample_freq=1 re-samples the rows at every iteration and
# makes the 0.8 row fraction effective.
#
# refit=False: only the search results are needed afterwards, no final model is
# trained on the full data.
clf = GridSearchCV(
        make_pipeline(
            model
            ),
            param_grid={'lgbmregressor__max_depth': [3, 4, 5],
                        'lgbmregressor__n_estimators' : [900, 1000, 1100],
                        'lgbmregressor__learning_rate': 10**(np.linspace(-2, 0, 3)),
                        'lgbmregressor__subsample': [0.8],
                        'lgbmregressor__subsample_freq': [1],
                        'lgbmregressor__reg_alpha': [0.0],
                        'lgbmregressor__reg_lambda': [0.1]
},
         cv=5, refit=False, scoring = 'neg_mean_squared_error'
)
clf.fit(train, np.log1p(SalePrice))

In [None]:
# Best parameter combination and its RMSE (root of the negated best mean score).
# The per-candidate scores are available in clf.cv_results_ (the older
# clf.grid_scores_ attribute no longer exists in current scikit-learn).
clf.best_params_, np.sqrt(-clf.best_score_)

### the GBM of sklearn allows for single-tree predictions
We discussed this in the first session: the leaf information of the individual trees of a GBM could serve as interesting categorical features for linear classifiers. For the example with elasticnet, we chose the most important variables and added their interaction terms to the other variables. The first trees of a GBM should automatically contain the most important variable splits and interactions.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import FunctionTransformer
# subsample < 1 makes the boosting stochastic; a fixed random_state keeps the
# fitted trees - and therefore the tree-derived features built from them -
# reproducible across kernel restarts.
model = GradientBoostingRegressor(learning_rate = 0.01, max_depth=4, n_estimators=100, subsample=0.8,
                                  random_state=0)

train[numerical] = train[numerical].astype('float32')

# Numerical columns: median imputation only (no scaling step in this pipeline).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median'))])
# Categorical columns: dense one-hot encoding, unseen categories are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
pipeline = make_pipeline(
            ColumnTransformer(transformers=[('num', numeric_transformer, numerical),
                                           ('empty', categorical_transformer, categorical)]),
            model
            )
pipeline.fit(train, np.log1p(SalePrice))

In [None]:
# Turn the predictions of the first 15 boosting stages into categorical features.
# The pipeline is already fitted (its estimators_ are read below), so the
# feature matrix is produced with transform() once, outside the loop, instead
# of re-fitting the column transformer on every iteration.
gbm_stage = pipeline['gradientboostingregressor']
train_matrix = pipeline['columntransformer'].transform(train)

tree_features = []
for j in range(0, 15):
    feature_name = 'gbm_feature_' + str(j)
    # estimators_[j][0] is the single regression tree of boosting stage j; its
    # raw prediction is stored and then treated as a category label.
    train[feature_name] = gbm_stage.estimators_[j][0].predict(train_matrix)
    train[feature_name] = train[feature_name].astype('category')
    tree_features.append(feature_name)


In [None]:
# Register the tree-derived columns as categorical features.  Names that are
# already listed are skipped, so re-running this cell does not add a column twice.
categorical = categorical + [name for name in tree_features if name not in categorical]

In [None]:
# Inspect the column index; the gbm_feature_* columns added above should be listed.
train.columns

In [None]:
import sys
import os
import numpy as np
from sklearn.linear_model import ElasticNet
sys.path.append(os.path.abspath('../scripts'))
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

numeric_features, categorical_features = numerical, categorical

# One-hot encode the categorical columns (now including the tree-derived ones);
# categories unseen during fit are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Mean-impute the numerical columns, then apply a Box-Cox transform.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', PowerTransformer(method='box-cox')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
])

# Search grid: 5 regularisation strengths (log-spaced) x 6 L1/L2 mixing ratios.
elasticnet_grid = {
    'elasticnet__alpha': 10**(np.linspace(-2, 0.2, 5)),
    'elasticnet__l1_ratio': np.linspace(0, 1, 6),
}

# Only defined here; the fit happens in the next cell.
clf = GridSearchCV(
    make_pipeline(preprocessor, ElasticNet()),
    param_grid=elasticnet_grid,
    cv=5,
    refit=True,
    scoring='neg_mean_squared_error',
)


In [None]:
# Box-Cox requires strictly positive inputs.  Shift the numerical columns by one
# only if some value is still <= 0, so that the shift is not stacked on top of
# an earlier one and re-running this cell does not move the data again.
if (train[numerical].min() <= 0).any():
    train[numerical] += 1
clf.fit(train, np.log1p(SalePrice))

In [None]:
# RMSE of the best grid point (root of the negated best mean CV score, on the
# log1p(SalePrice) scale) followed by the parameters that achieved it.
display(np.sqrt(-clf.best_score_), clf.best_params_)

### this is an interesting exercise for our brains: what is happening here???
  - can we do stacking?
  - what might be the effect of the learning-rate in the gbm?
  - can we say, the gbm breaks non-linearities in continuous variables and is modelling interactions?
  - or: is the linear model applying individual weights to each tree - instead of one learning rate for the whole tree? After fitting the individual trees sequentially (gbm), is the linear model re-weighting the single-leaf predictions in one step, all together?

### What we could do:
  - split the whole data-set into 5 parts (sklearn.model_selection.StratifiedKFold)
  - train the GradientBoostingRegressor on 4 parts of the data
  - apply the fitted GradientBoostingRegressor to the train-set (4 parts) as well as to the test-set (1 part)
  - run a parameter search for elastic-net on the train-set (4 parts) and apply the result to the test-set (1 part)
  - collect the results in a list
  - repeat 5 times and report the average result
  - perhaps repeat and change the learning-rate of the GradientBoostingRegressor until we find (no more) overfitting

In [None]:
import pandas as pd
# Start again from the untouched CSV files so the columns and shifts added by
# the earlier cells are discarded.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# pop() removes the column from the frame and returns it as a Series.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

SalePrice = train.pop('SalePrice')

# Every column that is not of object dtype counts as numerical.
numerical = [name for name, kind in train.dtypes.items() if kind != 'O']

In [None]:
from sklearn.model_selection import StratifiedKFold
# Outer 5-fold evaluation of "GBM tree features + ElasticNet".
#
# StratifiedKFold stratifies on discrete labels.  SalePrice is continuous, so
# the folds are stratified on price deciles instead of on the raw prices (which
# would form classes with a single member each).
price_bins = pd.qcut(SalePrice, q=10, labels=False, duplicates='drop')
skf = StratifiedKFold(n_splits=5)

# The raw categorical columns are the same for every fold.
raw_categorical = [var for var in train.columns if train[var].dtype=='O']

performance = []
for train_index, test_index in skf.split(train, price_bins):

    # split() yields positional indices, hence iloc.  Explicit copies make sure
    # the per-fold columns and the +1 shift below never touch `train` itself.
    train_X = train.iloc[train_index].copy()
    test_X = train.iloc[test_index].copy()
    train_y = SalePrice.iloc[train_index]
    test_y = SalePrice.iloc[test_index]

    # Fit the GBM pipeline on the training part of the fold only.
    pipeline.fit(train_X, np.log1p(train_y))
    gbm_stage = pipeline['gradientboostingregressor']
    column_stage = pipeline['columntransformer']

    # Transform both parts once with the transformer fitted above, instead of
    # re-fitting it inside the loop for every tree.
    train_matrix = column_stage.transform(train_X)
    test_matrix = column_stage.transform(test_X)

    # Predictions of the first 15 boosting stages become categorical features.
    tree_features = []
    for j in range(0, 15):
        feature_name = 'gbm_feature_' + str(j)
        stage_tree = gbm_stage.estimators_[j][0]
        train_X[feature_name] = stage_tree.predict(train_matrix)
        train_X[feature_name] = train_X[feature_name].astype('category')
        test_X[feature_name] = stage_tree.predict(test_matrix)
        test_X[feature_name] = test_X[feature_name].astype('category')
        tree_features.append(feature_name)

    categorical = raw_categorical + tree_features

    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', PowerTransformer(method='box-cox'))])

    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical),
            ('cat', categorical_transformer, categorical),
            ])

    # Inner 5-fold grid search for the ElasticNet on the training part.
    clf = GridSearchCV(
            make_pipeline(
                preprocessor,
                ElasticNet()
                ),
                param_grid={'elasticnet__alpha': 10**(np.linspace(-2, 0.2, 5)),
                            'elasticnet__l1_ratio': np.linspace(0, 1, 6)},
             cv=5, refit=True, scoring = 'neg_mean_squared_error'
    )

    # Box-Cox needs strictly positive values; the fold frames are fresh copies,
    # so the shift is applied exactly once per fold.
    train_X[numerical] += 1
    test_X[numerical] += 1
    clf.fit(train_X, np.log1p(train_y))

    # RMSE of the refitted best model on the held-out part (log1p scale).
    performance.append(np.sqrt(np.mean((clf.predict(test_X)-np.log1p(test_y))**2)))




In [None]:
# Held-out RMSE of each outer fold (log1p(SalePrice) scale), in fold order.
performance



In [None]:
# Average of the per-fold held-out RMSEs collected in the loop above.
np.mean(performance)

What seemed to be a good idea turns out to be __overfitting__.