In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

import lightgbm as lgb

from funs import utils

In [2]:
%%time
train_df, test_df, sample_sub = utils.load_clean_traintest_csv()

Wall time: 11.4 s


In [3]:
# before dropping, this is needed
submission = test_df[["fullVisitorId"]].copy()

In [4]:
dev_X, dev_y, val_X, val_y, test_X = utils.create_devvaltest_based_on_date(train_df, test_df)

In [5]:
col_to_drop_for_sure = ['visitStartTime', 'date']
col_to_drop_bc_target = ['totals.transactionRevenue', 'totals.transactionRevenue_log1p']
col_to_drop_for_now  = ['fullVisitorId', 'sessionId', 'trafficSource.referralPath', 'visitId']

col_to_drop = col_to_drop_for_sure + col_to_drop_bc_target + col_to_drop_for_now

In [6]:
# in dev_X and val_X: remove them all
for data in [dev_X, val_X]:
    data.drop(col_to_drop, axis=1, inplace=True)
    
# in test_X, there is no target
test_X.drop(col_to_drop_for_sure + col_to_drop_for_now, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


-------

In [7]:
# assertions
assert dev_X.shape[1] == val_X.shape[1], 'num of columns of dev and val not equal'
assert (~(dev_X.columns == val_X.columns)).sum() == 0, 'col names of dev and val are same'

------

In [8]:
%%time
# custom function to run light gbm model
# source:   https://www.kaggle.com/sudalairajkumar/simple-exploration-baseline-ga-customer-revenue

def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model

# Training the model #
pred_test, model = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 1.6886
[200]	valid_0's rmse: 1.68738
Early stopping, best iteration is:
[195]	valid_0's rmse: 1.68705
Wall time: 12 s


In [10]:
submission = utils.create_submission(existing_submission=submission, 
                                     pred_test = pred_test,
                                     MODUS_groupby = 'sum', 
                                     output_file_name = './submissions/submission_test.csv')

In [11]:
# feature importances
print('Feature importances:', list(model.feature_importance()))

Feature importances: [67, 254, 33, 25, 32, 185, 195, 54, 94, 190, 213, 96, 104, 31, 735, 52, 706, 6, 2, 26, 1, 0, 0, 2, 68, 40, 67, 148, 13, 51, 179, 92, 181, 188, 71, 104, 485, 220, 22, 318, 305]


In [None]:
# follow-up

In [None]:
# https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

In [None]:
param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [40, 60, 100, 200]
}

gbm = GridSearchCV(estimator, param_grid, cv=3)

gbm.fit(dev_X, dev_y)

In [None]:
gbm.best_estimator_

In [None]:
pred_test_grid = gbm.best_estimator_.predict(test_X)