In [8]:
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import numpy as np
import pandas as pd
import lightgbm as lgb
import sys
sys.path.append('../models/blend/')
from utils import customized_eval

In [9]:
#---------------------
# setting
#---------------------
# Remains None unless the label column turns out to be binary (set to True below).
BINARY_SCENARIO = None
#---------------------
# load features
#---------------------
feature_dir = '../features/lazada_and_amazon/all_features.h5'
df = pd.read_hdf(feature_dir)
#---------------------
# label post-processing
#---------------------
if df['label'].nunique() == 2:
    # binary class: raw label 2 becomes 1, every other value becomes 0
    # (required by the customized f1 score inference of lgb).
    BINARY_SCENARIO = True
    df['label'] = (df['label'] == 2).astype('int64')
# otherwise: multi-class (B, I or O) -> labels are left untouched

In [10]:
# Sanity check of the loaded feature table: (number of rows, number of columns).
df.shape

(238668, 575)

In [11]:
#---------------------
# setting
#---------------------
target_col = 'label'
# The first columns of `df` are not used as model inputs; everything from this
# position onwards is treated as a feature.
# NOTE(review): confirm that `target_col` and the id columns all sit inside this
# leading slice, otherwise the label would leak into `features`.
N_LEADING_NON_FEATURE_COLS = 7
features = df.columns.tolist()[N_LEADING_NON_FEATURE_COLS:]
# parameters
# NOTE(review): these values appear to match LightGBM's documented defaults and
# are not referenced by the other cells visible here - confirm they are still needed.
NUM_LEAVES = 31
COLSAMPLE_BYTREE = 1.0
SUBSAMPLE = 1.0
SUBSAMPLE_FREQ = 0
MAX_DEPTH = -1
REG_LAMBDA = 0.0  # (was assigned twice; the duplicate line has been removed)
REG_ALPHA = 0.0
MIN_SPLIT_GAIN = 0
MIN_CHILD_WEIGHT = 0.001
MAX_BIN = 255


In [12]:

group_fold = True
target = 'label'
n_splits = 5
def lgb_eval(train, features, target_col,
             num_leaves,max_depth,lambda_l2,
             lambda_l1,min_child_samples,bagging_fraction,
             feature_fraction,
             group_col='item_id',
             num_boost_round=10000,
             early_stopping_rounds=100):
    '''
    Cross-validated objective function for Bayesian optimization of LightGBM
    hyper-parameters (binary objective, custom sentence-level f1 metric).

    Notice that:
        Bayesian optimization is designed to find the optimal value through maximization.
        So, if your target function is a loss function (for example rmse: the lower, the better),
        don't forget to negate the value you return.
        However, if your target function is an evaluation metric (for example f1-score:
        the higher, the better), there is no need to negate it when returning.

    Parameters
    ----------
    train : pd.DataFrame
        Table holding the feature columns, `target_col`, 'item_id' and `group_col`.
    features : list of str
        Names of the columns used as model inputs.
    target_col : str
        Name of the label column.
    num_leaves, max_depth, min_child_samples : number
        Tuned hyper-parameters; cast to int before being passed to LightGBM.
    lambda_l2, lambda_l1, bagging_fraction, feature_fraction : float
        Tuned hyper-parameters passed to LightGBM unchanged.
    group_col : str, default 'item_id'
        Column whose values must not be split across folds when the module-level
        `group_fold` flag is truthy.
    num_boost_round : int, default 10000
        Upper bound on boosting rounds per fold.
    early_stopping_rounds : int, default 100
        Stop a fold when the validation metric has not improved for this many rounds.

    Returns
    -------
    float
        Mean over the folds of the best validation 'f1-score-on-sentence-level'.

    Notes
    -----
    * Reads the module-level `group_fold` and `n_splits` at call time.
    * The previous implementation could not run on a fresh kernel: it used the
      undefined name `params`, relied on `val_idx` / `item_id_for_f1` leaked from
      another cell, grouped by a 'fullVisitorId' column, and handed `lgb.cv`
      indices into the array of unique ids instead of row indices.
    * `lgb.cv` evaluates `feval` on a different validation subset per fold, so a
      single fixed item-id array cannot be aligned with the predictions. Each fold
      is therefore trained explicitly with `lgb.train`.
    * NOTE(review): `customized_eval(data=..., threshold=..., verbose=...)` is assumed
      to return a LightGBM `feval` callable whose `data` must line up row-by-row
      with the validation set - confirm against `utils.customized_eval`.
    '''
    #--------------------
    # setting
    #--------------------
    fit_params = {
    "objective": 'binary',
    "metric": 'None', # Remember to specify metric == 'None' so that early stopping uses the custom evaluation metric.
    "num_leaves" : int(num_leaves),
    "max_depth" : int(max_depth),
    "lambda_l2" : lambda_l2,
    "lambda_l1" : lambda_l1,
    "num_threads" : 4,
    "min_child_samples" : int(min_child_samples),
    "learning_rate" : 0.03,
    "bagging_fraction" : bagging_fraction,
    "feature_fraction" : feature_fraction,
    "subsample_freq" : 5,
    "bagging_seed" : 42,
    "verbosity" : -1
    }
    col_need_for_computing_f1 = ['item_id']
    metric_name = 'f1-score-on-sentence-level'

    #--------------------
    # build the folds (row positions of `train`)
    #--------------------
    y = train[target_col].copy()
    row_ids = np.arange(train.shape[0])
    if group_fold:
        # All rows sharing the same group value land in the same fold.
        groups = train[group_col].astype(str).values
        fold_iter = GroupKFold(n_splits).split(row_ids, groups=groups)
    else:
        # Plain shuffled K-fold (the former lgb.cv call used stratified=False).
        fold_iter = KFold(n_splits, shuffle=True, random_state=0).split(row_ids)

    #--------------------
    # CV
    #--------------------
    fold_scores = []
    for trn_idx, val_idx in fold_iter:
        dtrain = lgb.Dataset(train[features].iloc[trn_idx],
                             label=y.iloc[trn_idx],
                             free_raw_data=False)
        dvalid = lgb.Dataset(train[features].iloc[val_idx],
                             label=y.iloc[val_idx],
                             free_raw_data=False,
                             reference=dtrain)
        # 1-D array of item ids aligned with the rows of this fold's validation set.
        item_id_for_customized_f1 = (train[col_need_for_computing_f1]
                                     .iloc[val_idx].values.reshape(len(val_idx)))
        evals_result = {}
        lgb.train(params=fit_params,
                  train_set=dtrain,
                  num_boost_round=num_boost_round,
                  valid_sets=dvalid,
                  evals_result=evals_result,
                  feval=customized_eval(data=item_id_for_customized_f1,
                                        threshold=0.5, verbose=False),
                  early_stopping_rounds=early_stopping_rounds,
                  verbose_eval=False)
        fold_scores.append(max(evals_result['valid_0'][metric_name]))
    return float(np.mean(fold_scores))

In [31]:
#--------------------
# setting
#--------------------

# Data and columns used by the cross-validation run.
data = df
target_col = 'label'
group_by = 'item_id'   # column used to form the CV groups
group_fold = True
n_splits = 2

# Only the objective and the metric are set here.
# Remember to specify metric == 'None' so that the custom evaluation metric
# drives the evaluation / early stopping.
fit_params = dict(objective='binary', metric='None')
# Disabled overrides, kept for reference:
# "num_leaves" : int(31),
# "max_depth" : int(1),
# "lambda_l2" : 1,
# "lambda_l1" : 1,
# "num_threads" : 4,
# "min_child_samples" : int(1),
# "learning_rate" : 0.03,
# "bagging_fraction" : 1,
# "feature_fraction" : 1,
# "subsample_freq" : 5,
# "bagging_seed" : 42,
# "verbosity" : -1



In [66]:
use_which_model = 'lgb'
col_need_for_computing_f1 = ['item_id']
cv_result = []  # best validation f1 of every fold
#--------------------
# CV
#--------------------

# String version of the grouping column, computed once and reused for every fold.
group_keys = data[group_by].astype(str)
unique_vis = np.array(sorted(group_keys.unique()))
folds = GroupKFold(n_splits)
ids = np.arange(data.shape[0]) # index of row for whole data

fold_ids = [] # saving training and validating index of row for each fold
for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
    # trn_vis / val_vis are positions inside `unique_vis` (i.e. they select group
    # values, not rows); they are mapped to row positions of `data` below.
    fold_ids.append([
            ids[group_keys.isin(unique_vis[trn_vis]).values],
            ids[group_keys.isin(unique_vis[val_vis]).values]
        ])

for fold_id, (trn_idx, val_idx) in enumerate(fold_ids):
    #---------------------
    # train-val split
    #---------------------
    devel = data[features].iloc[trn_idx]
    y_devel = data[target_col].iloc[trn_idx]
    valid = data[features].iloc[val_idx]
    y_valid = data[target_col].iloc[val_idx]

    # for custom_f1: 1-D array with shape of (num_samples,), where each element is an item_id
    item_id_for_f1 = data[col_need_for_computing_f1].iloc[val_idx].values.reshape(len(val_idx))
    print("Fold ", fold_id, ":")
    #--------------------
    # convert pd.DataFrame into lgb.Dataset
    #--------------------

    if use_which_model == 'lgb':
        dtrain = lgb.Dataset(devel, label= y_devel, free_raw_data = False)
        dvalid = lgb.Dataset(valid, label= y_valid, free_raw_data = False, reference= dtrain)

        evals_result = {} # for saving the evaluation metric of validating set during training
        model = lgb.train(params = fit_params,
                              train_set = dtrain,
                              num_boost_round = 10,
                              valid_sets = dvalid,
                              evals_result = evals_result,
                              feval = customized_eval(data = item_id_for_f1,threshold = 0.5, verbose = False),
                             )
        res_score_ls = evals_result['valid_0']['f1-score-on-sentence-level']
        cv_result.append(max(res_score_ls))
    else:
        # Previously an unknown model name was skipped silently, leaving
        # `cv_result` empty and the mean below equal to NaN.
        raise ValueError("unsupported use_which_model: %r" % (use_which_model,))
mean_cv_result = np.mean(cv_result)
print('mean_cv_result', mean_cv_result)

Fold  0 :
[1]	valid_0's f1-score-on-sentence-level: 0
[2]	valid_0's f1-score-on-sentence-level: 0
[3]	valid_0's f1-score-on-sentence-level: 0
[4]	valid_0's f1-score-on-sentence-level: 0
[5]	valid_0's f1-score-on-sentence-level: 0
[6]	valid_0's f1-score-on-sentence-level: 0.558525
[7]	valid_0's f1-score-on-sentence-level: 0.738095
[8]	valid_0's f1-score-on-sentence-level: 0.868014
[9]	valid_0's f1-score-on-sentence-level: 0.898331
[10]	valid_0's f1-score-on-sentence-level: 0.901428
Fold  1 :
[1]	valid_0's f1-score-on-sentence-level: 0
[2]	valid_0's f1-score-on-sentence-level: 0
[3]	valid_0's f1-score-on-sentence-level: 0
[4]	valid_0's f1-score-on-sentence-level: 0
[5]	valid_0's f1-score-on-sentence-level: 0
[6]	valid_0's f1-score-on-sentence-level: 0.538222
[7]	valid_0's f1-score-on-sentence-level: 0.741328
[8]	valid_0's f1-score-on-sentence-level: 0.865926
[9]	valid_0's f1-score-on-sentence-level: 0.895096
[10]	valid_0's f1-score-on-sentence-level: 0.9026
mean_cv_result 0.9020143788371