In [1]:
import pandas as pd
import lightgbm as lgb
import sys
sys.path.append('../models/blend/')

In [2]:
# ---- settings ----
# Stays None unless the label column turns out to hold exactly two classes.
BINARY_SCENARIO = None

# ---- load features ----
# NOTE: despite its name, feature_dir is the path of a single HDF5 file.
feature_dir = '../features/lazada_and_amazon/all_features.h5'
df = pd.read_hdf(feature_dir)

# ---- label post-processing ----
label_is_binary = df['label'].nunique() == 2
if label_is_binary:
    BINARY_SCENARIO = True
    # Binary case: raw label 2 becomes the positive class (1), every other
    # value becomes 0 -- needed by the customized f1 score used with lgb.
    df['label'] = (df['label'] == 2).astype('int64')
# Otherwise the labels are multi-class (B, I or O) and are kept as they are.

In [3]:
# (number of rows, number of columns) of the loaded feature table
df.shape

(238668, 575)

In [4]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,item_name,tokens,label,is_valid,clean_tokens,item_id,word_id,if_tokens_has_numbers_in_the_str,if_start_with_capital_chars,percentage_of_upper_chars_in_token,...,dim_292,dim_293,dim_294,dim_295,dim_296,dim_297,dim_298,dim_299,dim_300,tf_idf
0,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Samsung,1,train,samsung,1,7054,0,1,0.142857,...,-1.158214,-3.077862,4.290862,4.385282,4.017592,3.681826,-4.147452,7.35163,4.044105,0.169688
1,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Galaxy,0,train,galaxy,1,537,0,1,0.166667,...,-0.766951,-3.085364,4.41657,2.265839,4.203361,0.191773,-1.871406,9.093953,2.210504,0.194254
2,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,J1,0,train,j1,1,10138,1,1,0.5,...,-2.633618,1.691233,-3.085515,-1.096208,0.355705,0.258805,-0.344883,0.036787,1.097038,0.377184
3,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,Ace,0,train,ace,1,10469,0,1,0.333333,...,2.824487,0.047915,-1.890348,0.10357,0.44959,1.80883,0.450412,0.646871,0.522857,0.402201
4,Samsung Galaxy J1 Ace 2016 SM-J111F 8GB - White,2016,0,train,2016,1,9108,1,0,0.0,...,-0.603993,0.499419,0.748781,1.345417,-1.62176,0.5303,0.549422,1.383292,1.529812,0.362081


In [5]:
df.label.unique()  # distinct label values after post-processing; two values => binary target

array([1, 0])

In [6]:
# Distinct values of the is_valid column.
df.is_valid.unique()

array(['train', 'val'], dtype=object)

In [7]:
# List every column name; the trailing [:] only takes a full-slice copy of that list.
df.columns.tolist()[:]

['item_name',
 'tokens',
 'label',
 'is_valid',
 'clean_tokens',
 'item_id',
 'word_id',
 'if_tokens_has_numbers_in_the_str',
 'if_start_with_capital_chars',
 'percentage_of_upper_chars_in_token',
 'check_if_english_word',
 'len_of_token',
 'is_all_character_captilized',
 'is_all_character_lowercase',
 'consist_only_of_digits',
 'is_first_character_digit',
 'is_first_character_uppercase',
 'do_consist_hyphen',
 'if_it_is_and',
 'is_second_character_uppercase',
 'if_it_is_a_sale_word',
 'if_it_is_by',
 'the_preceding_w_1-hasNumbers',
 'the_preceding_w_1-consist_only_of_digits',
 'the_preceding_w_1-do_consist_hyphen',
 'the_preceding_w_1-if_it_is_and',
 'the_preceding_w_1-if_start_with_capital_chars',
 'the_preceding_w_1-percentage_of_upper_chars_in_token',
 'the_preceding_w_1-check_if_english_word',
 'the_preceding_w_1-len_of_token',
 'the_preceding_w_1-is_all_character_captilized',
 'the_preceding_w_1-is_all_character_lowercase',
 'the_preceding_w_1-is_first_character_digit',
 'the_pre

# Out-of-Fold Validation

In [8]:
from sklearn.model_selection import GroupKFold
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from utils import customized_eval


class KFoldValidation():
    """Grouped K-fold out-of-fold validation around a LightGBM booster.

    Rows sharing the same value of ``group_by`` always end up in the same
    fold, so a group is never split between training and validation.

    Attributes:
        fold_ids: list holding one ``[train_rows, valid_rows]`` pair per fold;
            both entries are 1-D arrays of positional row indices into the
            frame that was passed to ``__init__``.
    """

    def __init__(self, data, group_by = 'item_id',n_splits=5):
        """Pre-compute the row indices of every fold.

        Args:
            data: pd.DataFrame containing the ``group_by`` column.
            group_by: name of the column whose values define the groups.
            n_splits: number of folds handed to ``GroupKFold``.
        """
        # Cast once and reuse: the keys are compared as strings everywhere below.
        group_keys = data[group_by].astype(str)
        unique_vis = np.array(sorted(group_keys.unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0]) # positional index of every row of data

        self.fold_ids = [] # [train_rows, valid_rows] for each fold
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            # trn_vis / val_vis index into unique_vis (the group keys), NOT
            # into the rows of data; isin() maps the keys back to rows.
            self.fold_ids.append([
                    ids[group_keys.isin(unique_vis[trn_vis]).values],
                    ids[group_keys.isin(unique_vis[val_vis]).values]
                ])


    def validate(self, train, test, features, target_col, use_which_model ='lgb',
                 name="", prepare_stacking=False,
                 fit_params=None, num_boost_round=10):
        '''
        Train one model per fold and return the fold-averaged validation score.

        Args:
            train: pd.DataFrame holding ``features``, ``target_col`` and an
                'item_id' column; must have the same row order as the frame
                used to build the folds, because rows are selected by position.
            test: pd.DataFrame holding ``features``. Only needed if
                prepare_stacking is True.
            features: list of column names used as model inputs.
            target_col: str, name of the label column.
            use_which_model: only 'lgb' is implemented.
            name: column that receives the predictions. Only needed if
                prepare_stacking is True.
            prepare_stacking: if True, write out-of-fold predictions into
                ``train[name]`` and fold-averaged predictions into
                ``test[name]`` (both frames are modified in place).
            fit_params: dict passed as ``params`` to ``lgb.train``. Defaults to
                {"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}.
            num_boost_round: number of boosting rounds passed to ``lgb.train``.

        Returns:
            Mean over all folds of the best 'f1-score on sentence-level'
            reported for the validation set.

        Raises:
            ValueError: if ``use_which_model`` is not 'lgb', or if
                prepare_stacking is True while ``test`` is None.
        '''
        if use_which_model != 'lgb':
            # Previously an unknown model name fell through to an unbound `model`.
            raise ValueError("Unsupported model %r: only 'lgb' is implemented." % (use_which_model,))
        if prepare_stacking and test is None:
            raise ValueError("A test frame is required when prepare_stacking is True.")

        # Fresh dict per call: avoids a shared mutable default argument and
        # keeps the caller's dict untouched.
        if fit_params is None:
            fit_params = {"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}
        else:
            fit_params = dict(fit_params)

        col_need_for_computing_f1 = ['item_id']
        n_folds = len(self.fold_ids) # == n_splits

        #----------------------
        # initialization
        #----------------------
        full_score = 0 # final measure of performance

        if prepare_stacking:
            test[name] = 0.0
            # Out-of-fold buffer; rows never used for validation stay NaN.
            oof_predictions = np.full(len(train), np.nan)

        for fold_id, (trn_idx, val_idx) in enumerate(self.fold_ids):
            #---------------------
            # train-val split
            #---------------------
            devel = train[features].iloc[trn_idx]
            y_devel = train[target_col].iloc[trn_idx]
            valid = train[features].iloc[val_idx]
            y_valid = train[target_col].iloc[val_idx]

            # 1-D array with shape (num_samples,), handed to the custom metric
            item_id_for_f1 = train[col_need_for_computing_f1].iloc[val_idx].values.reshape(len(val_idx))
            print("Fold ", fold_id, ":")
            #--------------------
            # convert pd.DataFrame into lgb.Dataset and train
            #--------------------
            dtrain = lgb.Dataset(devel, label= y_devel, free_raw_data = False)
            dvalid = lgb.Dataset(valid, label= y_valid, free_raw_data = False, reference= dtrain)

            model = lgb.train(params = fit_params,
                              train_set = dtrain,
                              num_boost_round = num_boost_round,
                              valid_sets = dvalid,
                              feval = customized_eval(data = item_id_for_f1,threshold = 0.5, verbose = True),
                             )

            #----------------------
            # compute the score of each fold
            #----------------------
            fold_score = model.best_score['valid_0']['f1-score on sentence-level']
            print("Fold ", fold_id, " f1-score : ", fold_score)

            #----------------------
            # compute final measure of performance (average)
            #----------------------
            full_score += fold_score / n_folds

            if prepare_stacking:
                # Index with val_idx (the original used the undefined name `val`).
                oof_predictions[val_idx] = model.predict(valid)

                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / n_folds

        if prepare_stacking:
            # One whole-column assignment instead of chained .iloc writes.
            train[name] = oof_predictions

        print("Final score: ", full_score)
        return full_score

In [9]:
# Build 5 folds grouped by item_id, so all rows of one item stay in the same fold.
Kfolder = KFoldValidation(df, group_by = 'item_id', n_splits=5)


In [10]:
# Sanity check: one [train_rows, valid_rows] pair per requested split.
len(Kfolder.fold_ids)

5

In [11]:
target_col = 'label'
# Identifier / bookkeeping columns that must never be fed to the model.
# Excluding them by name (rather than slicing off the first 7 columns by
# position) keeps the feature list right even if the column order changes.
NON_FEATURE_COLS = ['item_name', 'tokens', 'label', 'is_valid',
                    'clean_tokens', 'item_id', 'word_id']
features = [col for col in df.columns.tolist() if col not in NON_FEATURE_COLS]
# parameters
CPU_USE_RATE = 0.5
NUM_LEAVES = 31
COLSAMPLE_BYTREE = 1.0
SUBSAMPLE = 1.0
SUBSAMPLE_FREQ = 0
MAX_DEPTH = -1
REG_LAMBDA = 0.0  # was assigned twice with the same value; one assignment kept
REG_ALPHA = 0.0
MIN_SPLIT_GAIN = 0
MIN_CHILD_WEIGHT = 0.001
MAX_BIN = 255

#features

In [12]:
import sys
sys.path.append('../models/blend/')

In [None]:
import time
#from utils import custom_system_f1
# class OrderedCounter(Counter, OrderedDict):
#     'Counter that remembers the order elements are first encountered'

#     def __repr__(self):
#         return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))

#     def __reduce__(self):
#         return self.__class__, (OrderedDict(self),)
    
# def custom_system_f1(y_pred, y, threshold = 0.5, verbose = False):
#     '''
#     It's a customized evaluation metric for computing f1-score on sentence-level.
    
#     Args:
#     If binary classification:
#         y_pred: 1-D array, with shape of (num_sample, ), where each element is the probability of belonging to class 1.
#         y: same shape as y_pred.
        
#     If multi-class classification:
#         y_pred: 2-D array, with shape of (num_sample, num_class), where each column is the probability of belonging to that class.
    
#     Return:
#         f1-score on sentence-level instead of token-level.
#     ''' 
#     # get y_true
#     s = time.time()
#     y_true = y.get_label().astype("int")
#     # get y_pred
#     y_pred = np.array([1 if p > threshold else 0 for p in y_pred])
#     # get helper dict
#     id_lengh_dict = OrderedCounter(list(val_for_f1)) # need Counter is ordered key
#     #---------------------
#     # initialization
#     #---------------------
#     ix = 0
#     correct_preds, total_correct, total_preds = 0., 0., 0.

#     for item_id, item_length in id_lengh_dict.items():
#         y_t_sentence = list(y_true[ix: ix + item_length])
#         y_p_sentence = list(y_pred[ix: ix + item_length])
#         #----------
#         # core
#         #----------
#         if all(v == 0 for v in y_t_sentence):
#             pass
#         else:
#             # there is exiting atual y_true
#             total_correct += 1.0
#             if np.array_equal(y_t_sentence, y_p_sentence):
#                 # givne the case that we have actual y_ture and y_ture == y_pred
#                 correct_preds += 1.0
#         if all(v == 0 for v in y_p_sentence):
#             pass
#         else:
#             total_preds += 1.0
#         ix += item_length
#     #----------
#     # output
#     #----------
#     p   = correct_preds / total_preds if correct_preds > 0 else 0
#     r   = correct_preds / total_correct if correct_preds > 0 else 0
#     f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
#     if verbose == True:
#         print('f1: {}'.format(f1))
#         print('precision: {}'.format(p))
#         print('recall: {}'.format(r))
#     return 'f1-score on sentence-level', f1, True

# No hold-out test frame and no stacking columns for this run.
test = None
prepare_stacking = False

# Booster parameters; the best values will be determined by bayesian optimization.
# Remember to set 'metric' to 'None' so that the custom evaluation metric
# is the one used for early stopping.
fit_params = dict(
    early_stopping_rounds=50,
    verbose=100,
    objective='binary',
    num_leaves=NUM_LEAVES,
    metric='None',
)

Kfolder.validate(
    train=df,
    test=test,
    features=features,
    target_col=target_col,
    use_which_model='lgb',
    name="lgbpred",
    prepare_stacking=prepare_stacking,
    fit_params=fit_params,
)




Fold  0 :




f1: 0
precision: 0
recall: 0
[1]	valid_0's f1-score on sentence-level: 0
Training until validation scores don't improve for 50 rounds.
f1: 0
precision: 0
recall: 0
[2]	valid_0's f1-score on sentence-level: 0
f1: 0
precision: 0
recall: 0
[3]	valid_0's f1-score on sentence-level: 0
f1: 0
precision: 0
recall: 0
[4]	valid_0's f1-score on sentence-level: 0
f1: 0.0022837567799029405
precision: 1.0
recall: 0.0011431837667905116
[5]	valid_0's f1-score on sentence-level: 0.00228376
f1: 0.5335522265544839
precision: 0.9461426491994177
recall: 0.37153472420691624
[6]	valid_0's f1-score on sentence-level: 0.533552
f1: 0.7407915024841528
precision: 0.9247219846022241
recall: 0.6178908259502716
[7]	valid_0's f1-score on sentence-level: 0.740792
f1: 0.8743121922965538
precision: 0.8861168183152334
recall: 0.8628179479851386
[8]	valid_0's f1-score on sentence-level: 0.874312
f1: 0.8952754801626245
precision: 0.8786461199779857
recall: 0.9125464418405259
[9]	valid_0's f1-score on sentence-level: 0.8952

In [None]:
# params = {
#     'objective': 'multiclass',
#     'num_class':3,
#     #'metric': 'multi_logloss',
#     'metric': 'None' # Remember to set metric to 'None' so that the custom evaluation metric is used for early stopping.
    
# }

# evals_result = {} # This dictionary used to store all evaluation results of all the items in valid_sets
# num_round = 50000
# lgbmodel = lgb.train(params, 
#                       dtrain, 
#                       num_round, 
#                       early_stopping_rounds = 200,
#                       valid_sets = dvalid, 
#                       evals_result = evals_result,
#                       feval = custom_system_f1)


# helper function

In [None]:
# LGBMClassifier cannot be used 

In [None]:
# import multiprocessing
# from lightgbm import LGBMClassifier

# CPU_USE_RATE = 0.5
# Only live statement in this otherwise commented-out cell; it re-assigns the
# same value NUM_LEAVES already received in the parameters cell.
NUM_LEAVES = 31
# COLSAMPLE_BYTREE = 1.0
# SUBSAMPLE = 1.0
# SUBSAMPLE_FREQ = 0
# MAX_DEPTH = -1
# REG_LAMBDA = 0.0
# REG_LAMBDA = 0.0
# REG_ALPHA = 0.0
# MIN_SPLIT_GAIN = 0
# MIN_CHILD_WEIGHT = 0.001
# MAX_BIN = 255

# fit_params = {
#               "num_boost_round":15000
#               "early_stopping_rounds": 50,
#               "verbose": 100, 
#               "eval_metric": "multi_logloss",
#               }
# train_parameters = {
#     "num_leaves":NUM_LEAVES
# }
# lgbmodel = LGBMClassifier(
#     n_jobs=int(multiprocessing.cpu_count()*CPU_USE_RATE),
#     n_estimators=10000,
#     learning_rate=0.02,
#     num_leaves=NUM_LEAVES,
#     colsample_bytree=COLSAMPLE_BYTREE,
#     subsample=SUBSAMPLE,
#     subsample_freq=SUBSAMPLE_FREQ,
#     max_depth=MAX_DEPTH,
#     reg_alpha=REG_ALPHA,
#     reg_lambda=REG_LAMBDA,
#     min_split_gain=MIN_SPLIT_GAIN,
#     min_child_weight=MIN_CHILD_WEIGHT,
#     max_bin=MAX_BIN,
#     )


In [None]:
# lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=31, min_child_samples=100,
#                       learning_rate=0.03, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
#                       bagging_seed=2019, subsample=.9, colsample_bytree=.9, use_best_model=True)


# Kfolder.validate(train, test, real_cols + cat_cols, lgbmodel, "lgbpred", prepare_stacking=True)

