In [1]:
import numpy as np 
import pandas as pd
import os
import gc
import re

# feature libraries
from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing
from itertools import combinations
from sklearn.preprocessing import StandardScaler

# timer function
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    :param name: label shown in square brackets in the printed line.

    The elapsed time is printed when the block exits, whether it finished
    normally or raised; an exception raised inside the block still propagates
    to the caller after the line is printed.
    """
    # perf_counter is monotonic, so the reading cannot be skewed by system
    # clock adjustments that happen while the block runs.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print(f'[{name}] done in {time.perf_counter() - t0:.0f} s')

def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    :param y_true: array-like of observed values.
    :param y_pred: array-like of predicted values, broadcastable against y_true.
    :return: square root of the mean squared difference, as a float.

    Implemented with NumPy (imported by this notebook as ``np``) so the
    function does not depend on any name that the notebook never imports.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(np.square(y_pred - y_true))))
    
# Wall-clock timestamp taken when this cell runs; it is not read by any cell shown here.
start_time = time.time()


In [2]:
# Folder holding the raw train.csv / test.csv files read by the data-loading cell.
data_dir = '../input/kaggle_data'
# Folder holding the pre-computed train_*.csv / test_*.csv feature files.
feat_dir = '../input/features'

In [21]:
# Column registries filled while the feature files are loaded:
# num_cols -> numeric feature names, cat_cols -> categorical feature names.
num_cols = []
cat_cols = []

# One entry per feature file pair `train_<stem>.csv` / `test_<stem>.csv` in feat_dir:
# (file stem, timer label, categorical columns it contributes, numeric columns it contributes).
feature_sets = [
    ('le', "Loading Label Encoded Features:",
     ['region','city','parent_category_name','category_name','user_type','param_1','param_2','param_3'],
     []),
    ('numeric', "Loading Numeric Encoded Features:",
     ['price_missing','image_top_1_missing'],
     ['item_seq_number','image_top_1','price']),
    ('nima', "Loading NIMA Features:",
     [],
     ["mobile_mean", "mobile_std","inception_mean", "inception_std", "nasnet_mean", "nasnet_std"]),
    ('img', "Loading Image Features:",
     [],
     ['img_size_x','img_size_y','img_file_size','img_mean_color','thing1','thing2',
      'img_sobel00','img_sobel10','img_sobel20','img_sobel01','img_sobel11','img_sobel21',
      'img_kurtosis','img_skew','img_dullness_light_percent','img_dullness_dark_percent','img_blur',
      'img_blue_mean','img_green_mean','img_red_mean','img_blue_std','img_green_std','img_red_std',
      'img_average_red','img_average_green','img_average_blue']),
]

with timer("load data:"):
    # Only the key (and, for train, the target) is read from the raw files;
    # every model feature comes from the pre-computed files merged below.
    usecols = ['item_id']
    train = pd.read_csv(f'{data_dir}/train.csv', index_col="item_id", usecols=usecols+['deal_probability'])
    test = pd.read_csv(f'{data_dir}/test.csv', index_col="item_id", usecols=usecols)

    train_split = len(train)
    test_rows = len(test)
    y = train['deal_probability'].copy()
    train = train.drop("deal_probability", axis=1)

    gc.collect()

for stem, label, new_cat_cols, new_num_cols in feature_sets:
    with timer(label):
        train_part = pd.read_csv(f'{feat_dir}/train_{stem}.csv', index_col="item_id")
        test_part = pd.read_csv(f'{feat_dir}/test_{stem}.csv', index_col="item_id")

        # Left joins on the item_id index keep every row of the base frames.
        train = train.merge(train_part, how='left', left_index=True, right_index=True)
        test = test.merge(test_part, how='left', left_index=True, right_index=True)

        # A repeated item_id in a feature file would multiply rows here and shift
        # the features relative to `y`, so stop immediately if the row count moved.
        assert len(train) == train_split, f'train_{stem}.csv changed the number of train rows'
        assert len(test) == test_rows, f'test_{stem}.csv changed the number of test rows'

        cat_cols += new_cat_cols
        num_cols += new_num_cols

        del train_part, test_part
        gc.collect()

# Later cells work positionally, so the item_id index is replaced by 0..n-1.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

[load data:] done in 5 s
[Loading Label Encoded Features:] done in 3 s
[Loading Numeric Encoded Features:] done in 3 s
[Loading NIMA Features:] done in 4 s
[Loading Image Features:] done in 17 s


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on 60% of the rows of `train` (40% is split off and not
# used afterwards) purely to rank the columns by impurity-based importance.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.4, random_state=0
)

mdl = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0)
mdl.fit(X_train, y_train)

# One "<column> = <importance>" line per feature, in column order.
for name, importance in zip(X_train.columns, mdl.feature_importances_):
    print(name, "=", importance)

# The split frames are only needed for this ranking; release them.
del X_train, X_test, y_train, y_test
gc.collect()

# region = 0.05078480904906774 
# city = 0.0669028031982457 ###
# parent_category_name = 0.01892913897631363
# category_name = 0.018579901012128967
# param_1 = 0.049909490828195055
# param_2 = 0.03443197725200006
# param_3 = 0.013801396359618488
# user_type = 0.009652674217647102
# item_seq_number = 0.09532802580719614 ###
# image_top_1 = 0.11236131987282504 ###
# price_missing = 0.003581317998232572
# image_top_1_missing = 0.016743076249108438
# price = 0.07502286721506463 ###
# mobile_mean = 0.07310568363879322 ###
# mobile_std = 0.07459486368530469 ###
# inception_mean = 0.06958129445962614 ###
# inception_std = 0.0731361226167795 ###
# nasnet_mean = 0.06996479172642771 ###
# nasnet_std = 0.07358844583742533 ###

In [5]:
with timer("Create potential interactions:"):

    # Columns whose pairwise products and ratios are generated below.
    important_cols = ['image_top_1','price', 'city', 'item_seq_number', 'mobile_mean', 'mobile_std', 'inception_mean',
                      'inception_std', 'nasnet_mean', 'nasnet_std']

    def interaction_features(train, test, fea1, fea2, prefix):
        """Add the product and the ratio of two columns to both frames.

        :param train: train data frame (modified in place and returned)
        :param test: test data frame (modified in place and returned)
        :param fea1: name of the first column (numerator of the ratio)
        :param fea2: name of the second column (denominator of the ratio)
        :param prefix: value embedded in the new column names
                       ``inter_<prefix>*`` and ``inter_<prefix>/``
        :return: the same ``(train, test)`` pair
        """
        for frame in (train, test):
            frame['inter_{}*'.format(prefix)] = frame[fea1] * frame[fea2]
            # No guard against a zero denominator: the ratio column holds
            # whatever pandas division yields for those rows.
            frame['inter_{}/'.format(prefix)] = frame[fea1] / frame[fea2]

        return train, test

    # A `with` block does not open a new scope, so these loop targets are
    # module-level names. They must not be called `y`: that name holds the
    # regression target that the modelling cell passes to train_test_split.
    for pair_id, (fea1, fea2) in enumerate(combinations(important_cols, 2)):
        train, test = interaction_features(train, test, fea1, fea2, pair_id)

    # Names of every generated interaction column (not referenced by the later cells shown here).
    inter_cols = [col for col in list(train) if 'inter' in col]

[Create potential interactions:] done in 2 s


In [6]:
with timer("Subset numeric features:"):
    # Keep, in their existing column order, only the columns registered in num_cols.
    train_num = train.loc[:, train.columns.isin(num_cols)]
    test_num = test.loc[:, test.columns.isin(num_cols)]

[Subset numeric features:] done in 2 s


In [7]:
# Preview the first rows of the numeric feature subset.
train_num.head()

Unnamed: 0,item_seq_number,image_top_1,price,mobile_mean,mobile_std,inception_mean,inception_std,nasnet_mean,nasnet_std,img_size_x,...,img_blur,img_blue_mean,img_green_mean,img_red_mean,img_blue_std,img_green_std,img_red_std,img_average_red,img_average_green,img_average_blue
0,-0.136324,-0.152807,-0.005147,0.376462,0.19665,0.038229,0.360435,-0.071322,0.059919,-0.15126,...,-0.34549,-0.280614,-0.638004,0.687033,0.641418,0.851439,0.132975,-0.280614,-0.638004,0.687033
1,-0.133284,-0.472375,-0.005101,0.380049,0.214786,0.588905,0.167156,0.240389,-0.033727,-0.138074,...,0.275917,-0.608886,-0.394327,-0.266605,0.451465,0.240082,0.133307,-0.608886,-0.394327,-0.266605
2,-0.135073,1.894048,-0.005083,0.542563,0.0953,0.846048,0.241327,0.945804,0.401474,0.072903,...,-0.254716,1.487169,1.379419,1.1409,1.906005,1.904254,1.820077,1.487169,1.379419,1.1409
3,-0.085533,-0.367201,-0.005115,0.44085,0.415528,0.831658,0.253381,0.486166,0.321294,-0.138074,...,-0.372882,2.253272,2.131517,1.853883,0.683302,0.762878,0.720346,2.253272,2.131517,1.853883
4,-0.136146,1.117376,-0.004443,0.051788,0.310443,0.310625,0.161061,0.436753,0.278871,1.707973,...,-0.189964,0.175604,0.376266,0.050487,0.446751,0.359739,0.19153,0.175604,0.376266,0.050487


In [8]:
with timer("Create item description feature:"):
    # Build one composite key per row by joining the string form of these
    # columns with underscores, identically for train and test.
    desc_cat = ['category_name', 'param_1', 'param_2', 'param_3']
    first_col, other_cols = desc_cat[0], desc_cat[1:]
    for frame in (train, test):
        frame['item_desc_cat'] = frame[first_col].astype(str)
        for c in other_cols:
            frame['item_desc_cat'] += '_' + frame[c].astype(str)

[Create item description feature:] done in 4 s


In [9]:
with timer("Create user description feature:"):
    # Build one composite key per row by joining the string form of these
    # columns with underscores, identically for train and test.
    desc_cat = ['user_type', 'region', 'city']
    first_col, other_cols = desc_cat[0], desc_cat[1:]
    for frame in (train, test):
        frame['user_desc_cat'] = frame[first_col].astype(str)
        for c in other_cols:
            frame['user_desc_cat'] += '_' + frame[c].astype(str)

[Create user description feature:] done in 3 s


In [10]:
with timer("Create categorical count feature:"):
    # For every categorical column (plus the two composite keys) add a
    # "<column>_count" feature: how often the row's value occurs in train and
    # test combined. Values absent from the lookup map to 0.
    cat_count_features = []
    for c in cat_cols + ['item_desc_cat', 'user_desc_cat']:
        count_col = '%s_count' % c
        d = pd.concat([train[c], test[c]]).value_counts().to_dict()
        for frame in (train, test):
            frame[count_col] = frame[c].apply(lambda x: d.get(x, 0))
        cat_count_features.append(count_col)

[Create categorical count feature:] done in 9 s


In [11]:
# Preview the first rows of the categorical count features.
train[cat_count_features].head()

Unnamed: 0,region_count,city_count,parent_category_name_count,category_name_count,user_type_count,param_1_count,param_2_count,param_3_count,price_missing_count,image_top_1_missing_count,item_desc_cat_count,user_desc_cat_count
0,127883,85993,914200,135280,1433965,8634,887771,1168896,1895915,1856665,8634,61301
1,99914,57398,243733,87217,1433965,38156,887771,1168896,1895915,1856665,6298,39989
2,116667,68019,231290,36014,1433965,2876,887771,1168896,1895915,1856665,2876,40142
3,111319,21135,914200,135280,467198,6873,887771,1168896,1895915,1856665,6873,5609
4,64444,43920,109792,85101,1433965,83124,26463,2562,1895915,1856665,2562,29263


In [12]:
# Feature blocks that are stacked column-wise later on. Each list starts with
# the numeric subset (infinities and NaN replaced by 0) followed by the
# categorical count features; further blocks are appended by later cells.
train_list, test_list = (
    [numeric.replace([np.inf, -np.inf, np.nan], 0), full[cat_count_features]]
    for numeric, full in ((train_num, train), (test_num, test))
)

In [13]:
with timer("Create aggregated features:"):
    def proj_num_on_cat(train_df, test_df, target_column, group_column):
        """
        Describe every row by statistics of a numeric feature within its group.

        Train and test rows are pooled, ``target_column`` is summarised per
        distinct value of ``group_column`` (size, mean, std, median, max, min,
        in that column order) and each row receives the statistics of the group
        it belongs to. The input frames are not modified.

        :param train_df: train data frame
        :param test_df:  test data frame
        :param target_column: name of numerical feature
        :param group_column: name of categorical feature
        :return: ``(train_array, test_array)``; each has one row per input row,
                 in input order, and six columns
        """
        columns = [target_column, group_column]
        n_train = train_df.shape[0]

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        # Selecting the two columns first leaves the caller's frames untouched.
        all_df = pd.concat([train_df[columns], test_df[columns]], ignore_index=True)

        grouped = all_df.groupby(group_column)[target_column]
        the_stats = pd.concat(
            [grouped.size(), grouped.mean(),
             grouped.std().fillna(0),  # std is NaN for single-row groups
             grouped.median(), grouped.max(), grouped.min()],
            axis=1)
        the_stats.columns = ['%s_%s' % (target_column, stat)
                             for stat in ('size', 'mean', 'std', 'median', 'max', 'min')]
        the_stats.index.name = group_column
        the_stats = the_stats.reset_index()

        # Attach the group statistics to every row. The explicit position column
        # restores the input order after the merge, so the train/test split
        # below can be done by row count.
        keys = all_df[[group_column]].assign(row_id=np.arange(all_df.shape[0]))
        projected = pd.merge(keys, the_stats, how='left', on=group_column)
        projected = projected.sort_values('row_id').drop(['row_id', group_column], axis=1)

        features = np.array(projected)
        selected_train, selected_test = features[:n_train], features[n_train:]
        print(selected_train.shape, selected_test.shape)
        return selected_train, selected_test

    # Every target is projected on every grouping column except itself;
    # 'city' is used as a grouping column only.
    agg_targets = ['image_top_1','price', 'item_seq_number', 'mobile_mean', 'mobile_std', 'inception_mean',
                   'inception_std', 'nasnet_mean', 'nasnet_std']
    agg_groups = agg_targets + ['city']

    for t in agg_targets:
        for g in agg_groups:
            if t != g:
                s_train, s_test = proj_num_on_cat(train, test, target_column=t, group_column=g)
                train_list.append(s_train)
                test_list.append(s_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)
(1503424, 6) (508438, 6)


In [14]:
# Stack all feature blocks column-wise and convert to CSR. Doing both steps
# inside the generator avoids keeping the intermediate stacked matrices alive.
X, X_test = (hstack(blocks).tocsr() for blocks in (train_list, test_list))

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# model
import lightgbm as lgb

# Keep 10% of the stacked training matrix as a fixed hold-out (seed 23).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.10, random_state=23
)

# LightGBM dataset wrappers for the fitting part and the hold-out part.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid)

from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from functools import partial
import lightgbm as lgb

import warnings
# Silences every warning issued through the `warnings` module from here on,
# including deprecation notices. NOTE(review): consider narrowing the filter.
warnings.filterwarnings('ignore')

def input2params(max_depth=-1, num_leaves=31, min_data_in_leaf=20, lambda_l1=0, lambda_l2=0,
                 feature_fraction=1.0, bagging_fraction=1.0):
    """Parse Bayesian-optimisation input into an lgb friendly form.

    The optimiser proposes unconstrained floats; this function snaps them to
    values of the type and range that the LightGBM parameters take.

    :param max_depth: tree depth limit, rounded to an integer (-1 = no limit)
    :param num_leaves: snapped to the nearest power of two minus one, at least 1
    :param min_data_in_leaf: truncated to an integer
    :param lambda_l1: L1 regularisation, floored at 0
    :param lambda_l2: L2 regularisation, floored at 0
    :param feature_fraction: clipped to [0, 1]
    :param bagging_fraction: clipped to [0, 1]
    :return: dict of parameters to pass as ``params`` to ``lgb.train``
    """

    def np2m1(x):
        """nearest power of two minus one"""
        return max(int(round(2**np.round(np.log2(x)) - 1)), 1)

    return {
        'objective': "regression",
        'metric': 'rmse',
        'boosting_type': "gbdt",
        'learning_rate': 0.1,

        # Forward the depth limit instead of silently discarding the argument;
        # the default of -1 leaves the depth unlimited.
        'max_depth': int(round(max_depth)),
        'num_leaves': np2m1(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),

        'feature_fraction': np.clip(feature_fraction, 0.0, 1.0),
        'bagging_fraction': np.clip(bagging_fraction, 0.0, 1.0),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without this entry the tuned bagging_fraction has no effect.
        'bagging_freq': 1,

        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
    }

def lgb_cv(dtrain, dvalid, **kwargs):
    """Objective for the Bayesian optimiser: train once, score on the hold-out.

    :param dtrain: LightGBM dataset the booster is fitted on
    :param dvalid: LightGBM dataset used for early stopping and scoring
    :param kwargs: raw hyper-parameter proposals, converted by ``input2params``
    :return: the best hold-out RMSE multiplied by -1, so that a maximiser
             searching over this function minimises the RMSE
    """
    # NOTE(review): the same Dataset objects are reused for every proposal —
    # confirm the installed LightGBM accepts a changing min_data_in_leaf on an
    # already constructed Dataset.
    booster = lgb.train(
        params=input2params(**kwargs),
        train_set=dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=10000,
        early_stopping_rounds=25,
        verbose_eval=False,
    )

    # 'valid_1' is presumably the entry LightGBM records for dvalid, the second
    # item of valid_sets — verify against the installed version.
    holdout_rmse = booster.best_score['valid_1']['rmse']
    return np.mean(holdout_rmse) * -1

# Search ranges handed to the optimiser; each proposal is passed through
# input2params before it reaches LightGBM.
param_lims = {
    'num_leaves': (128, 512),
    'min_data_in_leaf': (15, 256),
    'feature_fraction': (0.1, 1.0),
    'bagging_fraction': (0.1, 1.0),
    'lambda_l1': (0.0, 1.0),
    'lambda_l2': (0.0, 1.0),
}

param_log = []

# Bind the fixed datasets so the optimiser only supplies the hyper-parameters.
lgb_opt = partial(lgb_cv, dtrain, dvalid)

# Seed the optimiser (same seed as the hold-out split) so the initial random
# points, and with them the whole search, are reproducible across runs.
BO = BayesianOptimization(lgb_opt, param_lims, random_state=23)
BO.maximize(init_points=5, n_iter=5)

# param_log.append({**BO.res['max']['max_params']})    
      
