In [1]:
import pandas as pd
import lightgbm as lgb
import category_encoders as ce
import functools
from sklearn.cross_validation import KFold
import xgboost as xgb
import catboost
from sklearn.preprocessing import LabelEncoder
import datetime



In [2]:
train_df = pd.read_csv(
    filepath_or_buffer='/home/data/avito-demand-prediction/train.csv', 
    usecols=['item_id', 'user_id', 'region', 'city', 'parent_category_name', 'category_name', 'param_1',
            'param_2', 'param_3', 'price', 'item_seq_number', 'user_type', 'image_top_1', 'deal_probability'],
    nrows=10000,
    index_col='item_id'
)
train_df.fillna({
        'param_1': '<UNKNOWN>',
        'param_2': '<UNKNOWN>',
        'param_3': '<UNKNOWN>'
    }, inplace=True)

In [3]:
train_df.shape

(10000, 13)

In [4]:
train_df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,image_top_1,deal_probability
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,<UNKNOWN>,<UNKNOWN>,400.0,2,Private,1008.0,0.12789
2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,<UNKNOWN>,<UNKNOWN>,3000.0,19,Private,692.0,0.0
ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",<UNKNOWN>,<UNKNOWN>,4000.0,9,Private,3032.0,0.43177
02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,<UNKNOWN>,<UNKNOWN>,2200.0,286,Company,796.0,0.80323
7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,40000.0,3,Private,2264.0,0.20797


In [5]:
def encode_categorical(df, columns_to_encode):
    
    encoders = {}
    
    for c in columns_to_encode:
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])
        encoders[c] = le
        
    return df, encoders

In [6]:
def get_ids_by_folds():
    folds = [
        (list(train_df.iloc[train].index), list(train_df.iloc[valid].index)) 
        for (train, valid) in KFold(train_df.shape[0], n_folds=5, shuffle=True)
    ]
    
    return folds

In [7]:
FOLDS = get_ids_by_folds()

In [8]:
CATEGORICAL_FEATURES = ['user_id', 'region', 'city', 'parent_category_name', 'category_name', 'param_1', 
                        'param_2', 'param_3', 'user_type', 'image_top_1']

In [9]:
df, LABEL_ENCODERS = encode_categorical(train_df, columns_to_encode=CATEGORICAL_FEATURES)

In [10]:
df.head()

Unnamed: 0_level_0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,image_top_1,deal_probability
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
b912c3c6a6ad,8471,19,147,4,41,167,4,87,400.0,2,1,654,0.12789
2dac0150717d,2185,17,429,2,22,81,4,87,3000.0,19,1,454,0.0
ba83aefab5dc,5440,16,420,0,2,49,4,87,4000.0,9,1,1867,0.43177
02996f1dd2ea,7178,21,314,4,41,31,4,87,2200.0,286,0,527,0.80323
7c90be56d2ab,9068,4,97,6,0,189,48,17,40000.0,3,1,1423,0.20797


In [11]:
TARGET_ENCODING_GROUPPING = [
    ["region"],
    ["region", "user_type"],
    ["image_top_1"],
    ["parent_category_name"],
]

COLS_TO_USE_FOR_TE = list(
    functools.reduce(
        lambda x1, x2: set(x1) | set(x2), 
        TARGET_ENCODING_GROUPPING
    )
)

TRAIN_FEATURES = [
    'user_id', 'region', 'city', 'parent_category_name', 'category_name', 'param_1',
    'param_2', 'param_3', 'price', 'item_seq_number', 'user_type', 'image_top_1'
]

In [12]:
def internal_validation_step(df, y, train_indices, n_fold=3):
    
    internal_folds = KFold(len(train_indices), n_folds=n_fold, random_state=42)
    
    target_encoded = []
    
    for internal_fold_train, internal_fold_valid in internal_folds:
        
        df_tr = df[COLS_TO_USE_FOR_TE]\
            .loc[[train_indices[i] for i in internal_fold_train]]\
            .copy(deep=True)
        df_te = df[COLS_TO_USE_FOR_TE]\
            .loc[[train_indices[i] for i in internal_fold_valid]]\
            .copy(deep=True)
        
        for t in TARGET_ENCODING_GROUPPING:
            
            te = ce.TargetEncoder(cols=t, smoothing=100)\
                .fit(df_tr, y[[train_indices[i] for i in internal_fold_train]])
            df_te = te.transform(df_te)

        target_encoded.append(df_te)
        
    target_features = pd.concat(target_encoded)
        
    return target_features

In [13]:
def lightgbm_dataset_getter(X, y):
    return lgb.Dataset(X, y, free_raw_data=False)

def xgboost_dataset_getter(X, y):
    return X, y

def catboost_dataset_getter(X, y, categorical_features):
    return catboost.Pool(X, y, cat_features=categorical_features)


In [14]:
def fit_boosting(
    df, y, train_indices, val_indices, fold_number, boosting_mode, boosting_params=None, 
    use_target_encoded_features=True
):
    train_dataset = df[TRAIN_FEATURES].loc[train_indices]
    valid_dataset = df[TRAIN_FEATURES].loc[val_indices]
    
    if use_target_encoded_features:

        target_encoded_features = internal_validation_step(df, y, train_indices)
        train_dataset = train_dataset.join(target_encoded_features.loc[train_indices], rsuffix='_te')
        
        cols_to_use_for_target_encoding = list(
            functools.reduce(
                lambda x1, x2: set(x1) | set(x2), 
                TARGET_ENCODING_GROUPPING
            )
        )

        df_te = df[COLS_TO_USE_FOR_TE]\
            .loc[val_indices]\
            .copy(deep=True)

        for t in TARGET_ENCODING_GROUPPING:
            te = ce.TargetEncoder(cols=t, smoothing=100).\
                fit(
                    df[COLS_TO_USE_FOR_TE].loc[train_indices], 
                    y[train_indices]
                )
            df_te = te.transform(df_te)

        valid_dataset = valid_dataset.join(df_te, rsuffix='_te')

    cols = sorted(list(train_dataset.columns))
    
    assert sorted(list(train_dataset.columns)) == sorted(list(valid_dataset.columns))
    
    if boosting_mode == 'lightgbm':
        
        train_dataset = lightgbm_dataset_getter(train_dataset[cols], y.loc[train_indices])
        valid_dataset = lightgbm_dataset_getter(valid_dataset[cols], y.loc[val_indices])
        
        gbm = lgb.train(
            boosting_params,
            train_dataset,
            num_boost_round=10000,
            early_stopping_rounds=200,
            valid_sets=(train_dataset, valid_dataset),
            valid_names=('train', 'valid'),
            feature_name=cols,
            categorical_feature=CATEGORICAL_FEATURES,
            verbose_eval=100
        )
        
    elif boosting_mode == 'catboost':
        
        cat_features = [cols.index(i) for i in CATEGORICAL_FEATURES]
        
        train_dataset = catboost_dataset_getter(train_dataset[cols], y.loc[train_indices], cat_features)
        valid_dataset = catboost_dataset_getter(valid_dataset[cols], y.loc[val_indices], cat_features)
        
        gbm = catboost.CatBoostClassifier(**boosting_params)
        
        gbm.fit(train_dataset, eval_set=valid_dataset, plot=True)
        
    elif boosting_mode == 'xgboost':
        
        train_dataset = xgboost_dataset_getter(train_dataset[cols], y.loc[train_indices])
        valid_dataset = xgboost_dataset_getter(valid_dataset[cols], y.loc[val_indices])
        
        gbm = xgb.XGBRegressor(**boosting_params)
    
        gbm.fit(
            train_dataset[0], train_dataset[1], early_stopping_rounds=100, eval_metric='rmse',
            eval_set=[valid_dataset]
        )
    else:
        raise ValueError('Not supported {} boosting implementation!'.format(boosting_mode)) 
    
    return gbm, cols

In [15]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'xentropy',
    'metric': ['rmse', 'xentropy'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'cat_smooth': 150,
    'num_threads': 8
}
    
catboost_params = {
    'iterations': 10000,
    'learning_rate': 0.05,
    'loss_function': 'CrossEntropy',
    'custom_metric': ['RMSE'],
    'eval_metric': 'RMSE',
    'subsample': 0.6,
    'logging_level': 'Verbose',
    'metric_period': 100,
    'depth': 5,
    'rsm': 0.6,
    'thread_count': 8,
    'od_type': 'Iter',
    'od_wait': 200,
    'use_best_model': True,
    'bootstrap_type': 'Bernoulli'
}
    
xgb_params = {
    'learning_rate': 0.2,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'objective': 'reg:logistic',
    'n_estimators': 10000,
    'nthread': 8,
    'silent': False
}

In [17]:
clfs = []
for ind, (train_inds, val_inds) in enumerate(FOLDS):
    print('Fold {} started at {}'.format(ind, datetime.datetime.now()))
    clf, _ = fit_boosting(
        df=df, y=df.deal_probability, train_indices=train_inds, val_indices=val_inds, fold_number=ind, 
        boosting_mode='lightgbm', boosting_params=params, use_target_encoded_features=True
    )
    clfs.append(clf)
    print('Fold {} finished at {}\n\n\n'.format(ind, datetime.datetime.now()))

Fold 0 started at 2018-07-13 23:41:02.861412




Training until validation scores don't improve for 200 rounds.
[100]	train's xentropy: 0.332427	train's rmse: 0.221003	valid's xentropy: 0.363779	valid's rmse: 0.234333
[200]	train's xentropy: 0.308982	train's rmse: 0.204225	valid's xentropy: 0.367326	valid's rmse: 0.23739
Early stopping, best iteration is:
[44]	train's xentropy: 0.350217	train's rmse: 0.231359	valid's xentropy: 0.36309	valid's rmse: 0.23316
Fold 0 finished at 2018-07-13 23:41:33.257564



Fold 1 started at 2018-07-13 23:41:33.257712
Training until validation scores don't improve for 200 rounds.
[100]	train's xentropy: 0.330938	train's rmse: 0.218465	valid's xentropy: 0.380755	valid's rmse: 0.250356
[200]	train's xentropy: 0.30841	train's rmse: 0.2026	valid's xentropy: 0.38402	valid's rmse: 0.251425
Early stopping, best iteration is:
[84]	train's xentropy: 0.335158	train's rmse: 0.221091	valid's xentropy: 0.379822	valid's rmse: 0.250203
Fold 1 finished at 2018-07-13 23:42:03.482667



Fold 2 started at 2018-07-13 23:42