In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from preprocess import pre_process
import lightgbm as lgbm

In [2]:
# Load the train/test frames from the project-local `preprocess` module.
train_df, test_df = pre_process()
# NOTE: despite the variable name, this is a CountVectorizer (raw term counts),
# not a TF-IDF weighting. English stop words are dropped and the vocabulary is
# limited to 200 terms via max_features.
tfidf = CountVectorizer(stop_words='english', max_features=200)
# Learn the vocabulary on the training "features" text only, then reuse it for
# the test set so both sparse matrices share the same columns.
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

# Show the first three rows of the training frame as the cell output.
train_df.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,num_rot30_X,num_rot30_Y,num_rot45_X,num_rot45_Y,num_rot60_X,num_rot60_Y,manager_level_low,manager_level_medium,manager_level_high,manager_skill
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,,40.7145,7211212,-73.9425,...,-1.711459,-84.393333,-23.495744,-81.074742,-43.678833,-72.231041,1.0,0.0,0.0,0.0
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,Doorman Elevator Fitness_Center Cats_Allowed D...,40.7947,7150865,-73.9667,...,-1.654103,-84.454391,-23.456146,-81.148564,-43.659691,-72.312597,,,,
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,Laundry_In_Building Dishwasher Hardwood_Floors...,40.7388,6887163,-74.0018,...,-1.720064,-84.456839,-23.520493,-81.133856,-43.718039,-72.281736,,,,


In [15]:
# Integer class id for each interest level; the resulting array is the training label.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}
train_y = np.array(train_df['interest_level'].apply(target_num_map.__getitem__))

# Put the selected dense columns next to the bag-of-words counts and keep the
# result in CSR form. A generator expression is used so the loop variables do
# not end up in the notebook namespace.
train_X, test_X = (
    sparse.hstack([frame[features_to_use], counts]).tocsr()
    for frame, counts in ((train_df, tr_sparse), (test_df, te_sparse))
)

print(train_X.shape, test_X.shape)

(49352, 248) (74659, 248)


In [17]:
# The following dictionary contains most of the relevant hyperparameters for our task.
# They have not been tuned yet, so they are mostly defaults. The keys use the
# sklearn-wrapper spellings (subsample, colsample_bytree, reg_alpha, ...).
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
    'num_leaves': 2**4, 'learning_rate': 0.05, 'max_depth': -1,
    'max_bin': 255, 'subsample_for_bin': 50000,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.6,
    'reg_alpha': 1, 'reg_lambda': 0, 'min_split_gain': 0.5, 'min_child_weight': 1,
    'min_child_samples': 10, 'scale_pos_weight': 1}

# They can be used directly to build an LGBMClassifier (the sklearn-style wrapper).
# NOTE(review): `t4` is not referenced again in the cells visible here.
t4 = lgbm.sklearn.LGBMClassifier(n_estimators=1000, seed=0, **t4_params)

# The native API is given the same settings plus the number of classes.
lgbm_params = t4_params.copy()
lgbm_params['num_class'] = 3
dset = lgbm.Dataset(train_X, train_y, silent=True)
# 5-fold, shuffled, non-stratified CV; stops once the mean multi-logloss has not
# improved for 100 rounds and logs progress every 100 rounds.
cv_results = lgbm.cv(lgbm_params, dset, num_boost_round=5000, nfold=5, stratified=False,
                     shuffle=True, metrics='multi_logloss', early_stopping_rounds=100,
                     verbose_eval=100, show_stdv=True, seed=0)
# Single-argument print() calls behave the same on Python 2 and Python 3,
# unlike the print statement, which is a SyntaxError on Python 3.
# The length of the returned history is reported as the best round count and
# its last entry as the best CV score.
print('best n_estimators: {}'.format(len(cv_results['multi_logloss-mean'])))
print('best cv score: {}'.format(cv_results['multi_logloss-mean'][-1]))

[100]	cv_agg's multi_logloss: 0.560214 + 0.00269646
[200]	cv_agg's multi_logloss: 0.533729 + 0.00382996
[300]	cv_agg's multi_logloss: 0.524797 + 0.00433495
[400]	cv_agg's multi_logloss: 0.520532 + 0.0046693
[500]	cv_agg's multi_logloss: 0.517821 + 0.00472629
[600]	cv_agg's multi_logloss: 0.51615 + 0.00485603
[700]	cv_agg's multi_logloss: 0.515063 + 0.00516048
[800]	cv_agg's multi_logloss: 0.514285 + 0.0052294
[900]	cv_agg's multi_logloss: 0.513791 + 0.005482
[1000]	cv_agg's multi_logloss: 0.513404 + 0.00551978
[1100]	cv_agg's multi_logloss: 0.513186 + 0.00553491
[1200]	cv_agg's multi_logloss: 0.51312 + 0.00553728
best n_estimators: 1165
best cv score: 0.513070099284


In [None]:
from bayes_opt import BayesianOptimization
def lightGBM_evaluate(num_leaves, max_depth, feature_fraction, min_data_in_leaf, lambda_l1, 
                      lambda_l2):
    """Objective for the Bayesian optimiser: negated 5-fold CV multi-logloss.

    The optimiser proposes real-valued candidates, so integer-valued settings
    are truncated with ``int`` and the rest are clamped to their valid side.

    Parameters
    ----------
    num_leaves, max_depth, min_data_in_leaf : float
        Truncated to ``int`` before being handed to LightGBM.
    feature_fraction : float
        Clamped into ``[0, 1]``.
    lambda_l1, lambda_l2 : float
        Clamped to be non-negative.

    Returns
    -------
    float
        Minus the mean multi-logloss of the last boosting round (500 rounds,
        no early stopping), so that maximising this value minimises the loss.

    Notes
    -----
    Reads the notebook-level ``params``, ``train_X`` and ``train_y``.
    """
    # Work on a copy so one trial's settings never leak into the shared base dict.
    trial_params = dict(params)
    trial_params.update({
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'min_data_in_leaf': int(min_data_in_leaf),
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
    })

    # Use the Dataset built here rather than a `dset` global left behind by
    # another cell. A fresh Dataset per trial also matters because
    # min_data_in_leaf changes between trials (see LightGBM's
    # feature_pre_filter documentation on reconstructing the dataset).
    d_train = lgbm.Dataset(train_X, train_y, silent=True)
    cv_result = lgbm.cv(trial_params, d_train, num_boost_round=500, nfold=5, stratified=False,
                        shuffle=True, metrics='multi_logloss')
    # Look the series up by name: taking "the first value" of the result dict
    # depends on dict ordering (it could be the stdv series) and is not
    # possible on Python 3, where dict views cannot be indexed.
    return -cv_result['multi_logloss-mean'][-1]

# Search budget: `init_points` initial probes followed by `num_iter` optimiser
# iterations. `num_rounds` is not referenced by any cell visible here.
init_points = 5
num_iter = 25
num_rounds = 1000

# Base LightGBM settings read by the objective function on every trial.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    nthread=-1,
    silent=True,
    learning_rate=0.05,
)

# (lower, upper) search range per hyperparameter; the keys match the argument
# names of lightGBM_evaluate.
lightgbmBO = BayesianOptimization(
    lightGBM_evaluate,
    dict(
        num_leaves=(20, 50),
        max_depth=(3, 9),
        feature_fraction=(0.3, 0.8),
        min_data_in_leaf=(10, 50),
        lambda_l1=(0, 10),
        lambda_l2=(0, 10),
    ),
)

lightgbmBO.maximize(init_points=init_points, n_iter=num_iter)

In [18]:
# Tuned hyperparameters, written under LightGBM's main parameter names.
lgbm_params['num_leaves'] = 47
lgbm_params['max_depth'] = 9
lgbm_params['feature_fraction'] = 0.61
lgbm_params['min_data_in_leaf'] = 26
lgbm_params['lambda_l1'] = 0.67
lgbm_params['lambda_l2'] = 9.87

# lgbm_params was copied from a dict that spells four of these settings with
# their alias names (colsample_bytree -> feature_fraction, min_child_samples ->
# min_data_in_leaf, reg_alpha -> lambda_l1, reg_lambda -> lambda_l2). Drop the
# aliases so the dict no longer holds two conflicting values per setting and
# the tuned ones are unambiguously in effect.
for stale_alias in ('colsample_bytree', 'min_child_samples', 'reg_alpha', 'reg_lambda'):
    lgbm_params.pop(stale_alias, None)

# Number of training runs whose predicted class probabilities are averaged.
n_runs = 20
sumPred = np.zeros((test_X.shape[0], 3))
for i in range(n_runs):
    # Single-argument print() calls behave the same on Python 2 and Python 3.
    print('Start '+str(i+1)+' times of training')
    # NOTE(review): train_lightGBM / predict_lightGBM are not defined in the
    # cells visible here; presumably each run uses a different random split,
    # since the logged scores vary between runs. Confirm against their definitions.
    clr = train_lightGBM(train_X, train_y, lgbm_params)
    sumPred += predict_lightGBM(clr, test_X)
    # Take the first entry of best_score without indexing a dict view, which
    # is not possible on Python 3.
    run_scores = next(iter(clr.best_score.values()))
    print('Iteration: '+str(clr.best_iteration)+'  Multi-logloss: '+str(run_scores['multi_logloss']))
preds = sumPred/n_runs

Start 1 times of training
Iteration: 573  Multi-logloss: 0.505156683943
Start 2 times of training
Iteration: 686  Multi-logloss: 0.509207734495
Start 3 times of training
Iteration: 671  Multi-logloss: 0.50536290487
Start 4 times of training
Iteration: 660  Multi-logloss: 0.498159428362
Start 5 times of training
Iteration: 599  Multi-logloss: 0.516607847713
Start 6 times of training
Iteration: 721  Multi-logloss: 0.499775018082
Start 7 times of training
Iteration: 676  Multi-logloss: 0.530587526307
Start 8 times of training
Iteration: 608  Multi-logloss: 0.504733151736
Start 9 times of training
Iteration: 729  Multi-logloss: 0.510426023964
Start 10 times of training
Iteration: 645  Multi-logloss: 0.512736003231
Start 11 times of training
Iteration: 643  Multi-logloss: 0.503403824329
Start 12 times of training
Iteration: 597  Multi-logloss: 0.506026917423
Start 13 times of training
Iteration: 786  Multi-logloss: 0.523246737762
Start 14 times of training
Iteration: 640  Multi-logloss: 0.4

In [19]:
# Build the submission frame: one probability column per class, named in
# class-id order (0, 1, 2), plus the listing id of each test row.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
# Write without the positional index so the file contains only the four columns.
out_df.to_csv("lightGBM_f.csv", index=False)