In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split

import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

import gc

In [2]:
def automated_params_tuning(train_dataset, cat_features, max_evals, n_folds, sample_train, sample_test, boost_rounds, early_stop):
    """Search LightGBM hyperparameters with hyperopt's TPE algorithm.

    Each candidate is scored with ``lgb.cv`` using a custom "hit score"
    (share of predictions within 10% of the true price) and the candidate
    with the lowest ``1 - hit_score`` is returned.

    Parameters
    ----------
    train_dataset : pandas.DataFrame
        Training data; must contain the columns ``total_price`` (target) and
        ``building_id`` (dropped before training).
    cat_features : list
        Names of the categorical features, forwarded to ``lgb.cv``.
    max_evals : int
        Number of hyperopt evaluations to run.
    n_folds : int
        Number of folds used by ``lgb.cv``.
    sample_train : int
        Number of rows sampled from ``train_dataset`` before tuning.
    sample_test : int
        ``test_size`` given to ``train_test_split``; these rows are removed
        from the sample and are not used by the cross validation.
    boost_rounds : int
        Maximum number of boosting rounds per cross validation.
    early_stop : int
        Early-stopping rounds per cross validation.

    Returns
    -------
    dict
        Hyperparameters of the lowest-loss trial, including the
        ``n_estimators`` recorded for that trial.

    Side effects
    ------------
    * Overwrites ``bayes_test.csv`` in the working directory and appends one
      row per evaluation.
    * Increments the module-level ``ITERATION`` counter once per evaluation
      (it is started from 0 if the caller has not defined it).
    """
    # Data preprocessing: reproducible subsample, log1p(total_price) target,
    # identifier column removed.
    sampled = train_dataset.sample(n=sample_train, random_state=31)
    labels = np.log1p(sampled['total_price'].values).reshape((-1, ))
    features = sampled.drop(columns=['total_price', 'building_id'])

    # Hold out `sample_test` rows; only the remaining rows feed the CV, the
    # held-out part is intentionally discarded here.
    train_features, _, train_labels, _ = train_test_split(
        features, labels, test_size=sample_test, random_state=31)

    train_set = lgb.Dataset(train_features, label=train_labels)

    # Trial log: start a fresh file with the header row. newline='' is what
    # the csv module expects, otherwise some platforms emit blank rows.
    OUT_FILE = 'bayes_test.csv'
    headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
    with open(OUT_FILE, 'w', newline='') as of_connection:
        csv.writer(of_connection).writerow(headers)

    def hit_score(preds, train_data):
        """Custom eval: fraction of predictions within 10% of the true price."""
        # Labels and predictions live in log1p space; map both back to prices.
        trues = np.expm1(train_data.get_label())
        preds = np.expm1(preds)
        within_tolerance = (np.absolute(preds - trues) / trues) <= 0.1
        score = np.sum(within_tolerance) / train_data.num_data()
        # Third element tells LightGBM that higher is better.
        return 'Hit_score', score, True

    def objective(hyperparameters):
        """Objective minimised by hyperopt: 1 - cross-validated hit score."""
        # Keep track of evals in the module-level counter; fall back to 0 so
        # a missing definition does not abort the search.
        global ITERATION
        ITERATION = globals().get('ITERATION', 0) + 1

        # Early stopping decides the number of trees, so ignore any preset.
        hyperparameters.pop('n_estimators', None)

        # The search space nests subsample under boosting_type; lift both to
        # top-level keys.
        boosting = hyperparameters['boosting_type']
        hyperparameters['boosting_type'] = boosting['boosting_type']
        hyperparameters['subsample'] = boosting.get('subsample', 1.0)

        # quniform yields floats; these parameters are cast to int.
        for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
            hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

        start = timer()

        # Perform n_folds cross validation scored by the custom feval.
        cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = boost_rounds, nfold = n_folds,
                            early_stopping_rounds = early_stop, metrics = 'None', seed = 50,
                            feval=hit_score, stratified=False, categorical_feature=cat_features)

        run_time = timer() - start

        # Score of the last recorded round.
        # NOTE(review): presumably lgb.cv truncates the history at the best
        # round when early stopping triggers - confirm for the installed version.
        hit_history = cv_results['Hit_score-mean']
        best_score = hit_history[-1]

        # Loss must be minimized
        loss = 1 - best_score

        # Number of boosting rounds kept in the CV history.
        hyperparameters['n_estimators'] = len(hit_history)

        # Append this trial to the log; the context manager closes the file
        # even if writing fails.
        with open(OUT_FILE, 'a', newline='') as of_connection:
            csv.writer(of_connection).writerow(
                [loss, hyperparameters, ITERATION, run_time, best_score])

        # Dictionary with information for evaluation
        return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
                'train_time': run_time, 'status': STATUS_OK}

    # Define the search space
    space = {
        'boosting_type': hp.choice('boosting_type',
                                    [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},]
                                  ),
        'num_leaves': hp.quniform('num_leaves', 20, 70, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.03)),
        'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
        'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
        'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    }

    # Record results of every evaluation.
    trials = Trials()

    fmin(fn = objective, space = space, algo = tpe.suggest, trials = trials,
         max_evals = max_evals)

    # Lowest loss first; return that trial's hyperparameters.
    ranked_trials = sorted(trials.results, key = lambda x: x['loss'])

    return ranked_trials[0]['hyperparameters']

In [3]:
# Load the pre-engineered train / test feature tables.
train = pd.read_csv('FE_train_0630.csv')
test  = pd.read_csv('FE_test_0630.csv')

# Per-(city, town) mean / median of building area, land area and price,
# computed on the training data only.
# NOTE(review): the price aggregates use every training row, including rows
# that a later cell splits off for validation - confirm this leakage is acceptable.
target_df = train.groupby(['city', 'town']).agg({'building_area' : ['mean', 'median'], 'land_area' : ['mean', 'median'], 'total_price' : ['mean', 'median']}).reset_index()
# Flatten the two-level column index: ('land_area', 'mean') -> 'land_area_mean'.
target_df.columns = [i[0] + '_' + i[1]  if i[1] != '' else i[0] for i in target_df.columns.tolist()]

# log1p(price) per unit of land / building area, for each statistic.
for stat in ('median', 'mean'):
    log_price = np.log1p(target_df['total_price_' + stat])
    target_df['price_land_rate_' + stat] = log_price / target_df['land_area_' + stat]
    target_df['price_building_rate_' + stat] = log_price / target_df['building_area_' + stat]

# Attach the town-level rates to both tables.
combine_cols = ['city', 'town', 'price_land_rate_median', 'price_building_rate_median', 'price_land_rate_mean', 'price_building_rate_mean']
train = pd.merge(train, target_df[combine_cols], on =['city', 'town'], how='left')
test = pd.merge(test, target_df[combine_cols], on =['city', 'town'], how='left')

# For rows whose building_area equals 4, set parking_area to
# building_area / total_floor. Each frame is filtered with ITS OWN mask:
# indexing `test` with a mask built from `train` selects the wrong rows and
# writes NaN where the two masks disagree.
for frame in (train, test):
    area_is_4 = frame['building_area'] == 4
    frame.loc[area_is_4, 'parking_area'] = frame.loc[area_is_4, 'building_area'] / frame.loc[area_is_4, 'total_floor']

# 'index' columns whose values sum to 60000 are dropped; the name is checked
# first so the sum is only computed for candidate columns.
drop_cols = [i for i in train.columns if 'index' in i and np.sum(train[i]) == 60000]

# 'town' was only needed as a merge key.
for frame in (train, test):
    frame.drop(['town'], axis = 1, inplace = True)
    frame.drop(drop_cols, axis = 1, inplace = True)

# train.drop(train[(train['land_area'] > 1500) | (train['building_area'] >1000)].index, inplace= True)

gc.collect()

91

In [5]:
# Row / column counts of both tables after feature engineering.
(train.shape, test.shape)

((60000, 419), (10000, 419))

In [30]:
# Columns LightGBM should treat as categorical.
category = [
    'building_material',
    'building_use',
    'building_type',
    'city',
    'parking_way',
]

# Evaluation counter incremented by the tuning objective.
ITERATION = 0

# Run the hyperparameter search on a 20000-row sample with 5-fold CV.
params = automated_params_tuning(
    train_dataset=train,
    cat_features=category,
    max_evals=100,
    n_folds=5,
    sample_train=20000,
    sample_test=2000,
    boost_rounds=10000,
    early_stop=100,
)

100%|██████████| 100/100 [6:52:06<00:00, 205.66s/it, best loss: 0.5351111111111111] 


In [31]:
# Show the hyperparameters returned by automated_params_tuning.
params

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.8340034681089536,
 'learning_rate': 0.023294728076622,
 'min_child_samples': 30,
 'num_leaves': 54,
 'reg_alpha': 0.2380166380203602,
 'reg_lambda': 0.8778656208518232,
 'subsample_for_bin': 200000,
 'subsample': 0.8209608370002465,
 'n_estimators': 2606}

In [7]:
# Hand-picked baseline LightGBM parameters; built-in metrics are switched
# off ("None") because a custom feval is supplied at training time.
# Untuned alternatives kept for reference: 'max_bin': 512, 'min_data_in_leaf': 100.
params_default = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric="None",
    num_leaves=31,
    learning_rate=0.025,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=1,
    num_threads=-1,
)

In [23]:
# Parameters copied from the tuning result, plus the fixed training settings.
# The tuned tree count (2606) is replaced by 10000 boosting rounds.
params_auto = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric="None",
    colsample_bytree=0.8340034681089536,
    learning_rate=0.023294728076622,
    min_child_samples=30,
    num_leaves=54,
    reg_alpha=0.2380166380203602,
    reg_lambda=0.8778656208518232,
    subsample_for_bin=200000,
    subsample=0.8209608370002465,
    n_estimators=10000,
    verbose=1,
    num_threads=-1,
)

In [9]:
# Columns LightGBM should treat as categorical.
category = [
    'building_material',
    'building_use',
    'building_type',
    'city',
    'parking_way',
]

In [11]:
# Remove the identifier column; it is not used as a feature.
train.drop(columns=['building_id'], inplace=True)

# Take the target out of the frame and model it in log1p space.
y = np.log1p(train.pop('total_price')).values
X = train.values

# Hold out 10% of the rows for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=31
)

# LightGBM datasets; the validation set references the training set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

def hit_score(preds, train_data):
    """Custom LightGBM eval: share of predictions within 10% of the truth.

    Both ``preds`` and the labels of ``train_data`` are in log1p space and
    are mapped back with ``expm1`` before the relative error is computed.

    Returns a ``(name, value, is_higher_better)`` tuple.
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    relative_error = np.absolute(predicted - actual) / actual
    n_hits = np.sum(relative_error <= 0.1)
    return 'Hit_score', n_hits / train_data.num_data(), True


In [12]:
import time

In [17]:
# Time the baseline run trained with the hand-picked parameters.
start_time = time.time()

# Column names of the frame the numpy feature matrix was taken from.
feature_name = train.columns.tolist()

# Up to 10000 rounds, stopping after 1000 rounds without improvement;
# the custom hit score is logged every 1000 rounds.
model = lgb.train(
    params=params_default,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    feval=hit_score,
    feature_name=feature_name,
    categorical_feature=category,
    early_stopping_rounds=1000,
    verbose_eval=1000,
)

end_time = time.time()



Training until validation scores don't improve for 1000 rounds.
[1000]	training's Hit_score: 0.495815	valid_1's Hit_score: 0.459667
[2000]	training's Hit_score: 0.575074	valid_1's Hit_score: 0.4875
[3000]	training's Hit_score: 0.644315	valid_1's Hit_score: 0.501333
[4000]	training's Hit_score: 0.703037	valid_1's Hit_score: 0.519167
[5000]	training's Hit_score: 0.755667	valid_1's Hit_score: 0.534167
[6000]	training's Hit_score: 0.802407	valid_1's Hit_score: 0.543
[7000]	training's Hit_score: 0.841037	valid_1's Hit_score: 0.552
[8000]	training's Hit_score: 0.874426	valid_1's Hit_score: 0.553333
[9000]	training's Hit_score: 0.901778	valid_1's Hit_score: 0.56
[10000]	training's Hit_score: 0.924296	valid_1's Hit_score: 0.564333
Did not meet early stopping. Best iteration is:
[9997]	training's Hit_score: 0.924315	valid_1's Hit_score: 0.564333


In [20]:
# Predict in log1p space, then map predictions and targets back to prices.
y_pred = np.expm1(model.predict(X_test, num_iteration=model.best_iteration))
y_test_origin = np.expm1(y_test)

# Absolute relative error per validation row.
hit = np.absolute((y_test_origin - y_pred)/y_test_origin)
# A hit is an error of at most 10%; inclusive bound, matching the
# `hit_score` eval function used during training.
hit_rate = np.mean(hit <= 0.1)
MAPE = np.mean(hit)
# Combined score: hit rate scaled by 10^4 plus (1 - MAPE).
score = hit_rate*(10**4) + (1 - MAPE)

print('MAPE: ', MAPE)
print('Hit Rate: ', hit_rate * 100,'%')
print('Score: ', score)
print('Training Time: ', end_time - start_time, 's')

MAPE:  0.1324989358706005
Hit Rate:  56.43333333333334 %
Score:  5644.200834397463
Training Time:  370.66744804382324 s


In [24]:
# Time the run trained with the tuned parameters.
start_time = time.time()

# Column names of the frame the numpy feature matrix was taken from.
feature_name = train.columns.tolist()

# No num_boost_round / early stopping is passed here; the round count is
# left to params_auto. The custom hit score is logged every 1000 rounds.
model = lgb.train(
    params=params_auto,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    feval=hit_score,
    feature_name=feature_name,
    categorical_feature=category,
    verbose_eval=1000,
)

end_time = time.time()

[1000]	training's Hit_score: 0.555352	valid_1's Hit_score: 0.482333
[2000]	training's Hit_score: 0.66613	valid_1's Hit_score: 0.513333
[3000]	training's Hit_score: 0.751111	valid_1's Hit_score: 0.5325
[4000]	training's Hit_score: 0.819741	valid_1's Hit_score: 0.543833
[5000]	training's Hit_score: 0.87363	valid_1's Hit_score: 0.551167
[6000]	training's Hit_score: 0.913722	valid_1's Hit_score: 0.560833
[7000]	training's Hit_score: 0.942759	valid_1's Hit_score: 0.563667
[8000]	training's Hit_score: 0.962093	valid_1's Hit_score: 0.569167
[9000]	training's Hit_score: 0.975556	valid_1's Hit_score: 0.569667
[10000]	training's Hit_score: 0.983037	valid_1's Hit_score: 0.570167


In [25]:
# Predict in log1p space, then map predictions and targets back to prices.
# NOTE(review): this model was trained without early stopping, so
# model.best_iteration is presumably unset and all trees are used - confirm.
y_pred = np.expm1(model.predict(X_test, num_iteration=model.best_iteration))
y_test_origin = np.expm1(y_test)

# Absolute relative error per validation row.
hit = np.absolute((y_test_origin - y_pred)/y_test_origin)
# A hit is an error of at most 10%; inclusive bound, matching the
# `hit_score` eval function used during training.
hit_rate = np.mean(hit <= 0.1)
MAPE = np.mean(hit)
# Combined score: hit rate scaled by 10^4 plus (1 - MAPE).
score = hit_rate*(10**4) + (1 - MAPE)

print('MAPE: ', MAPE)
print('Hit Rate: ', hit_rate * 100,'%')
print('Score: ', score)
print('Training Time: ', end_time - start_time, 's')

MAPE:  0.13074515255893515
Hit Rate:  57.01666666666667 %
Score:  5702.535921514108
Training Time:  655.5116527080536 s
