In [77]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))
import os
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
import json
import ast
import eli5
from functools import reduce
import warnings
from sklearn.metrics import mean_squared_error
from scipy import stats
from math import sqrt
#from lightgbm import plot_tree
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample
import time
import gc
warnings.filterwarnings('ignore')
#import shap

In [181]:
train = pd.read_csv('dataset-0510/train.csv')
test  = pd.read_csv('dataset-0510/test.csv')
data  = pd.concat([train, test], axis=0)

#drop outlier
train.drop(train[(train['land_area'] > 1500) | (train['building_area'] >1000)].index, inplace= True)
gc.collect()

2861

In [None]:
# Clone non-duplicated
cols = ['city', 'town','building_type', 'total_floor', 'XIV_MIN', 'building_complete_dt']
dup_data = train[train[cols].duplicated(keep=False)]
non_dup_data = train[~train[cols].duplicated(keep=False)]
#Clone_duplicated = non_dup_data[non_dup_data['building_id'].isin(train['building_id'])]
train = pd.concat([train, non_dup_data], axis=0)
print(len(train))


In [None]:
def target_encoding(train, validation, test):
    
    groupby_cols = ['city', 'building_type', 'town']
    target_df = train.groupby(groupby_cols).agg({'building_area' : ['mean', 'median'], 'land_area' : ['mean', 'median'], 'total_price' : ['mean', 'median']}).reset_index()
    target_df.columns = [i[0] + '_' + i[1]  if i[1] != '' else i[0] for i in target_df.columns.tolist()]
    
    target_df['price_land_rate_median'] = np.log1p(target_df['total_price_median'] / target_df['land_area_median'])
    target_df['price_building_rate_median'] = np.log1p(target_df['total_price_median'] / target_df['building_area_median'])
    target_df['price_land_rate_mean'] = np.log1p(target_df['total_price_mean'] / target_df['land_area_mean'])
    target_df['price_building_rate_mean'] = np.log1p(target_df['total_price_mean'] / target_df['building_area_mean'])
    
    
    combine_cols = groupby_cols + [ 'price_land_rate_mean', 'price_building_rate_mean', 'price_building_rate_median', 'price_building_rate_median']
    train      = pd.merge(train, target_df[combine_cols], on = groupby_cols, how='left')
    validation = pd.merge(validation, target_df[combine_cols], on = groupby_cols, how='left')
    test       = pd.merge(test, target_df[combine_cols], on = groupby_cols, how='left')
    
    return train, validation, test

In [184]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59989 entries, 0 to 59999
Columns: 235 entries, building_id to total_price
dtypes: float64(37), int64(197), object(1)
memory usage: 108.0+ MB


In [185]:
def hit_score(preds, train_data):
    trues  = train_data.get_label()
    trues = np.expm1(trues)
    preds = np.expm1(preds)
    scores = ((np.absolute(preds - trues) / trues) <= 0.1)
    hit_score = np.sum(scores) / train_data.num_data()
    return 'Hit_score', hit_score, True


#categorical feature to one-hot
def one_hot(train, test, categorical_features):
    data = pd.concat([train, test], axis=0)
    for i in categorical_features:
        data = data.join(pd.get_dummies(data[i], prefix = i))
        data.drop(i, axis = 1, inplace =True)
    train = data[:60000]
    test  = data[60000:]
    return train, test

In [36]:
def display_importances(feature_importance_df):
    cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

In [37]:
def Submission(Ids, preds):
    file_name = datetime.datetime.today().strftime('%m-%d-%H-%M')
    submission = pd.DataFrame({'building_id' : Ids, 'total_price' : preds})
    if not os.path.isdir('Submission'):
        os.makedirs('Submission')
    submission.to_csv('Submission/' + file_name + '.csv', index= False)

In [190]:
def lgb_model(split_num, train, test, stratified = False, if_one_hot = True):
    
    category_cols = ['city', 'building_type', 'building_material', 'parking_way', 'building_use']

    if stratified:
        kf = StratifiedKFold(n_splits = split_num, random_state = 42, shuffle = True)
    else :
        kf = KFold(n_splits = split_num, random_state=42, shuffle=True)
    train['price_every_building_area'] = np.log1p(train['total_price'] / train['building_area']) 
    
    feature_importance_df = pd.DataFrame()
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    
    param ={
        'n_estimators': 10000000, 'max_depth' : -1, 'num_leaves' :30,         
        'objective': 'regression', 
        'learning_rate': 0.01,      'boosting': 'gbdt',     'min_data_in_leaf': 50,
        'feature_fraction': 0.9,    'bagging_freq':1,       'bagging_fraction': 0.8,     'importance_type': 'gain',
         'subsample': .8,   'colsample_bytree': .9, 'device' : 'cpu', 'num_threads' : 10
    }
     
    features = [i for i in train.columns if i not in ['building_id', 'total_price','price_every_building_area']] 
    print(len(features))
    
    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train[features].values,train['price_every_building_area'].values)):
        begin = time.time()
        folds_train       = train.iloc[trn_idx]
        folds_validation  = train.iloc[val_idx]
        folds_test        = test

        
        folds_train, folds_validation, folds_test = target_encoding(folds_train, folds_validation, folds_test)
        #features = [i for i in features if i not in ['building_id', 'total_price','total_price_log', 'price_every_building_area','building_type', 'parking_way','building_material', 'building_use'] and 'building_area' not in i] 

        
        trn_data = lgb.Dataset(folds_train[features], label = folds_train['price_every_building_area'])
        val_data = lgb.Dataset(folds_validation[features], label = folds_validation['price_every_building_area'])

        clf = lgb.train(params = param, train_set = trn_data, valid_sets= [trn_data, val_data], verbose_eval=10000, early_stopping_rounds= 3000, categorical_feature=category_cols, feval = hit_score)
        oof[val_idx] = clf.predict(folds_validation[features], num_iteration = clf.best_iteration)
        predictions += clf.predict(folds_test[features], num_iteration = clf.best_iteration) / kf.n_splits
        
        y   = np.expm1(folds_validation['price_every_building_area']) * folds_validation['building_area']
        yhat = np.expm1(oof[val_idx]) * folds_validation['building_area']
        Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
        print('fold {} hit_score : {}'.format(fold_ + 1, round(Hit_score, 4) /len(train.iloc[val_idx]) * 10000))
        print((time.time() - begin) / 60)
        print('-'*30)
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature']    = features
        fold_importance_df['importance'] = np.log1p(clf.feature_importance(importance_type='gain', iteration=clf.best_iteration))
        fold_importance_df['fold']       = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
    print('CV scrore : {}'.format(sqrt(mean_squared_error(train['price_every_building_area'], oof))))
    print('-'*30)
    y = np.expm1(train['price_every_building_area'] ) * train['building_area']
    yhat = np.expm1(oof) * train['building_area']
    Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
    print('Hit ratye : {}'.format(round(Hit_score, 4) /len(train) * 10000))
    
    #display_importances(feature_importance_df)
    return predictions, oof, feature_importance_df

In [131]:
#7
begin = time.time()
prediction, oof, feature_importance_df = lgb_model(10, train, test, stratified = False)
prediction = np.expm1(prediction) * test['building_area']
print('總共花：{} 分'.format((time.time() - begin) / 60))
Submission(test['building_id'], prediction )

623
Training until validation scores don't improve for 10000 rounds.
[10000]	training's l2: 0.0107982	training's Hit_score: 0.722152	valid_1's l2: 0.0362552	valid_1's Hit_score: 0.538756
[20000]	training's l2: 0.00475543	training's Hit_score: 0.87757	valid_1's l2: 0.0348985	valid_1's Hit_score: 0.565594
[30000]	training's l2: 0.00257724	training's Hit_score: 0.951861	valid_1's l2: 0.0345081	valid_1's Hit_score: 0.583597
[40000]	training's l2: 0.0015768	training's Hit_score: 0.980811	valid_1's l2: 0.0343597	valid_1's Hit_score: 0.589598
[50000]	training's l2: 0.00105017	training's Hit_score: 0.990924	valid_1's l2: 0.0342952	valid_1's Hit_score: 0.591432
[60000]	training's l2: 0.000746093	training's Hit_score: 0.995073	valid_1's l2: 0.0342694	valid_1's Hit_score: 0.593432
[70000]	training's l2: 0.000559079	training's Hit_score: 0.997129	valid_1's l2: 0.0342499	valid_1's Hit_score: 0.5971
Early stopping, best iteration is:
[69635]	training's l2: 0.000564737	training's Hit_score: 0.996999	

# Record

In [None]:
1# cv :   0.20573  hit_rate :  5054   public : 5113   note : without Fe set n_estimators = 10,000
1.1# cv : 0.20580  hit_rate :  5045   public :        note : without Fe set n_estimators = 10,000 drop village
1.2# cv : 0.20352  hit_rate :  5075   public :        note : without Fe set n_estimators = 10,000 drop village one-hot
1.3# cv : 0.20570  hit_rate :  5056   public :        note : without Fe set n_estimators = 10,000 drop with category_features

# FE and use category_features   CV : 0.20355     hit_rate : 5056  with n_estimators = 10000
# FE and use one_hot             CV : 0.20402     hit_rate : 5079  with n_estimators = 10000

# FE and use one_hot             CV : 0.20336     hit_rate : 5088  with n_estimators = 10000 fillna mean 
# FE and use category_features   CV : 0.20320     hit_rate : 5104  with n_estimators = 10000 fillna mean

# FE and use one_hot             CV : 0.20342     hit_rate : 5089  with n_estimators = 10000 fillna median 
# FE and use category_features   CV : 0.2032     hit_rate : 5097  with n_estimators = 10000 fillna median 

# FE and use one_hot             CV : 0.20342     hit_rate : 5089  with n_estimators = 10000 add min I、II...
# FE and use category_features   CV : 0.20362     hit_rate : 5152.0  with n_estimators = 10000  add min I、II...

2#    cv : 0.19764  hit_rate :         public : 5737          note : without FE set n_estimators = 1,000,000
2.1#  cv : 0.19576  hit_rate : 5634    public : 5803.8749     note : FE set n_estimators = 1,000,000 with one-hot
2.2#  cv : 0.19582  hit_rate : 5622    public : 5803.8754     note : FE set n_estimators = 1,000,000 with category_features



3# cv : 0.20471  hit_rate :  5063   public :          note :  FE set n_estimators = 10,000  without one-hot
4# cv : 0.20536  hit_rate :  5028   public :          note :  FE set n_estimators = 10,000  with one-hot    drop village
5# cv : 0.20445  hit_rate :  5081   public :          note :  FE set n_estimators = 10,000  with one-hot   
6# cv : 0.20352  hit_rate :  5075   public : 3237     note :  FE set n_estimators = 10,000  with one-hot   combine city、village and groupby encoding

7# cv : 0.19362  hit_rate :  5703   public : 5857     note :  just target-encoding and 10-fold
