In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))
import os
#import xgboost as xgb
#import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
import json
import ast
import eli5
from functools import reduce
import warnings
from sklearn.metrics import mean_squared_error
from scipy import stats
from math import sqrt
from lightgbm import plot_tree
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import STATUS_OK
from hyperopt.pyll.stochastic import sample
import time
import gc
warnings.filterwarnings('ignore')
#import shap

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv('dataset-0510/train.csv')
test  = pd.read_csv('dataset-0510/test.csv')
data = pd.concat([train, test], axis=0)
#train.drop(['village'], axis =1, inplace = True)
#test.drop(['village'], axis =1, inplace = True)

In [3]:
def Submission(Ids, preds):
    file_name = datetime.datetime.today().strftime('%m-%d-%H-%M')
    submission = pd.DataFrame({'building_id' : Ids, 'total_price' : preds})
    if not os.path.isdir('Submission'):
        os.makedirs('Submission')
    submission.to_csv('Submission/' + file_name + '.csv', index= False)

In [4]:
def display_importances(feature_importance_df):
    cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

In [5]:
def lgb_model(split_num, train, test,features, stratified = False, if_one_hot = True):
    '''
    category_cols = ['building_material',
 'building_type',
 'building_use',
 'parking_way',
 'location_2',
 'inter_btw_building_type_parking_way',
 'inter_btw_building_type_building_use',
 'inter_btw_building_type_building_material',
 'inter_btw_parking_way_building_use',
 'inter_btw_parking_way_building_material',
 'inter_btw_building_use_building_material']
    '''
    category_cols = ['building_material','building_use','parking_way', 'city']
    
    if stratified:
        kf = StratifiedKFold(n_splits = split_num, random_state = 42, shuffle = True)
    else :
        kf = KFold(n_splits = split_num, random_state=42, shuffle=True)
    train['total_price_log'] = np.log1p(train['total_price'])
    feature_importance_df = pd.DataFrame()
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    
    param ={
        'n_estimators': 10000, 'max_depth' : -1, 'num_leaves' :30,         
        'objective': 'regression',   'metric':'rmse',   
        'learning_rate': 0.01,      'boosting': 'gbdt',     'min_data_in_leaf': 10,
        'feature_fraction': 0.9,    'bagging_freq':1,       'bagging_fraction': 0.8,     'importance_type': 'gain',
        'lambda_l1': 0.2,  'subsample': .8,   'colsample_bytree': .9
    }

    features = [i for i in train.columns if i not in ['building_id', 'total_price','total_price_log']] 
     
    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train[features].values,train['total_price_log'].values)):
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label= train['total_price_log'].iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx][features], label= train['total_price_log'].iloc[val_idx])
        
        
        clf = lgb.train(params= param, train_set= trn_data, valid_sets= [trn_data, val_data], verbose_eval=1000, early_stopping_rounds= 10000)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration = clf.best_iteration)
        predictions += clf.predict(test[features], num_iteration = clf.best_iteration) / kf.n_splits
        
        y   = np.expm1(train['total_price_log'].iloc[val_idx]) 
        yhat = np.expm1(oof[val_idx])
        Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
        print('fold {} hit_score : {}'.format(fold_ + 1, round(Hit_score, 4) /len(train.iloc[val_idx]) * 10000))
        print('-'*30)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature']    = features
        fold_importance_df['importance'] = np.log1p(clf.feature_importance(importance_type='gain', iteration=clf.best_iteration))
        fold_importance_df['fold']       = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    print('CV scrore : {}'.format(sqrt(mean_squared_error(train['total_price_log'], oof))))
    print('-'*30)
    y = np.expm1(train['total_price_log']) 
    yhat = np.expm1(oof)
    Hit_score = np.sum([1 for i in np.abs((y - yhat) / y)  if i <= 0.1 ])
    print('Hit rate : {}'.format(round(Hit_score, 4) /len(train) * 10000))
    
    display_importances(feature_importance_df)
    return predictions, round(Hit_score, 4) /len(train) * 10000, oof


In [14]:
begin = time.time()
avg_hit_rate = 0
prediction_df = pd.DataFrame()
hit_score_list = []
train_num_list = []
building_df_list   = []
oof_list       = []
features = [i for i in train.columns if i not in ['building_id', 'total_price','total_price_log','city']] 
for b_type in train['building_type'].unique():
    temp_train = train[train['building_type'] == b_type]
    temp_test  = test[test['building_type'] == b_type]
    temp_train.drop(['building_type'], axis =1, inplace =True)
    temp_test.drop(['building_type'], axis =1, inplace =True)
    preds, hit_score, oof = lgb_model(5, temp_train, temp_test, features= features,if_one_hot=False)
    temp = pd.DataFrame({'building_id' : temp_test['building_id'], 'total_price' : preds})
    prediction_df = pd.concat([prediction_df, temp], axis=0)
    
    print('building_type : {}'.format(b_type))
    print('Train_num: {}'.format(len(temp_train)))
    print('Test_num: {}'.format(len(temp_test)))
    print('-'*1000)
    hit_score_list.append(hit_score)
    train_num_list.append(len(temp_train))
    building_df_list.append(b_type)
    oof_list.append(oof)
    avg_hit_rate += hit_score / 60000 * len(temp_train)

Result_df = pd.DataFrame({'City' : building_df_list,
                        'Train_num' : train_num_list,
                        'Hit_score' : hit_score_list})

print('Avg hit_score : {}'.format(avg_hit_rate))
print('總共花：{} 分'.format((time.time() - begin) / 60))
print('現在時間 ： {}'.format(datetime.datetime.today().strftime('%m-%d-%H-%M')))
Submission(prediction_df['building_id'], np.expm1(prediction_df['total_price']))

Training until validation scores don't improve for 10000 rounds.
[10000]	training's rmse: 0.0277108	valid_1's rmse: 0.177004
[20000]	training's rmse: 0.0142966	valid_1's rmse: 0.176804
Early stopping, best iteration is:
[18626]	training's rmse: 0.0152364	valid_1's rmse: 0.176792
fold 1 hit_score : 5853.889943074004
------------------------------
Training until validation scores don't improve for 10000 rounds.
[10000]	training's rmse: 0.0281996	valid_1's rmse: 0.190081
Early stopping, best iteration is:
[9703]	training's rmse: 0.0290402	valid_1's rmse: 0.190076
fold 2 hit_score : 5666.824869482677
------------------------------
Training until validation scores don't improve for 10000 rounds.
[10000]	training's rmse: 0.0285357	valid_1's rmse: 0.203201
[20000]	training's rmse: 0.0148275	valid_1's rmse: 0.203202
Early stopping, best iteration is:
[11634]	training's rmse: 0.0246648	valid_1's rmse: 0.203121
fold 3 hit_score : 5723.777883246322
------------------------------
Training until va

KeyboardInterrupt: 

In [None]:
temp_train = train[train['building_type'] == 4]
temp_test = test[test['building_type'] == 4]

temp_train = temp_train.join(pd.get_dummies(temp_train['city'], prefix = 'city'))
temp_train = temp_train.join(pd.get_dummies(temp_train['parking_way'], prefix = 'parking_way'))
temp_test = temp_test.join(pd.get_dummies(temp_test['city'], prefix = 'city'))
temp_test = temp_test.join(pd.get_dummies(temp_test['parking_way'], prefix = 'parking_way'))
temp_train['miss_parking_area'] = 0
temp_train['miss_parking_price'] = 0
temp_train.loc[temp_train['parking_area'].isna(), 'miss_parking_area'] = 1
temp_train.loc[temp_train['parking_price'].isna(), 'miss_parking_price'] = 1
temp_train['parking_price_every_area'] = temp_train['parking_price'] / temp_train['parking_area']
temp_train['parking_way'] = temp_train['parking_way'].astype('category')
temp_train = temp_train.join(pd.get_dummies(temp_train['parking_way'], prefix = 'parking_way'))
temp_train['parking_area'].fillna(0, inplace =True)
temp_train['parking_price'].fillna(0, inplace =True)

temp_test['miss_parking_area'] = 0
temp_test['miss_parking_price'] = 0
temp_test.loc[temp_test['parking_area'].isna(), 'miss_parking_area'] = 1
temp_test.loc[temp_test['parking_price'].isna(), 'miss_parking_price'] = 1
temp_test['parking_price_every_area'] = temp_test['parking_price'] / temp_test['parking_area']
temp_test['parking_way'] = temp_test['parking_way'].astype('category')
temp_test = temp_test.join(pd.get_dummies(temp_test['parking_way'], prefix = 'parking_way'))
temp_test['parking_area'].fillna(0, inplace =True)
temp_test['parking_price'].fillna(0, inplace =True)



#temp_train['mean_price'] = temp_train['total_price'] / temp_train['land_area']
#temp_test['mean_price'] = 0
features = [i for i in temp_train.columns if i not in ['building_id', 'total_price','total_price_log']] 
#temp_train = temp_train[np.abs(temp_train.land_area-temp_train.land_area.mean()) <= (3*temp_train.land_area.std())]


In [None]:
lgb_model(5, temp_train, temp_test, features= features,if_one_hot=False)

In [44]:
temp.groupby('city')['total_price'].mean()

city
3     7.381115e+06
5     3.099744e+06
6     5.417316e+06
7     2.881146e+07
9     9.084332e+06
10    7.625755e+06
12    1.013288e+07
13    1.203804e+08
14    6.643299e+06
17    4.539870e+06
21    8.812216e+06
Name: total_price, dtype: float64

In [45]:
temp.groupby('city')['total_price'].std()

city
3     6.672626e+06
5     8.017254e+06
6     8.301920e+06
7     7.814438e+07
9     1.485485e+07
10    1.266959e+07
12    1.357379e+07
13    3.975890e+08
14    1.349661e+07
17    5.696486e+06
21    1.686401e+07
Name: total_price, dtype: float64