## 区ごとに分けて学習させる
## CVあり

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100
import japanize_matplotlib

from time import time
import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import gc

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [123]:
# Load the pre-built feature tables (feature set "feat3") for training and inference.
train = pd.read_feather(path='../data/train_feat3.ftr')
test = pd.read_feather(path='../data/test_feat3.ftr')

In [124]:
# Columns removed from both frames before modelling.
drop_cols = [
    'rent/S_mean', 'rent/S_max', 'rent/S_min', 'rent/S_median', 'rent/S_std',
    'rent_mean', 'rent_max', 'rent_min', 'rent_median', 'rent_std',
]

# NOTE: this cell mutates `train` / `test` in place and is meant to run once per
# load; re-running it on the same frames raises KeyError because the columns
# are already gone.
train.drop(columns=drop_cols + ['id'], inplace=True)
# `test` additionally carries 'level_0' and 'index' columns that are dropped here.
test.drop(columns=drop_cols + ['id', 'level_0', 'index'], inplace=True)

# The target is modelled in log space; predictions are mapped back with np.exp.
train['rent'] = np.log(train['rent'])

In [125]:
def build_x_y_data(df,address_type):
    """Split the rows of ``df`` for one ``address_city`` value into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'address_city'`` and ``'rent'``.
    address_type
        Value compared (with ``==``) against ``df['address_city']``.

    Returns
    -------
    tuple
        ``(X_data, y_data)``: the matching rows without the ``'rent'`` column,
        and the ``'rent'`` column of those same rows. The original index labels
        are kept.
    """
    # Filter once and derive both outputs from the same subset.
    subset = df.loc[df['address_city'] == address_type]
    return subset.drop(columns=['rent']), subset['rent']

In [126]:
# Hyper-parameters passed to lgb.LGBMRegressor(**params).
# The duplicate 'objective' entry that used to appear twice in this literal has
# been removed; the resulting dict is unchanged.
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'RMSE',
    # Upper bound on boosting rounds; training is normally cut short by early stopping.
    'n_estimators': 20000,
    'learning_rate': 0.01,
    'num_leaves': 100,
    'min_data_in_leaf': 50,
    'max_bin': 200,
    'sparse_threshold': 1.0,
    'device': 'cpu',
    'save_binary': True,
    # All seeds pinned to the same value.
    'seed': 42,
    'feature_fraction_seed': 42,
    'bagging_seed': 42,
    'drop_seed': 42,
    'data_random_seed': 42,
    'verbose': 0,
    # NOTE(review): LightGBM documents is_unbalance for binary / multiclassova
    # objectives only; it is kept as-is here but presumably has no effect on
    # regression -- confirm before relying on it.
    'is_unbalance': True,
    'boost_from_average': False,
}

In [127]:
def rmse(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [129]:
def train_and_predict_for_address_type(address_type,submission,splits=5,random_state=0):
    """Run K-fold LightGBM training for one ``address_city`` value and store its test predictions.

    Rows of the module-level ``train`` and ``test`` frames whose
    ``address_city`` equals ``address_type`` are selected with
    ``build_x_y_data``. For every fold one ``lgb.LGBMRegressor`` (configured by
    the module-level ``params``) is fitted with early stopping on the held-out
    part, and its test predictions contribute ``1 / splits`` to the averaged
    prediction. Validation targets and predictions are passed through
    ``np.exp`` before scoring, and so are the values written to ``submission``.

    Parameters
    ----------
    address_type
        Value of ``address_city`` to model.
    submission : pandas.DataFrame
        Mutated in place: column ``1`` of the rows selected by
        ``test['address_city'] == address_type`` receives the exponentiated,
        fold-averaged predictions. The boolean mask is aligned on index labels,
        so ``submission`` is assumed to share ``test``'s index -- verify
        against the caller.
    splits : int, default 5
        Number of CV folds.
    random_state : int, default 0
        Seed for the shuffled ``KFold`` split. It is now actually forwarded to
        ``KFold``; previously it was ignored and folds differed on every call.

    Returns
    -------
    numpy.float64
        Mean over folds of the validation RMSE on exponentiated values.
    """
    X_train, Y_train = build_x_y_data(train, address_type)
    X_test, _ = build_x_y_data(test, address_type)

    # Seed the shuffle so that fold membership (and hence CV scores) is reproducible.
    folds = KFold(n_splits=splits, shuffle=True, random_state=random_state)

    categorical_features = list(X_train.columns[X_train.dtypes == 'category'])

    fold_scores = {}
    y_preds = np.zeros(X_test.shape[0])

    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train), start=1):
        cv_fold_start_time = time()
        print ('** Training fold {}'.format(fold_n))
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
        y_trn, y_val = Y_train.iloc[trn_idx], Y_train.iloc[val_idx]
        # Second entry ("valid_1" in the log) drives early stopping.
        eval_set = [(X_trn, y_trn), (X_val, y_val)]

        reg = lgb.LGBMRegressor(**params)
        # NOTE: `early_stopping_rounds` / `verbose` as fit() keyword arguments are
        # only accepted by LightGBM < 4.0; newer releases expect the
        # lgb.early_stopping / lgb.log_evaluation callbacks instead.
        reg.fit(X_trn, y_trn,
                eval_set=eval_set,
                eval_metric="rmse",
                early_stopping_rounds=100,
                categorical_feature = categorical_features,
                verbose= 500)

        del X_trn, y_trn

        val_pred = reg.predict(X_val)

        # Accumulate the fold-averaged test prediction (still in log space).
        y_preds += reg.predict(X_test) / splits
        del reg, X_val

        # Score on exponentiated values.
        val_rmse = rmse(np.exp(y_val), np.exp(val_pred))
        print('RMSE accuracy: {}'.format(val_rmse))
        fold_scores[fold_n] = val_rmse
        del val_pred, y_val, val_rmse

        gc.collect()

        cv_fold_end_time = time()
        print ('fold completed in {}s'.format(cv_fold_end_time - cv_fold_start_time))

    # One-row frame (index 'cv', one column per fold) -> row mean as a Series.
    cv_mean = pd.DataFrame(fold_scores, index=['cv',]).mean(axis=1)
    print('CV RMSE:{}'.format(cv_mean))

    submission.loc[test['address_city'] == address_type, 1] = np.exp(y_preds)

    # .iloc: the Series is label-indexed ('cv'), so positional access must be explicit.
    return cv_mean.iloc[0]

In [130]:
# Load the sample submission (no header row, so columns are labelled 0, 1, ...)
# and fill a copy of it, leaving the frame as loaded available in `sub`.
sub = pd.read_csv('../../../input/sample_submit.csv', header=None)
submission = sub.copy()
FOLD = 5

# Train one model per address_city value and collect its mean CV score.
# Iterate unique() directly (order of first appearance) instead of wrapping it
# in set(): set ordering of string labels can change between kernel sessions,
# which made the log order and the column order of `cv_scores` vary run to run.
cv_scores = {}
for address_type in train['address_city'].unique():
    cv_score = train_and_predict_for_address_type(
        address_type, submission, splits=FOLD)
    cv_scores[address_type] = cv_score

** Training fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.102877	valid_1's rmse: 0.120602
[1000]	training's rmse: 0.0447829	valid_1's rmse: 0.0894327
[1500]	training's rmse: 0.0330722	valid_1's rmse: 0.087241
[2000]	training's rmse: 0.0259252	valid_1's rmse: 0.08603
[2500]	training's rmse: 0.0210572	valid_1's rmse: 0.0852418
[3000]	training's rmse: 0.0175855	valid_1's rmse: 0.0847711
[3500]	training's rmse: 0.0150131	valid_1's rmse: 0.0844784
[4000]	training's rmse: 0.0129072	valid_1's rmse: 0.0842155
[4500]	training's rmse: 0.0113667	valid_1's rmse: 0.0840708
[5000]	training's rmse: 0.0100435	valid_1's rmse: 0.0839146
[5500]	training's rmse: 0.00889222	valid_1's rmse: 0.0838278
[6000]	training's rmse: 0.00789184	valid_1's rmse: 0.0837742
[6500]	training's rmse: 0.006982	valid_1's rmse: 0.0837497
Early stopping, best iteration is:
[6501]	training's rmse: 0.00697941	valid_1's rmse: 0.0837484
RMSE accuracy: 11091.94940450327
fold completed

[4000]	training's rmse: 0.0170184	valid_1's rmse: 0.0948009
[4500]	training's rmse: 0.0151504	valid_1's rmse: 0.0946563
[5000]	training's rmse: 0.0135823	valid_1's rmse: 0.0945522
[5500]	training's rmse: 0.0122359	valid_1's rmse: 0.0944557
Early stopping, best iteration is:
[5530]	training's rmse: 0.01216	valid_1's rmse: 0.0944487
RMSE accuracy: 11969.10582829343
fold completed in 6.533425807952881s
** Training fold 4
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.103574	valid_1's rmse: 0.132442
[1000]	training's rmse: 0.0485644	valid_1's rmse: 0.10049
[1500]	training's rmse: 0.0373443	valid_1's rmse: 0.0985862
[2000]	training's rmse: 0.0299032	valid_1's rmse: 0.0975164
[2500]	training's rmse: 0.024692	valid_1's rmse: 0.0964186
[3000]	training's rmse: 0.0208346	valid_1's rmse: 0.0958988
[3500]	training's rmse: 0.0178397	valid_1's rmse: 0.0953834
[4000]	training's rmse: 0.0155456	valid_1's rmse: 0.0951212
[4500]	training's rmse: 0.0135829	valid_1

Early stopping, best iteration is:
[2610]	training's rmse: 0.0401005	valid_1's rmse: 0.087965
RMSE accuracy: 24592.716228348345
fold completed in 3.0662059783935547s
** Training fold 4
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.140937	valid_1's rmse: 0.142178
[1000]	training's rmse: 0.0802097	valid_1's rmse: 0.108526
[1500]	training's rmse: 0.0619495	valid_1's rmse: 0.100907
[2000]	training's rmse: 0.0511914	valid_1's rmse: 0.096477
[2500]	training's rmse: 0.0430372	valid_1's rmse: 0.0943158
[3000]	training's rmse: 0.0362037	valid_1's rmse: 0.0919893
[3500]	training's rmse: 0.031024	valid_1's rmse: 0.0911388
[4000]	training's rmse: 0.0272814	valid_1's rmse: 0.0902279
[4500]	training's rmse: 0.0241739	valid_1's rmse: 0.0894067
[5000]	training's rmse: 0.021618	valid_1's rmse: 0.088779
[5500]	training's rmse: 0.0195039	valid_1's rmse: 0.0883425
Early stopping, best iteration is:
[5580]	training's rmse: 0.0192084	valid_1's rmse: 0.0882441
RMSE a

[17500]	training's rmse: 0.00432308	valid_1's rmse: 0.0894154
[18000]	training's rmse: 0.00412084	valid_1's rmse: 0.0893331
[18500]	training's rmse: 0.00392596	valid_1's rmse: 0.0892527
[19000]	training's rmse: 0.00374294	valid_1's rmse: 0.0891659
[19500]	training's rmse: 0.00357158	valid_1's rmse: 0.0890753
[20000]	training's rmse: 0.00340974	valid_1's rmse: 0.088993
Did not meet early stopping. Best iteration is:
[20000]	training's rmse: 0.00340974	valid_1's rmse: 0.088993
RMSE accuracy: 20385.944023220607
fold completed in 30.49404215812683s
CV RMSE:cv    14860.468012
dtype: float64
** Training fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.110677	valid_1's rmse: 0.133655
[1000]	training's rmse: 0.0540463	valid_1's rmse: 0.108177
[1500]	training's rmse: 0.0417909	valid_1's rmse: 0.105888
[2000]	training's rmse: 0.0337373	valid_1's rmse: 0.104505
[2500]	training's rmse: 0.0276366	valid_1's rmse: 0.103742
[3000]	training's rmse: 0.023195

[1500]	training's rmse: 0.0444889	valid_1's rmse: 0.0865693
[2000]	training's rmse: 0.0390961	valid_1's rmse: 0.0857071
[2500]	training's rmse: 0.0354961	valid_1's rmse: 0.0853066
[3000]	training's rmse: 0.0327506	valid_1's rmse: 0.0848518
[3500]	training's rmse: 0.0305836	valid_1's rmse: 0.0846741
[4000]	training's rmse: 0.0287355	valid_1's rmse: 0.0844476
[4500]	training's rmse: 0.027122	valid_1's rmse: 0.0842664
[5000]	training's rmse: 0.0256911	valid_1's rmse: 0.0841161
[5500]	training's rmse: 0.0243868	valid_1's rmse: 0.0840255
[6000]	training's rmse: 0.0231758	valid_1's rmse: 0.083959
Early stopping, best iteration is:
[6102]	training's rmse: 0.0229291	valid_1's rmse: 0.0839418
RMSE accuracy: 10121.327047321714
fold completed in 14.874813079833984s
** Training fold 5
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.107841	valid_1's rmse: 0.117974
[1000]	training's rmse: 0.0552761	valid_1's rmse: 0.0812576
[1500]	training's rmse: 0.0459141	va

[2000]	training's rmse: 0.0230944	valid_1's rmse: 0.102498
[2500]	training's rmse: 0.0178105	valid_1's rmse: 0.102083
Early stopping, best iteration is:
[2526]	training's rmse: 0.0175877	valid_1's rmse: 0.102057
RMSE accuracy: 15218.878488190943
fold completed in 5.3784260749816895s
** Training fold 3
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.101685	valid_1's rmse: 0.123753
[1000]	training's rmse: 0.0443265	valid_1's rmse: 0.0905135
Early stopping, best iteration is:
[1136]	training's rmse: 0.0402941	valid_1's rmse: 0.0902564
RMSE accuracy: 12342.782571430598
fold completed in 2.8759350776672363s
** Training fold 4
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.10105	valid_1's rmse: 0.124041
[1000]	training's rmse: 0.0429994	valid_1's rmse: 0.0975918
[1500]	training's rmse: 0.0309665	valid_1's rmse: 0.0965448
Early stopping, best iteration is:
[1576]	training's rmse: 0.0295214	valid_1's rmse: 0.09645

Early stopping, best iteration is:
[2065]	training's rmse: 0.0258514	valid_1's rmse: 0.094801
RMSE accuracy: 20097.92981216385
fold completed in 4.7499940395355225s
** Training fold 5
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.100538	valid_1's rmse: 0.109519
[1000]	training's rmse: 0.043849	valid_1's rmse: 0.0777701
[1500]	training's rmse: 0.0335213	valid_1's rmse: 0.0763721
[2000]	training's rmse: 0.0266326	valid_1's rmse: 0.0758259
[2500]	training's rmse: 0.0219068	valid_1's rmse: 0.07565
Early stopping, best iteration is:
[2440]	training's rmse: 0.0223727	valid_1's rmse: 0.075641
RMSE accuracy: 12080.5818177263
fold completed in 5.188877820968628s
CV RMSE:cv    12386.43837
dtype: float64
** Training fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.133254	valid_1's rmse: 0.129132
[1000]	training's rmse: 0.0739753	valid_1's rmse: 0.0967038
[1500]	training's rmse: 0.0581754	valid_1's rmse: 0.0949

[3000]	training's rmse: 0.0226735	valid_1's rmse: 0.0996567
Early stopping, best iteration is:
[3124]	training's rmse: 0.0216134	valid_1's rmse: 0.0995676
RMSE accuracy: 27673.8137945512
fold completed in 6.1695520877838135s
** Training fold 4
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.119229	valid_1's rmse: 0.146896
[1000]	training's rmse: 0.0584541	valid_1's rmse: 0.114003
[1500]	training's rmse: 0.044201	valid_1's rmse: 0.111371
[2000]	training's rmse: 0.0349805	valid_1's rmse: 0.109911
[2500]	training's rmse: 0.0277223	valid_1's rmse: 0.108511
[3000]	training's rmse: 0.022282	valid_1's rmse: 0.107454
[3500]	training's rmse: 0.0183734	valid_1's rmse: 0.10671
[4000]	training's rmse: 0.0155087	valid_1's rmse: 0.106313
Early stopping, best iteration is:
[4028]	training's rmse: 0.0153708	valid_1's rmse: 0.106296
RMSE accuracy: 55550.69176586951
fold completed in 8.024546146392822s
** Training fold 5
Training until validation scores don't impr

[1000]	training's rmse: 0.0484232	valid_1's rmse: 0.100399
[1500]	training's rmse: 0.036366	valid_1's rmse: 0.097503
[2000]	training's rmse: 0.0283937	valid_1's rmse: 0.0961031
[2500]	training's rmse: 0.02274	valid_1's rmse: 0.0950443
[3000]	training's rmse: 0.0184393	valid_1's rmse: 0.0946315
[3500]	training's rmse: 0.0152709	valid_1's rmse: 0.0941383
[4000]	training's rmse: 0.012797	valid_1's rmse: 0.0937358
[4500]	training's rmse: 0.0108736	valid_1's rmse: 0.0934579
[5000]	training's rmse: 0.00935502	valid_1's rmse: 0.0932841
[5500]	training's rmse: 0.00817124	valid_1's rmse: 0.0930848
[6000]	training's rmse: 0.00722843	valid_1's rmse: 0.0929751
[6500]	training's rmse: 0.00645887	valid_1's rmse: 0.092906
[7000]	training's rmse: 0.00583534	valid_1's rmse: 0.0928413
Early stopping, best iteration is:
[7145]	training's rmse: 0.00567171	valid_1's rmse: 0.0928054
RMSE accuracy: 8480.549914443793
fold completed in 13.575146198272705s
** Training fold 2
Training until validation scores don

In [133]:
# Write the filled-in submission without header or index columns.
submission.to_csv(path_or_buf='feat3_separate_address_type_logrent_lightgbm.csv', index=False, header=False)


In [145]:
# Unweighted mean of the per-address_city CV scores.
# .iloc[0] is used because the row-mean Series is label-indexed ('address_type');
# integer keys on such a Series are deprecated as positional access in pandas 2.x.
print('CV mean:{}'.format(pd.DataFrame(cv_scores,index=['address_type']).mean(axis=1).iloc[0]))

CV mean:17802.107504080806


In [150]:
# Persist the per-address_city CV scores as a single-row CSV (one column per address_city value).
pd.DataFrame(data=cv_scores, index=['address_type']).to_csv(path_or_buf='CV_feat3_separate_address_type_lightgbm.csv', index=False)