In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 100
import japanize_matplotlib

from time import time
import seaborn as sns
import pandas_profiling as pdp
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
import gc

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [4]:
# Columns removed from both frames before modelling.
# NOTE(review): judging by their names these are rent-derived aggregates
# (possible target leakage) -- confirm against the feature-building notebook.
drop_cols = [
    'rent/S_mean', 'rent/S_max',
    'rent/S_min', 'rent/S_median', 'rent/S_std',
    'rent_mean', 'rent_max',
    'rent_min', 'rent_median', 'rent_std',
]

# Read each feature file and drop the unwanted columns in one pass, without
# mutating a frame in place. The test file carries two extra columns
# ('level_0', 'index') that are discarded as well.
train = pd.read_feather('../data/train_feat3.ftr').drop(
    columns=drop_cols + ['id'])
test = pd.read_feather('../data/test_feat3.ftr').drop(
    columns=drop_cols + ['id', 'level_0', 'index'])

# Replace the target with its natural log; the models are fit on this scale.
train['rent'] = np.log(train['rent'])

In [5]:
def build_x_y_data(df,address_type):
    """Select the rows of ``df`` for one ``address_city`` value and split off the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``address_city`` column and, optionally, a ``rent`` column.
    address_type : scalar
        Value compared against ``df['address_city']`` with ``==``.

    Returns
    -------
    tuple
        ``(X_data, y_data)`` where ``X_data`` holds the matching rows without
        the ``rent`` column and ``y_data`` is the matching ``rent`` Series, or
        ``None`` when ``df`` has no ``rent`` column.
    """
    # Filter once and reuse the result for both the features and the target.
    subset = df[df['address_city'] == address_type]

    # A frame without the target column is accepted so the same helper can be
    # used for data that carries no labels.
    if 'rent' not in subset.columns:
        return subset, None

    return subset.drop(columns=['rent']), subset['rent']

In [6]:
# Keyword arguments passed to lgb.LGBMRegressor(**params).
# The 'objective' key used to appear twice in this literal (same value both
# times); the second occurrence silently overwrote the first, so it has been
# removed. The resulting dict is unchanged.
params = {
    'objective':'regression', 
    'max_bin' : 200,
    # Large tree budget with a small learning rate; no early stopping is
    # configured where this dict is used, so every tree is built.
    'n_estimators' : 20000,
    'learning_rate': 0.01,
    'min_data_in_leaf' : 50,
    'num_leaves' : 100,
    'sparse_threshold' : 1.0,
    'device' : 'cpu',
    # NOTE(review): looks like a Dataset/CLI option rather than a model
    # option -- confirm it has any effect through the sklearn wrapper.
    'save_binary': True,
    # All random seeds pinned to the same value for reproducibility.
    'seed' : 42,
    'feature_fraction_seed': 42,
    'bagging_seed' : 42,
    'drop_seed' : 42,
    'data_random_seed' : 42,
    'boosting_type' : 'gbdt',
    'verbose' : 0,
    'metric' : 'RMSE',
    # NOTE(review): the LightGBM parameter docs describe is_unbalance as a
    # classification option; presumably ignored for regression -- confirm.
    'is_unbalance' : True,
    'boost_from_average' : False,
}

In [20]:
def train_and_predict_for_address_type(address_type,submission): 
    """Fit one LightGBM regressor for a single ``address_city`` and store its predictions.

    Uses the module-level ``train`` and ``test`` frames and the ``params``
    dict. The model is fit on every training row of the given city (no
    validation split) and applied to the test rows of the same city.

    Parameters
    ----------
    address_type : scalar
        ``address_city`` value selecting the rows to train on and predict for.
    submission : pandas.DataFrame
        Modified in place: column ``1`` of the rows where
        ``test['address_city'] == address_type`` receives ``np.exp`` of the
        model predictions.

    Returns
    -------
    None
    """
    X_train,Y_train = build_x_y_data(train,address_type)
    X_test,_ = build_x_y_data(test,address_type)

    # No test rows for this city: there is nothing to write, so skip the
    # expensive fit rather than calling predict() on an empty frame.
    if len(X_test) == 0:
        return

    categorical_features=list(X_train.columns[X_train.dtypes=='category'])

    reg = lgb.LGBMRegressor(**params)
    reg.fit(X_train, Y_train,
            eval_metric="rmse",
            categorical_feature = categorical_features)

    # Release the training subset before predicting to keep peak memory down.
    del X_train, Y_train

    y_pred=reg.predict(X_test)
    del reg

    gc.collect()

    # The boolean mask is a Series indexed like `test`; .loc aligns it with
    # `submission` by index label, so both frames must share the same index.
    test_rows = test['address_city']==address_type
    # train['rent'] is log-transformed when the data is loaded, so the
    # predictions are exponentiated back before being stored.
    submission.loc[test_rows,1]=np.exp(y_pred)
    

In [21]:
# The sample submission has no header row, so its columns are labelled 0, 1, ...
sub=pd.read_csv('../../../input/sample_submit.csv',header=None)
# Work on a copy so the untouched sample stays available in `sub`.
submission = sub.copy()
FOLD=5  # NOTE(review): not referenced by any visible cell -- confirm before removing


# Train one model per address_city value and fill in its predictions.
# - unique() is iterated directly (order of first appearance) instead of
#   through set(), whose order is hash-dependent.
# - dropna() skips a missing city: `== NaN` matches no rows, which would
#   leave the model with an empty training set.
# - The helper writes into `submission` in place and returns nothing, so its
#   result is not bound to a name.
for address_type in tqdm(train['address_city'].dropna().unique()):
    train_and_predict_for_address_type(address_type, submission)


  0%|          | 0/23 [00:00<?, ?it/s][A
  4%|▍         | 1/23 [00:57<21:09, 57.72s/it][A
  9%|▊         | 2/23 [01:31<17:42, 50.57s/it][A
 13%|█▎        | 3/23 [01:50<13:42, 41.11s/it][A
 17%|█▋        | 4/23 [02:03<10:20, 32.68s/it][A
 22%|██▏       | 5/23 [02:09<07:22, 24.57s/it][A
 26%|██▌       | 6/23 [02:21<05:55, 20.90s/it][A
 30%|███       | 7/23 [02:47<05:57, 22.34s/it][A
 35%|███▍      | 8/23 [03:13<05:54, 23.63s/it][A
 39%|███▉      | 9/23 [03:58<06:59, 29.99s/it][A
 43%|████▎     | 10/23 [04:23<06:09, 28.45s/it][A
 48%|████▊     | 11/23 [04:55<05:51, 29.32s/it][A
 52%|█████▏    | 12/23 [05:38<06:08, 33.51s/it][A
 57%|█████▋    | 13/23 [06:17<05:51, 35.19s/it][A
 61%|██████    | 14/23 [06:58<05:32, 36.93s/it][A
 65%|██████▌   | 15/23 [07:38<05:03, 37.93s/it][A
 70%|██████▉   | 16/23 [08:06<04:04, 34.90s/it][A
 74%|███████▍  | 17/23 [08:45<03:36, 36.05s/it][A
 78%|███████▊  | 18/23 [09:14<02:50, 34.06s/it][A
 83%|████████▎ | 19/23 [09:57<02:27, 36.82s/it]

In [23]:
# Write the filled-in submission without the index column or a header row.
submission.to_csv(
    'feat3_separate_address_type_logrent_lightgbm_NoCV.csv',
    header=False,
    index=False,
)