In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score

# Seed passed as random_state to the CV splitter and to each LightGBM model below.
SEED = 12345

# File name of the feature table that is read, and of the submission CSV that is written.
INPUT_FILE = "data_v9.csv"
OUTPUT_FILE = "sub_lightgbm_v9_3_5.csv"

In [15]:
# Mount Google Drive in the Colab runtime; every data path below lives under the mount point.
from google.colab import drive
drive.mount("/content/drive/")
# NOTE(review): INPUT_PATH_1 is not referenced by the cells visible here — confirm before removing.
INPUT_PATH_1 = "/content/drive/My Drive/HousePrices/input/"
# Directory the feature CSV (INPUT_FILE) is read from.
INPUT_PATH_2 = "/content/drive/My Drive/HousePrices/features/"
# Directory the submission CSV (OUTPUT_FILE) is written to.
OUTPUT_PATH = "/content/drive/My Drive/HousePrices/prediction/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [16]:
# Load the full feature table; it is split into training and submission rows
# further down via its `train_test` column.
features_csv = INPUT_PATH_2 + INPUT_FILE
df = pd.read_csv(features_csv)
print(df.shape)
df.head(3)

(2919, 42)


Unnamed: 0,Id,GrLivArea,TotalBsmtSF,LotArea,BsmtFinSF1,GarageArea,OverallQual,1stFlrSF,YearBuilt,LotFrontage,...,TotalBsmtSF_BY_YearBuilt,GarageArea_BY_Neighborhood,GarageArea_BY_OverallQual,GarageArea_BY_YearBuilt,1stFlrSF_BY_Neighborhood,1stFlrSF_BY_OverallQual,1stFlrSF_BY_YearBuilt,YearRemodAdd_BY_Neighborhood,YearRemodAdd_BY_OverallQual,YearRemodAdd_BY_YearBuilt
0,784,1110,1097,9101,1097,602,5,1110,1978,74,...,58,100,217,89,-83,40,-74,-7,4,-2
1,1804,941,941,8604,941,564,5,941,1978,75,...,-97,140,179,51,-234,-128,-243,7,4,-2
2,880,864,864,7000,646,336,5,864,1978,70,...,-174,-233,-48,-176,-309,-205,-320,5,31,24


In [17]:
# Training rows are those whose `train_test` flag is 1 or 2.
is_train = df['train_test'].isin([1, 2])
train_rows = df.loc[is_train]

# Features: every column except the split flag, the target and the row identifier.
x_train = train_rows.drop(columns=['train_test', 'SalePrice', 'Id'])
# Target: natural log of SalePrice (predictions are mapped back with exp later on).
y_train = np.log(train_rows['SalePrice'])

print(x_train.shape)
print(y_train.shape)
x_train.head(3)

(1460, 39)
(1460,)


Unnamed: 0,GrLivArea,TotalBsmtSF,LotArea,BsmtFinSF1,GarageArea,OverallQual,1stFlrSF,YearBuilt,LotFrontage,Neighborhood,...,TotalBsmtSF_BY_YearBuilt,GarageArea_BY_Neighborhood,GarageArea_BY_OverallQual,GarageArea_BY_YearBuilt,1stFlrSF_BY_Neighborhood,1stFlrSF_BY_OverallQual,1stFlrSF_BY_YearBuilt,YearRemodAdd_BY_Neighborhood,YearRemodAdd_BY_OverallQual,YearRemodAdd_BY_YearBuilt
0,1110,1097,9101,1097,602,5,1110,1978,74,11,...,58,100,217,89,-83,40,-74,-7,4,-2
2,864,864,7000,646,336,5,864,1978,70,5,...,-174,-233,-48,-176,-309,-205,-320,5,31,24
3,1040,1040,8430,616,0,5,1040,1978,60,5,...,1,-569,-384,-512,-133,-29,-144,-21,4,-2


In [18]:
# 5-fold cross-validation of a LightGBM regressor on the log-price target.
# The model fitted on each fold is kept in `model_list` so that the folds can be
# averaged into a single test-set prediction later; per-fold RMSE and R^2 are
# collected in `rmse_list` / `r2_list`.
model_list = []
rmse_list = []
r2_list = []
# Columns LightGBM is told to treat as categorical instead of numeric.
cat_cols = ['Neighborhood', 'MoSold', 'YearBuilt']

# Folds are stratified on Neighborhood (not on the target) and shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for idx_train, idx_valid in skf.split(x_train, x_train['Neighborhood']):
    x_cv, y_cv = x_train.iloc[idx_train], y_train.iloc[idx_train]
    x_valid, y_valid = x_train.iloc[idx_valid], y_train.iloc[idx_valid]

    model = lgb.LGBMRegressor(random_state=SEED, verbose=-1)
    # NOTE: the validation fold drives early stopping and is also the fold that is
    # scored below, so the reported CV metrics lean slightly optimistic.
    model.fit(
        x_cv,
        y_cv,
        eval_set=[(x_valid, y_valid)],
        categorical_feature=cat_cols,
        callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation()],
    )
    pred_valid = model.predict(x_valid)
    model_list.append(model)
    # RMSE is computed as sqrt(MSE): the `squared=False` flag of mean_squared_error
    # was deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
    # works on every version and yields the same value.
    rmse_list.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))
    r2_list.append(r2_score(y_valid, pred_valid))

New categorical_feature is ['MoSold', 'Neighborhood', 'YearBuilt']


[1]	valid_0's l2: 0.134809
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.115338
[3]	valid_0's l2: 0.0995675
[4]	valid_0's l2: 0.0862057
[5]	valid_0's l2: 0.0748179
[6]	valid_0's l2: 0.0656229
[7]	valid_0's l2: 0.0576664
[8]	valid_0's l2: 0.0512315
[9]	valid_0's l2: 0.0459913
[10]	valid_0's l2: 0.0418518
[11]	valid_0's l2: 0.03804
[12]	valid_0's l2: 0.0346154
[13]	valid_0's l2: 0.0319486
[14]	valid_0's l2: 0.0296626
[15]	valid_0's l2: 0.0276647
[16]	valid_0's l2: 0.0259126
[17]	valid_0's l2: 0.0246498
[18]	valid_0's l2: 0.0235081
[19]	valid_0's l2: 0.0223855
[20]	valid_0's l2: 0.0216311
[21]	valid_0's l2: 0.0208698
[22]	valid_0's l2: 0.0202743
[23]	valid_0's l2: 0.0196986
[24]	valid_0's l2: 0.0193329
[25]	valid_0's l2: 0.0189821
[26]	valid_0's l2: 0.0185979
[27]	valid_0's l2: 0.0183586
[28]	valid_0's l2: 0.0180621
[29]	valid_0's l2: 0.017828
[30]	valid_0's l2: 0.0175413
[31]	valid_0's l2: 0.0172821
[32]	valid_0's l2: 0.0171466
[33]	valid_0's l2: 0.0170

New categorical_feature is ['MoSold', 'Neighborhood', 'YearBuilt']
New categorical_feature is ['MoSold', 'Neighborhood', 'YearBuilt']


[30]	valid_0's l2: 0.0209388
[31]	valid_0's l2: 0.0208262
[32]	valid_0's l2: 0.0207873
[33]	valid_0's l2: 0.0207763
[34]	valid_0's l2: 0.0207014
[35]	valid_0's l2: 0.0207345
[36]	valid_0's l2: 0.0207752
[37]	valid_0's l2: 0.0208128
[38]	valid_0's l2: 0.0207089
[39]	valid_0's l2: 0.0206272
[40]	valid_0's l2: 0.0206381
[41]	valid_0's l2: 0.0206165
[42]	valid_0's l2: 0.0205542
[43]	valid_0's l2: 0.0205655
[44]	valid_0's l2: 0.0206234
[45]	valid_0's l2: 0.0206116
[46]	valid_0's l2: 0.0206403
[47]	valid_0's l2: 0.0205818
[48]	valid_0's l2: 0.0205585
[49]	valid_0's l2: 0.0206423
[50]	valid_0's l2: 0.0206447
[51]	valid_0's l2: 0.0207281
[52]	valid_0's l2: 0.0206989
Early stopping, best iteration is:
[42]	valid_0's l2: 0.0205542
[1]	valid_0's l2: 0.123114
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.103904
[3]	valid_0's l2: 0.0885787
[4]	valid_0's l2: 0.0752914
[5]	valid_0's l2: 0.0646154
[6]	valid_0's l2: 0.0563899
[7]	valid_0's l2: 0.0494126
[8]	valid_0's 

New categorical_feature is ['MoSold', 'Neighborhood', 'YearBuilt']


[53]	valid_0's l2: 0.0212969
[54]	valid_0's l2: 0.0212439
[55]	valid_0's l2: 0.0212567
[56]	valid_0's l2: 0.0212015
[57]	valid_0's l2: 0.0212232
[58]	valid_0's l2: 0.0212257
[59]	valid_0's l2: 0.0211673
[60]	valid_0's l2: 0.021198
[61]	valid_0's l2: 0.0211812
[62]	valid_0's l2: 0.021213
[63]	valid_0's l2: 0.0211635
[64]	valid_0's l2: 0.0211287
[65]	valid_0's l2: 0.0211461
[66]	valid_0's l2: 0.0211128
[67]	valid_0's l2: 0.0210641
[68]	valid_0's l2: 0.0210539
[69]	valid_0's l2: 0.0210663
[70]	valid_0's l2: 0.0210475
[71]	valid_0's l2: 0.0210878
[72]	valid_0's l2: 0.0211056
[73]	valid_0's l2: 0.0210817
[74]	valid_0's l2: 0.021141
[75]	valid_0's l2: 0.0212118
[76]	valid_0's l2: 0.02119
[77]	valid_0's l2: 0.0211977
[78]	valid_0's l2: 0.0211674
[79]	valid_0's l2: 0.0211429
[80]	valid_0's l2: 0.021171
Early stopping, best iteration is:
[70]	valid_0's l2: 0.0210475
[1]	valid_0's l2: 0.153863
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.131994
[3]	valid_0's l

New categorical_feature is ['MoSold', 'Neighborhood', 'YearBuilt']


[20]	valid_0's l2: 0.0269623
[21]	valid_0's l2: 0.0258394
[22]	valid_0's l2: 0.0250891
[23]	valid_0's l2: 0.0243938
[24]	valid_0's l2: 0.0238572
[25]	valid_0's l2: 0.0234792
[26]	valid_0's l2: 0.0231381
[27]	valid_0's l2: 0.0228435
[28]	valid_0's l2: 0.0226213
[29]	valid_0's l2: 0.0224166
[30]	valid_0's l2: 0.0221574
[31]	valid_0's l2: 0.0218872
[32]	valid_0's l2: 0.0216335
[33]	valid_0's l2: 0.0215309
[34]	valid_0's l2: 0.0213307
[35]	valid_0's l2: 0.0211585
[36]	valid_0's l2: 0.0210657
[37]	valid_0's l2: 0.0209177
[38]	valid_0's l2: 0.0208056
[39]	valid_0's l2: 0.0207545
[40]	valid_0's l2: 0.0207221
[41]	valid_0's l2: 0.0207088
[42]	valid_0's l2: 0.0206381
[43]	valid_0's l2: 0.0205807
[44]	valid_0's l2: 0.0204723
[45]	valid_0's l2: 0.0203724
[46]	valid_0's l2: 0.0204118
[47]	valid_0's l2: 0.0204199
[48]	valid_0's l2: 0.0204026
[49]	valid_0's l2: 0.0204558
[50]	valid_0's l2: 0.0204331
[51]	valid_0's l2: 0.0204688
[52]	valid_0's l2: 0.0204852
[53]	valid_0's l2: 0.0204485
[54]	valid_0's

In [19]:
# Summarise the cross-validation scores: mean and standard deviation across folds.
rmse_avg, rmse_std = np.mean(rmse_list), np.std(rmse_list)
r2_avg, r2_std = np.mean(r2_list), np.std(r2_list)

print("rmse(cv)：avg={}".format(rmse_avg))
print("rmse(cv)：std={}".format(rmse_std))
print("r2(cv)：avg={}".format(r2_avg))
print("r2(cv)：std={}".format(r2_std))

rmse(cv)：avg=0.13567668393500648
rmse(cv)：std=0.010083291018753764
r2(cv)：avg=0.8838871212556138
r2(cv)：std=0.01483257199404251


In [20]:
# Submission features: rows whose `train_test` flag is 3, with the same three
# non-feature columns removed as for the training matrix.
is_test = df['train_test'] == 3
df_sub = df.loc[is_test].drop(columns=['train_test', 'Id', 'SalePrice'])
print(df_sub.shape)
df_sub.head(3)

(1459, 39)


Unnamed: 0,GrLivArea,TotalBsmtSF,LotArea,BsmtFinSF1,GarageArea,OverallQual,1stFlrSF,YearBuilt,LotFrontage,Neighborhood,...,TotalBsmtSF_BY_YearBuilt,GarageArea_BY_Neighborhood,GarageArea_BY_OverallQual,GarageArea_BY_YearBuilt,1stFlrSF_BY_Neighborhood,1stFlrSF_BY_OverallQual,1stFlrSF_BY_YearBuilt,YearRemodAdd_BY_Neighborhood,YearRemodAdd_BY_OverallQual,YearRemodAdd_BY_YearBuilt
1,941,941,8604,941,564,5,941,1978,75,12,...,-97,140,179,51,-234,-128,-243,7,4,-2
4,907,907,16269,625,343,5,907,1978,70,5,...,-131,-226,-41,-169,-266,-162,-277,-21,4,-2
5,1174,864,12250,781,528,5,1174,1978,70,5,...,-174,-41,143,15,0,104,-10,-21,4,-2


In [21]:
# Output frame for the submission: the Id column of the test rows (train_test == 3).
# Selected with a single .loc call instead of chained indexing, and taken as an
# explicit copy because later cells add prediction columns to it.
df_out = df.loc[df['train_test'] == 3, ['Id']].copy()
print(df_out.shape)
df_out.head(3)

(1459, 1)


Unnamed: 0,Id
1,1804
4,1879
5,2171


In [22]:
# Prediction: one column per fold model. The models were fitted on log(SalePrice),
# so each prediction is mapped back to the price scale with exp.
for fold_no, fold_model in enumerate(model_list):
    log_pred = fold_model.predict(df_sub)
    df_out["pred_{}".format(fold_no)] = np.exp(log_pred)
df_out.head(3)

Unnamed: 0,Id,pred_0,pred_1,pred_2,pred_3,pred_4
1,1804,147835.098645,144037.927359,140198.214541,142410.476388,141389.092571
4,1879,132744.425268,133231.597727,136161.41415,130074.288904,141068.714199
5,2171,155317.862615,147985.986522,157095.691178,155282.66009,149992.825472


In [23]:
# Ensemble: SalePrice is the row-wise mean of the per-fold predictions.
# The prediction columns are selected by name rather than by position, so a changed
# column layout (e.g. the prediction cell re-run after this one) cannot silently
# pull the wrong columns into the average; a missing column raises a KeyError instead.
pred_cols = ["pred_{}".format(i) for i in range(len(model_list))]
df_out['SalePrice'] = df_out[pred_cols].mean(axis=1)
# Keep only the two columns that go into the submission file.
df_out = df_out[['Id', 'SalePrice']]
df_out.head(3)

Unnamed: 0,Id,SalePrice
1,1804,143174.161901
4,1879,134656.08805
5,2171,153135.005175


In [24]:
# Write the submission CSV without the DataFrame index, then show the first rows.
submission_csv = OUTPUT_PATH + OUTPUT_FILE
df_out.to_csv(submission_csv, index=False)
df_out.head(3)

Unnamed: 0,Id,SalePrice
1,1804,143174.161901
4,1879,134656.08805
5,2171,153135.005175
