In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.6/98.6 MB 6.4 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# Random seed; passed as random_state to the KFold splitter further down.
SEED = 12345

# Feature table to load (read from INPUT_PATH_2) and the name of the
# submission CSV to produce (written under OUTPUT_PATH).
INPUT_FILE = "data_v10_3.csv"
OUTPUT_FILE = "sub_catboost_v10_3_2.csv"

In [3]:
# Mount Google Drive so the project folders below are reachable from the
# Colab runtime.
from google.colab import drive
drive.mount("/content/drive/")
# NOTE(review): INPUT_PATH_1 is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
INPUT_PATH_1 = "/content/drive/My Drive/HousePrices/input/"
# Folder the feature file (INPUT_FILE) is read from.
INPUT_PATH_2 = "/content/drive/My Drive/HousePrices/features/"
# Folder the submission CSV (OUTPUT_FILE) is written to.
OUTPUT_PATH = "/content/drive/My Drive/HousePrices/prediction/"

Mounted at /content/drive/


In [4]:
# Load the feature table; it holds train and test rows together,
# distinguished by the train_test column.
features_csv = INPUT_PATH_2 + INPUT_FILE
df = pd.read_csv(features_csv)
print(df.shape)
df.head(3)

(2917, 72)


Unnamed: 0,1stFlrSF_BY_Neighborhood,GarageCars,TotalBsmtSF_BY_OverallCond,MoSold,1stFlrSF_BY_OverallCond,train_test,Neighborhood_OverallCond,Neighborhood_YearBuilt,GrLivArea_BY_OverallCond,Id,...,LotArea,TotalBsmtSF_BY_YearBuilt,YearRemodAdd_BY_OverallCond,YearBuilt,GarageYrBlt,2ndFlrSF,LotShape,MasVnrArea_BY_OverallCond,BsmtFinType1,GarageArea_BY_OverallQual
0,-83,2.0,136,7,21,1,-21066,-19094,-247,784,...,9101,58,5,1978,1978,0,0.0,28,2,217
1,0,2.0,-96,5,85,3,-15530,-13558,-183,2171,...,12250,-174,5,1978,1978,0,0.0,104,0,143
2,-281,2.0,-72,6,-176,1,-21066,-19094,-445,361,...,7540,-150,5,1978,1978,0,0.0,-75,2,33


In [5]:
# Training rows are those flagged 1 or 2 in train_test. The flag itself,
# the target and the Id are not features, so they are removed from x_train.
is_train = df['train_test'].isin([1, 2])
train_rows = df.loc[is_train]
x_train = train_rows.drop(columns=['train_test', 'SalePrice', 'Id'])
# The model is fitted on the natural log of the sale price.
y_train = np.log(train_rows['SalePrice'])
print(x_train.shape)
print(y_train.shape)
x_train.head(3)

(1458, 69)
(1458,)


Unnamed: 0,1stFlrSF_BY_Neighborhood,GarageCars,TotalBsmtSF_BY_OverallCond,MoSold,1stFlrSF_BY_OverallCond,Neighborhood_OverallCond,Neighborhood_YearBuilt,GrLivArea_BY_OverallCond,BsmtFinSF1,Neighborhood_OverallQual,...,LotArea,TotalBsmtSF_BY_YearBuilt,YearRemodAdd_BY_OverallCond,YearBuilt,GarageYrBlt,2ndFlrSF,LotShape,MasVnrArea_BY_OverallCond,BsmtFinType1,GarageArea_BY_OverallQual
0,-83,2.0,136,7,21,-21066,-19094,-247,1097,-21067,...,9101,58,5,1978,1978,0,0.0,28,2,217
2,-281,2.0,-72,6,-176,-21066,-19094,-445,773,-21066,...,7540,-150,5,1978,1978,0,0.0,-75,2,33
3,-147,2.0,-20,10,-62,-15530,-13558,-331,505,-15530,...,10970,-98,5,1978,1981,0,0.0,-75,2,139


In [6]:
#x_train.info()

In [7]:
# 5-fold cross-validation of a CatBoost regressor on the log-transformed target.
# The fitted model of every fold is kept in model_list (they are all used for
# the submission later); per-fold validation RMSE and R^2 are collected in
# rmse_list and r2_list. Both scores are on the log scale because y_train is
# log(SalePrice).
model_list = []
rmse_list = []
r2_list = []
# Columns handed to CatBoost as categorical features.
cat_cols = ['Neighborhood','MoSold','YearBuilt']

# Plain (non-stratified) shuffled K-fold, despite the variable name.
skf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for idx_train, idx_valid in skf.split(x_train, y_train):
    x_cv = x_train.iloc[idx_train]
    y_cv = y_train.iloc[idx_train]
    x_valid = x_train.iloc[idx_valid]
    y_valid = y_train.iloc[idx_valid]
    # NOTE(review): the validation fold doubles as the early-stopping eval_set,
    # so the scores below are measured on the same data that picked the
    # stopping iteration and may be slightly optimistic.
    model = CatBoostRegressor(iterations=1000, learning_rate=0.01, cat_features=cat_cols, early_stopping_rounds=10, verbose=False)
    model.fit(x_cv, y_cv, eval_set=[(x_valid, y_valid)])
    pred_valid = model.predict(x_valid)
    model_list.append(model)
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form gives the same value on every version.
    rmse_list.append(np.sqrt(mean_squared_error(y_valid, pred_valid)))
    r2_list.append(r2_score(y_valid, pred_valid))

In [8]:
# Report the mean and standard deviation of each metric across the CV folds.
cv_report = [
    ("rmse(cv)：avg={}", np.mean(rmse_list)),
    ("rmse(cv)：std={}", np.std(rmse_list)),
    ("r2(cv)：avg={}", np.mean(r2_list)),
    ("r2(cv)：std={}", np.std(r2_list)),
]
for template, value in cv_report:
    print(template.format(value))

rmse(cv)：avg=0.12266601065989184
rmse(cv)：std=0.006110174503074167
r2(cv)：avg=0.9052362254331981
r2(cv)：std=0.007012560042117804


In [9]:
# Submission features: rows flagged 3 in train_test, with the same three
# non-feature columns removed that were removed from the training matrix.
is_test = df['train_test'] == 3
df_sub = df.loc[is_test].drop(columns=['train_test', 'Id', 'SalePrice'])
print(df_sub.shape)
df_sub.head(3)

(1459, 69)


Unnamed: 0,1stFlrSF_BY_Neighborhood,GarageCars,TotalBsmtSF_BY_OverallCond,MoSold,1stFlrSF_BY_OverallCond,Neighborhood_OverallCond,Neighborhood_YearBuilt,GrLivArea_BY_OverallCond,BsmtFinSF1,Neighborhood_OverallQual,...,LotArea,TotalBsmtSF_BY_YearBuilt,YearRemodAdd_BY_OverallCond,YearBuilt,GarageYrBlt,2ndFlrSF,LotShape,MasVnrArea_BY_OverallCond,BsmtFinType1,GarageArea_BY_OverallQual
1,0,2.0,-96,5,85,-15530,-13558,-183,781,-15531,...,12250,-174,5,1978,1978,0,0.0,104,0,143
4,108,3.0,34,4,193,-15530,-13558,-75,595,-15530,...,10385,-43,5,1978,1989,0,0.0,47,0,235
7,359,2.0,466,8,582,8934,10906,313,363,8934,...,10928,388,13,1978,1978,0,3.0,25,3,47


In [10]:
# Output frame: one row per submission row, starting with just the Id.
# Rows and column are selected in a single .loc call and explicitly copied,
# because prediction columns are added to df_out afterwards; assigning into
# the result of chained indexing can trigger pandas' SettingWithCopyWarning.
df_out = df.loc[df['train_test'] == 3, ['Id']].copy()
print(df_out.shape)
df_out.head(3)

(1459, 1)


Unnamed: 0,Id
1,2171
4,2844
7,1639


In [11]:
# Predict with every fold model. The models were fitted on log(SalePrice),
# so each prediction is exponentiated back to the price scale and stored in
# its own pred_<fold> column.
for fold, fold_model in enumerate(model_list):
    log_pred = fold_model.predict(df_sub)
    df_out["pred_{}".format(fold)] = np.exp(log_pred)
df_out.head(3)

Unnamed: 0,Id,pred_0,pred_1,pred_2,pred_3,pred_4
1,2171,156242.059039,154560.47273,156163.429582,155707.556988,156894.433246
4,2844,159867.265126,165591.181982,161074.176655,159243.408216,158139.278611
7,1639,193095.16989,197937.385341,194170.952213,193050.577798,192963.71674


In [12]:
# Final prediction = row-wise arithmetic mean of the per-fold price columns,
# which sit directly after the Id column (positions 1 .. number of models).
fold_preds = df_out.iloc[:, 1:len(model_list) + 1]
df_out['SalePrice'] = fold_preds.mean(axis=1)
# Keep only the two columns that go into the submission file.
df_out = df_out[['Id', 'SalePrice']]
df_out.head(3)

Unnamed: 0,Id,SalePrice
1,2171,155913.590317
4,2844,160783.062118
7,1639,194243.560396


In [13]:
# Write the submission file (Id, SalePrice) without the DataFrame index.
submission_csv = OUTPUT_PATH + OUTPUT_FILE
df_out.to_csv(submission_csv, index=False)
df_out.head(3)

Unnamed: 0,Id,SalePrice
1,2171,155913.590317
4,2844,160783.062118
7,1639,194243.560396
