In [164]:
import sys
src_path = "../src/"
if src_path not in sys.path:
    sys.path.append(src_path)

from helpers_module import helpers as hlp


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt


from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, MissingIndicator

import xgboost as xgb
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error,mean_squared_log_error

## Load data

In [165]:
RANDOM_STATE=27

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
test_id_col = test_df['Id']

union_df = pd.concat([train_df, test_df])

train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## Common preprocessing

In [166]:
# drop train rows without target value
train_df.dropna(subset = ['SalePrice'], inplace=True)

target = train_df['SalePrice']
train_df.drop(['SalePrice'], axis=1, inplace=True)


# hard list of columns to drop
columns_to_drop_hard = ['Id']

train_df = train_df.drop(columns_to_drop_hard, axis=1)
test_df = test_df.drop(columns_to_drop_hard, axis=1)


# drop columns by missing threshold
train_df, columns_to_drop = hlp.drop_cols_with_missing_more_threshold(train_df, 0.5)
test_df = test_df.drop(columns_to_drop, axis=1)

print("Dropped: ", columns_to_drop_hard, columns_to_drop)

str_cols = hlp.get_str_cols(train_df)
numeric_cols = hlp.get_numeric_cols(train_df)

feature_names = train_df.columns

Dropped:  ['Id'] ['Alley' 'PoolQC' 'Fence' 'MiscFeature']


## Categorial columns processing

In [167]:
cat=4

# 1. Just drop all categorial columns
if cat == 1:
    train_df = hlp.drop_str_cols(train_df)
    test_df = hlp.drop_str_cols(test_df)

# ------ OR ------

# 2. Impute most freq and encode by simple labels
if cat == 2:
    train_df[str_cols] = hlp.impute(train_df[str_cols], strategy='most_frequent')
    test_df[str_cols] = hlp.impute(test_df[str_cols], strategy='most_frequent')

    train_df[str_cols] = hlp.encode_with_labels(train_df[str_cols])
    test_df[str_cols] = hlp.encode_with_labels(test_df[str_cols])

# ------ OR ------

# 3. Impute most freq and encode by one hot
if cat == 3:
    train_df[str_cols] = hlp.impute(train_df[str_cols], strategy='most_frequent')
    test_df[str_cols] = hlp.impute(test_df[str_cols], strategy='most_frequent')

    train_df, test_df = hlp.encode_with_one_hot([train_df, test_df])

# ------ OR ------

# 4. Encode by simple labels and impute mean of labels
if cat == 4:
    train_df[str_cols] = hlp.encode_with_labels_and_impute(train_df[str_cols], strategy='mean')
    test_df[str_cols] = hlp.encode_with_labels_and_impute(test_df[str_cols], strategy='mean')

## Numerical columns processing

In [168]:
train_df[numeric_cols] = hlp.impute(train_df[numeric_cols], strategy='mean')
test_df[numeric_cols] = hlp.impute(test_df[numeric_cols], strategy='mean')

# min_max_scaler = MinMaxScaler()

# train_df[:] = min_max_scaler.fit_transform(train_df)
# test_df[:] = min_max_scaler.fit_transform(test_df)

## Prepearing for model

In [169]:
X = train_df
y = target

X_test = test_df


X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=RANDOM_STATE)

## Research Model

In [170]:
xgb_params = {
    "learning_rate": 0.05,
    "max_depth": 3
}

## DMatrix notation

# dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
# dvalid = xgb.DMatrix(X_valid, label=y_valid, feature_names=feature_names)

# reserch_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=5, 
#           evals=[(dvalid,'valid')], verbose_eval=True)

# y_train_pred = reserch_model.predict(dtrain)
# y_valid_pred = reserch_model.predict(dvalid)


reserch_model = XGBRegressor(n_estimators=1000, **xgb_params)

reserch_model.fit(X_train, y_train, early_stopping_rounds=5, 
          eval_set=[(X_valid, y_valid)], eval_metric='rmsle', verbose=True)

y_train_pred = reserch_model.predict(X_train)
y_valid_pred = reserch_model.predict(X_valid)

print("RMSLE train: {:.5f}".format(sqrt(mean_squared_log_error(y_train, y_train_pred))))
print("RMSLE: {:.5f}".format(sqrt(mean_squared_log_error(y_valid, y_valid_pred))))
print("R2: {:.5f}".format(r2_score(y_valid, y_valid_pred)))
print("Best: {:.5f}, iter={:d}".format(reserch_model.best_score, reserch_model.best_iteration))

[0]	validation_0-rmsle:2.98979
[1]	validation_0-rmsle:2.31469
[2]	validation_0-rmsle:1.93829
[3]	validation_0-rmsle:1.67534
[4]	validation_0-rmsle:1.47951
[5]	validation_0-rmsle:1.32115
[6]	validation_0-rmsle:1.19327
[7]	validation_0-rmsle:1.08259
[8]	validation_0-rmsle:0.99012
[9]	validation_0-rmsle:0.91098
[10]	validation_0-rmsle:0.84077
[11]	validation_0-rmsle:0.77745
[12]	validation_0-rmsle:0.72210
[13]	validation_0-rmsle:0.67213
[14]	validation_0-rmsle:0.62742
[15]	validation_0-rmsle:0.58678
[16]	validation_0-rmsle:0.54990
[17]	validation_0-rmsle:0.51683
[18]	validation_0-rmsle:0.48612
[19]	validation_0-rmsle:0.45716
[20]	validation_0-rmsle:0.43131
[21]	validation_0-rmsle:0.40779
[22]	validation_0-rmsle:0.38706
[23]	validation_0-rmsle:0.36705
[24]	validation_0-rmsle:0.34837
[25]	validation_0-rmsle:0.33154
[26]	validation_0-rmsle:0.31617
[27]	validation_0-rmsle:0.30153
[28]	validation_0-rmsle:0.28830
[29]	validation_0-rmsle:0.27615
[30]	validation_0-rmsle:0.26506
[31]	validation_0-

[252]	validation_0-rmsle:0.12728
[253]	validation_0-rmsle:0.12727
[254]	validation_0-rmsle:0.12727
[255]	validation_0-rmsle:0.12728
[256]	validation_0-rmsle:0.12730
[257]	validation_0-rmsle:0.12703
[258]	validation_0-rmsle:0.12697
[259]	validation_0-rmsle:0.12701
[260]	validation_0-rmsle:0.12700
[261]	validation_0-rmsle:0.12698
[262]	validation_0-rmsle:0.12702
[263]	validation_0-rmsle:0.12696
[264]	validation_0-rmsle:0.12696
[265]	validation_0-rmsle:0.12694
[266]	validation_0-rmsle:0.12693
[267]	validation_0-rmsle:0.12698
[268]	validation_0-rmsle:0.12693
[269]	validation_0-rmsle:0.12698
[270]	validation_0-rmsle:0.12702
[271]	validation_0-rmsle:0.12709
[272]	validation_0-rmsle:0.12709
RMSLE train: 0.07833
RMSLE: 0.12693
R2: 0.88199
Best: 0.12693, iter=268


In [171]:
# check by r2 score by cross validation
model_for_cross_val = XGBRegressor(n_estimators=reserch_model.best_iteration, **xgb_params)

rmsle = -cross_val_score(model_for_cross_val, X, y, scoring=('neg_mean_squared_log_error')).mean()

print("RMSLE mean: {:.5f}".format(rmsle))

RMSLE mean: 0.01614


In [172]:
# check by r2 score by cross validation
model_for_cross_val = XGBRegressor(n_estimators=reserch_model.best_iteration, **xgb_params)

r2 = cross_val_score(model_for_cross_val, X, y, scoring=('r2')).mean()

print("R2 mean: {:.5f}".format(r2))

R2 mean: 0.88732


## Train model

In [173]:
model_final = XGBRegressor(n_estimators=reserch_model.best_iteration, **xgb_params)
model_final.fit(X, y)

y_train_pred = model_final.predict(X)

print("RMSLE train: {:.5f}".format(sqrt(mean_squared_log_error(y, y_train_pred))))
print("R2 train: {:.5f}".format(r2_score(y, y_train_pred)))

RMSLE train: 0.08542
R2 train: 0.96821


## Prediction and save result

In [174]:
y_test_pred = model_final.predict(X_test)

output = pd.DataFrame({'Id': test_id_col, 'SalePrice': y_test_pred})
output.to_csv(f'../data/rmsle_{rmsle:.5f}_xgb{reserch_model.best_iteration}_r_Id_th50_cat{cat}_mxtr3.csv', index=False)