Credit to https://www.kaggle.com/bguberfain/naive-xgb-lb-0-317

In [153]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as sk
import sklearn.preprocessing as skp
import sklearn.metrics as skm
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import cross_val_score, train_test_split

%matplotlib inline

ModuleNotFoundError: No module named 'hyperopt'

In [3]:
# Macro columns to load from macro.csv. The selection comes from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]

In [17]:
# Read the train, test and macro tables; 'timestamp' is parsed as a datetime
# in all three. Only 'timestamp' plus the columns in macro_cols are read from
# macro.csv.
date_cols = ['timestamp']
df_train = pd.read_csv("../data/train.csv", parse_dates=date_cols)
df_test = pd.read_csv("../data/test.csv", parse_dates=date_cols)
df_macro = pd.read_csv(
    "../data/macro.csv",
    parse_dates=date_cols,
    usecols=date_cols + macro_cols,
)

df_train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [5]:
# Histogram (50 bins) of the raw price_doc column.
sale_price = df_train['price_doc']
ax = sale_price.hist(bins=50)

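I want to prevent leakage wherever possible, so I will not be combining my train/test dataframes (the one exception is the categorical columns, which are concatenated further down only so that one-hot encoding produces the same dummy columns for both sets).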

In [22]:
# The target is log(1 + price_doc), as suggested by
# https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130

non_feature_cols = ['id', 'price_doc']
X_train = df_train.drop(non_feature_cols, axis=1)
y_train = np.log1p(df_train['price_doc'].values)

# The test table has no price_doc column to remove, only the id.
X_test = df_test.drop('id', axis=1)

In [23]:
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

# Attach the macro columns with a plain left join. DataFrame.merge(how='left')
# keeps the left-hand rows in their existing order, so the feature rows stay
# aligned with y_train and id_test, which were extracted before this join.
# pd.merge_ordered returns rows sorted by the join key, which would silently
# break that alignment whenever the input is not already sorted by timestamp.
n_train_rows, n_test_rows = len(X_train), len(X_test)
X_train = X_train.merge(df_macro, on='timestamp', how='left')
X_test = X_test.merge(df_macro, on='timestamp', how='left')

# A duplicated timestamp in df_macro would multiply rows; fail loudly if so.
assert len(X_train) == n_train_rows, "macro join changed the number of train rows"
assert len(X_test) == n_test_rows, "macro join changed the number of test rows"

# Taking advice from: https://bigdatascientistblog.wordpress.com/2015/10/02/feature-engineering-with-dates-part-1/

In [35]:
# Adding basic date information
# Not including year-month/year-week counts yet
def add_date_features(frame):
    """Add calendar columns derived from the datetime column 'timestamp'.

    Writes 'year', 'month', 'day', 'weekday' (Monday == 0) and 'week'
    (ISO 8601 week number) onto ``frame`` in place and returns ``frame``.
    """
    stamps = frame['timestamp']
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['day'] = stamps.dt.day
    frame['weekday'] = stamps.dt.weekday
    # isocalendar()[1] is the ISO week number, i.e. the value strftime('%V')
    # yields, but it does not depend on the platform C library supporting %V.
    frame['week'] = stamps.map(lambda date: date.isocalendar()[1])
    return frame


# Same derivation for both sets so their columns cannot drift apart.
X_train = add_date_features(X_train)
X_test = add_date_features(X_test)

In [38]:
# Other feature engineering: two ratio columns, floor / max_floor and
# kitch_sq / full_sq, added to the train and test frames alike.
for frame in (X_train, X_test):
    frame['rel_floor'] = frame['floor'] / frame['max_floor'].astype(float)
    frame['rel_kitch_sq'] = frame['kitch_sq'] / frame['full_sq'].astype(float)

In [39]:
# Remove timestamp column (may overfit the model in train)
for frame in (X_train, X_test):
    frame.drop(['timestamp'], axis=1, inplace=True)

In [78]:
# Deal with categorical values
def _split_by_dtype(frame):
    """Return (non-object columns, object columns) of ``frame`` as independent copies."""
    non_object = frame.select_dtypes(exclude=['object']).copy()
    object_only = frame.select_dtypes(include=['object']).copy()
    return non_object, object_only


X_train_numeric, X_train_obj = _split_by_dtype(X_train)
X_test_numeric, X_test_obj = _split_by_dtype(X_test)

In [79]:
# Turn +/-inf entries into NaN (no rows or columns are dropped here).

infinities = [np.inf, -np.inf]
X_train_numeric = X_train_numeric.replace(infinities, np.nan)
X_test_numeric = X_test_numeric.replace(infinities, np.nan)

In [80]:
# Could also experiment with a scaling and imputing strategies
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement. Prefer the new class and
# fall back to the old one so the cell runs on either side of the change.
try:
    from sklearn.impute import SimpleImputer as MedianImputer
except ImportError:
    MedianImputer = skp.Imputer

imputer = MedianImputer(strategy='median')
scaler = skp.StandardScaler()

# Medians are learned from the training set only and reused for the test set.
X_train_numeric[:] = imputer.fit_transform(X_train_numeric)
X_test_numeric[:] = imputer.transform(X_test_numeric)

In [81]:
# Fit the scaler on the training set only, then apply those same statistics to
# the test set. Re-fitting on the test set would scale the two sets with
# different means/variances, so a given scaled value would not mean the same
# thing in train and test.
X_train_numeric[:] = scaler.fit_transform(X_train_numeric)
X_test_numeric[:] = scaler.transform(X_test_numeric)

In [92]:
# Sanity check: both numeric frames should report the same number of columns.
print(X_train_numeric.shape, X_test_numeric.shape, sep='\n')

(30471, 294)
(7662, 294)


In [94]:
# Concat to get all categories, so train and test end up with the same dummy
# columns.

# Capture the split point once, from the frame that is actually concatenated,
# and use it for both halves of the split below.
n_train_rows = X_train_obj.shape[0]

X_all_obj = pd.concat([X_train_obj, X_test_obj])

X_all_dummies = pd.get_dummies(X_all_obj, drop_first=True)

# Split back
X_train_obj = X_all_dummies.iloc[:n_train_rows, :]
X_test_obj = X_all_dummies.iloc[n_train_rows:, :]

In [95]:
# Put the numeric columns and the dummy-encoded columns back side by side.
train_parts = [X_train_numeric, X_train_obj]
test_parts = [X_test_numeric, X_test_obj]
X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

In [169]:
# Hyper-parameters, forwarded unchanged as keyword arguments to XGBRegressor.
xgb_params = dict(
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    silent=1,
    n_estimators=400,
    nthread=-1,
    reg_alpha=16,
)

boosted_trees = xgb.XGBRegressor(**xgb_params)

In [144]:
# Hold out 20% of the training rows as the early-stopping validation set.
# random_state is fixed so the split, and therefore the validation scores,
# is the same on every run.
X_early_train, X_valid, y_early_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [145]:
# Mean of the 5-fold cross-validation scores. No `scoring` argument is passed,
# so the estimator's own default score is what gets averaged.
cv_scores = cross_val_score(boosted_trees, X_train, y_train, cv=5)
cv_scores.mean()

In [165]:
# Fit on the 80% split and monitor the held-out 20%; training stops once the
# validation metric has not improved for 20 rounds.
validation_sets = [(X_valid, y_valid)]
boosted_trees.fit(
    X=X_early_train,
    y=y_early_train,
    eval_set=validation_sets,
    early_stopping_rounds=20,
)

[0]	validation_0-rmse:14.3621
Will train until validation_0-rmse hasn't improved in 20 rounds.
[1]	validation_0-rmse:13.6451
[2]	validation_0-rmse:12.9637
[3]	validation_0-rmse:12.3167
[4]	validation_0-rmse:11.7021
[5]	validation_0-rmse:11.1182
[6]	validation_0-rmse:10.5638
[7]	validation_0-rmse:10.0371
[8]	validation_0-rmse:9.53655
[9]	validation_0-rmse:9.06129
[10]	validation_0-rmse:8.60983
[11]	validation_0-rmse:8.18107
[12]	validation_0-rmse:7.77393
[13]	validation_0-rmse:7.3872
[14]	validation_0-rmse:7.01994
[15]	validation_0-rmse:6.67126
[16]	validation_0-rmse:6.3398
[17]	validation_0-rmse:6.02529
[18]	validation_0-rmse:5.72649
[19]	validation_0-rmse:5.44276
[20]	validation_0-rmse:5.17346
[21]	validation_0-rmse:4.91747
[22]	validation_0-rmse:4.67464
[23]	validation_0-rmse:4.44405
[24]	validation_0-rmse:4.22515
[25]	validation_0-rmse:4.01744
[26]	validation_0-rmse:3.82013
[27]	validation_0-rmse:3.63309
[28]	validation_0-rmse:3.45543
[29]	validation_0-rmse:3.28677
[30]	validation_0

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=16, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.7)

In [170]:
# Refit on the complete training set (no hold-out, no early stopping).
boosted_trees.fit(X_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='reg:linear', reg_alpha=16, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=0.7)

In [171]:
# Predict for the test features. Despite its name, y_test holds model
# predictions, not ground-truth labels. The model was fit on
# log1p(price_doc), so these values are still on that log scale.
y_test = boosted_trees.predict(X_test)

In [172]:
# Undo the log1p applied to the target. np.expm1 is the exact inverse of
# np.log1p and avoids the rounding error of computing exp(x) - 1 by hand.
# NOTE: this overwrites y_test, so run the cell once per prediction; running
# it twice would apply the inverse transform twice.
y_test = np.expm1(y_test)

In [173]:
# Assemble the submission table (test ids + predicted prices) and write it
# out without the DataFrame index.
submission_columns = {'id': id_test, 'price_doc': y_test}
df_sub = pd.DataFrame(submission_columns)

df_sub.to_csv('second_submission.csv', index=False)