In [76]:
import catboost
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from catboost import Pool, CatBoostRegressor, CatBoostClassifier, cv
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.pylab import rcParams
from mlxtend.preprocessing import DenseTransformer

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, LabelEncoder


# Plot styling for the whole notebook: ggplot theme, 12x8 figures,
# 12pt font and a white axes background.
plt.style.use('ggplot')
rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.facecolor': 'white',
})

In [2]:
# One-time setup, kept commented out; run it manually if the extension is missing:
# ! pip install jupyternotify
# Load the jupyternotify extension. Later cells start with the %%notify cell magic,
# presumably provided by this extension, so this cell has to run before them.
%load_ext jupyternotify

<IPython.core.display.Javascript object>

# Data Loading

In [66]:
# Read the training CSV and preview its first two rows.
train_csv_path = '../data/housing_usa/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head(n=2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [67]:
# Target is log(1 + SalePrice); the raw price column is then removed from the
# feature frame. Note: this cell rebinds df_train, so it cannot be re-run
# without reloading the data first.
y_train = df_train['SalePrice'].pipe(np.log1p)
df_train = df_train.drop(['SalePrice'], axis='columns')

In [68]:
# Every object-dtype column is treated as a categorical feature.
object_columns = df_train.select_dtypes(include=['object']).columns
cat_features = object_columns.tolist()
# Positional index of each categorical column within df_train.
cat_indices = list(map(df_train.columns.get_loc, cat_features))

# Cast the categorical columns to str before they are handed to the CatBoost Pool.
df_train[cat_features] = df_train[cat_features].astype(str)

In [69]:
# CatBoost training pool; categorical columns are declared by position.
pool_train = Pool(data=df_train, label=y_train, cat_features=cat_indices)

In [64]:
%%notify
model = CatBoostRegressor(learning_rate = 0.1, loss_function = 'RMSE', custom_metric = 'RMSE',  calc_feature_importance = True)

cv_params = model.get_params()
cv_params['logging_level'] = 'Silent'
del cv_params['calc_feature_importance']
cv_data = cv(cv_params, pool_train, fold_count=3, shuffle = True)

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [65]:
# Report the last entry stored under 'RMSE_test_avg'. The target is
# log1p(SalePrice), hence the RMSLE label.
catboost_rmsle = cv_data['RMSE_test_avg'][-1]
print('RMSLE (CatBoost) = {0}'.format(catboost_rmsle))

RMSLE (CatBoost) = 0.198284549786


# XGBoost

In [46]:
# One-hot encode the categorical columns; the result feeds the XGBoost,
# sklearn and "CatBoost on dummy variables" sections below.
df_train_dummy = pd.get_dummies(data=df_train, columns=cat_features)

In [33]:
# XGBoost regressor (RMSE eval metric) behind a DenseTransformer step.
xgboost_steps = [
    ('to_dense', DenseTransformer()),
    ('clf', xgb.XGBRegressor(eval_metric='rmse')),
]
xgboost_clf = Pipeline(steps=xgboost_steps)

In [34]:
# 3-fold cross-validation on the one-hot frame, scored with negated MSE
# and run on all available cores.
xgboost_cv = cross_val_score(
    estimator=xgboost_clf,
    X=df_train_dummy,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=True,
)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.5s finished


In [35]:
# Scores are negated MSE values: flip the sign of their mean and take the root.
xgboost_rmsle = np.sqrt(-xgboost_cv.mean())
print('RMSLE (XGBoost) = {0}'.format(xgboost_rmsle))

RMSLE (XGBoost) = 0.133228328476


# GradientBoostingRegressor

In [36]:
# sklearn gradient boosting: impute missing values, densify, then fit the regressor.
sk_boost_steps = [
    ('replace_nan', Imputer()),
    ('to_dense', DenseTransformer()),
    ('clf', GradientBoostingRegressor()),
]
sk_boost_clf = Pipeline(steps=sk_boost_steps)

In [37]:
# 5-fold cross-validation on the one-hot frame, scored with negated MSE.
# Note that the XGBoost and CatBoost runs above use 3 folds.
sklearn_cv = cross_val_score(
    estimator=sk_boost_clf,
    X=df_train_dummy,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=True,
)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.4s finished


In [38]:
# sklearn_cv holds negated MSE values: flip the sign of their mean and take the root.
# The estimator inside sk_boost_clf is GradientBoostingRegressor, so the label names it.
print('RMSLE (GradientBoostingRegressor) = {0}'.format(np.sqrt(-sklearn_cv.mean())))

RMSLE (GradientBoostingClassifier) = 0.127356724653


# CatBoost on dummy variables

In [50]:
# CatBoost pool built from the one-hot encoded frame; no categorical columns are declared.
pool_train_dummy = Pool(data=df_train_dummy, label=y_train)

In [51]:
%%notify
# Repeat the CatBoost cross-validation on the one-hot encoded pool.
# cv_params is reused from the earlier CatBoost cell, so that cell must have
# run to completion first; fold count and shuffling match that run.
cv_data_dummy = cv(cv_params, pool_train_dummy, fold_count=3, shuffle = True)

<IPython.core.display.Javascript object>

In [53]:
print('RMSLE (CatBoost Dummy Variables) = {0}'.format(cv_data_dummy['RMSE_test_avg'][-1])) 

RMSLE (CatBoost Dummy Variables) = 0.240938230887


# LightGBM 
 
 [usage example](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)

In [70]:
import lightgbm as lgb

In [77]:
# Replace each categorical column with LabelEncoder integer codes, one fit per column.
# This overwrites df_train in place, so the string-valued columns used by the
# earlier sections are gone after this cell runs.
df_train[cat_features] = df_train[cat_features].apply(
    lambda column: LabelEncoder().fit_transform(column)
)

In [81]:
# LightGBM regressor configuration, gathered in one dict for easy tuning.
lgbm_params = dict(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=20,
)
gbm = lgb.LGBMRegressor(**lgbm_params)

In [None]:
# 5-fold cross-validation of the LightGBM regressor on the label-encoded frame,
# scored with negated MSE.
lgb_cv = cross_val_score(
    estimator=gbm,
    X=df_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# lgb_cv comes from cross_val_score, i.e. an array of negated MSE values, one per fold.
# It has no 'RMSE_test_avg' key (that lookup belongs to the CatBoost cv() result),
# so aggregate it the same way as the XGBoost and sklearn sections:
# flip the sign of the mean score and take the square root.
print('RMSLE (LightGBM) = {0}'.format(np.sqrt(-lgb_cv.mean())))