In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [3]:
train = pd.read_csv('data/train.csv', index_col=0)

X = train.drop('target', axis=1)
y = train.target

In [4]:
from sklearn.decomposition import PCA

pca = PCA(
    copy=True, iterated_power=7, n_components=100, 
    random_state=None, svd_solver='auto', tol=0.0, whiten=False
)
X_pca = pca.fit_transform(X)

In [5]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
X_pca = minmax.fit_transform(X)
y_log = np.log1p(y)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_log, test_size=0.1, random_state=42
)

In [26]:
reg = xgb.XGBRegressor(booster='dart', learning_rate=0.05, n_estimators=500, max_depth=10)
reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [8]:
def rmsle_metric(y_test, y_pred) : 
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test)-1
    y_pred = np.exp(y_pred)-1
    rmsle = np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))
    return ('RMSLE', rmsle, False)

booster gbtree

In [9]:
y_pred = reg.predict(X_test)
print(rmsle_metric(y_test, y_pred))

('RMSLE', 1.4953000504056113, False)


booster gblinear

In [11]:
y_pred = reg.predict(X_test)
print(rmsle_metric(y_test, y_pred))

('RMSLE', 1.6680378883676528, False)


booster dart

In [25]:
y_pred = reg.predict(X_test)
print(rmsle_metric(y_test, y_pred))

('RMSLE', 1.4342862562472183, False)


In [29]:
y_pred = reg.predict(X_test)
print(rmsle_metric(y_test, y_pred))

('RMSLE', 1.431818334461019, False)


In [30]:
from sklearn.externals import joblib

joblib.dump(reg, 'XGBoost-1-431.pkl')

['XGBoost-1-431.pkl']