## New York City Taxi Trip Duration
https://www.kaggle.com/c/nyc-taxi-trip-duration/data

ありがたいXGBoostの入れ方

http://qiita.com/nazoking@github/items/1a0ab5570da825e9d030

## 学習

In [7]:
# Load the training data from the local CSV file into a DataFrame.
import pandas as pd

train_data = pd.read_csv("./data/train.csv")

In [8]:
# Split into explanatory variables (X) and the target variable (Y).
# The target column is removed by name rather than by position, so the split
# stays correct even if the column order of the CSV changes.
Y = train_data['trip_duration']
X = train_data.drop('trip_duration', axis=1)

In [9]:
# vendor_idをワンホット化
import numpy as np

def toOneHot(target_data):
    """Convert non-negative integer labels into one-hot encoded rows.

    Row ``i`` of the result holds 1.0 in column ``target_data[i]`` and 0.0
    everywhere else.

    Parameters
    ----------
    target_data : array-like of int
        Integer labels (list, ndarray or pandas Series).

    Returns
    -------
    numpy.ndarray
        Float array with one extra trailing axis. Its width is
        ``max(number of distinct labels, largest label) + 1``: identical to the
        historical ``n_unique + 1`` for contiguous labels, but wide enough for
        labels with gaps (e.g. ``[1, 5]``).

    Raises
    ------
    ValueError
        If any label is negative (it would otherwise wrap around and silently
        mark the wrong column).
    """
    labels = np.asarray(target_data)
    if labels.size == 0:
        # Nothing to encode: keep the historical width of 1 for empty input.
        return np.zeros(labels.shape + (1,))
    if labels.min() < 0:
        raise ValueError("toOneHot() requires non-negative integer labels")

    n_labels = len(np.unique(labels))
    # The largest label must be a valid column index, even when labels have gaps.
    width = max(n_labels, int(labels.max())) + 1
    return np.eye(width)[labels]

def toOneHotDataFrame(target_data, label_prefix=''):
    """One-hot encode ``target_data`` and return the result as a DataFrame.

    Parameters
    ----------
    target_data : array-like of int
        Labels to encode; passed straight to ``toOneHot``.
    label_prefix : str, optional
        When non-empty, each column is renamed to ``"<label_prefix>_<n>"``,
        where ``n`` is the column's position. When empty, the default integer
        column labels are kept.

    Returns
    -------
    pandas.DataFrame
        One row per input label. If ``target_data`` is a pandas Series its
        index is reused, so a later ``pd.concat(..., axis=1)`` lines the rows
        up with the frame the labels came from.
    """
    oneHotData = toOneHot(target_data)
    # Only a Series carries row labels worth keeping (a plain list also has an
    # ``index`` attribute, but that is a method, so check the type explicitly).
    row_index = target_data.index if isinstance(target_data, pd.Series) else None
    oneHotDataFrame = pd.DataFrame(oneHotData, index=row_index)
    # Rename the columns. A concrete list is used because a lazy ``map`` object
    # is not accepted as column labels by every pandas version on Python 3.
    if label_prefix != '':
        oneHotDataFrame.columns = [
            '{}_{}'.format(label_prefix, column) for column in oneHotDataFrame.columns
        ]
    return oneHotDataFrame

In [10]:
# Column to one-hot encode; its name is also the prefix of the generated columns.
label_prefix = 'vendor_id'

# Explanatory variables that are used as-is.
X_target = X.loc[
    :,
    [
        'passenger_count',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
]

# One-hot encode vendor_id and append the encoded columns to the selected features.
oneHotVendorIdDataFrame = toOneHotDataFrame(
    target_data=X[label_prefix],
    label_prefix=label_prefix,
)
X_selected = pd.concat(
    [X_target, oneHotVendorIdDataFrame],
    axis=1,
)

In [33]:
# Split into a training set and a validation set.
# test_size=0.2 holds out 20% of the rows; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=0)

In [34]:
# 予測誤差の計算用
# 　https://www.kaggle.com/marknagelberg/rmsle-function
import math

# Root Mean Squared Logarithmic Error (RMSLE).
def rmsle(y, y_pred):
    """Return the Root Mean Squared Logarithmic Error between ``y`` and ``y_pred``.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))``.

    Both inputs are converted to plain float arrays first, so elements are
    always paired by position. Indexing a pandas Series with ``y[i]`` is a
    label lookup and would pair the wrong rows (or raise KeyError) when the
    Series does not carry a 0..n-1 index.

    Parameters
    ----------
    y : array-like of float
        Ground-truth values, each greater than -1.
    y_pred : array-like of float
        Predicted values, each greater than -1; same length as ``y``.

    Returns
    -------
    float
        The RMSLE.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    ValueError
        If the inputs are empty, or any value is <= -1 (logarithm undefined).
    """
    assert len(y) == len(y_pred)
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        raise ValueError("rmsle() requires at least one sample")
    # math.log raised ValueError for these inputs; keep failing loudly instead
    # of letting numpy return nan / -inf.
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("math domain error: rmsle() inputs must be greater than -1")

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [39]:
from hyperopt import fmin, tpe, hp, rand
from xgboost import XGBRegressor

# Hyperparameter search space handed to hyperopt.
parameters = {
    # Controls for model complexity
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),

    # Subsampling, to improve robustness against noise
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),

    # Fixed value (not searched): do not show system messages
    'silent': 1,
}

def learningFunction(params):
    """Objective function for hyperopt.

    Fits an ``XGBRegressor`` built from ``params`` on ``X_train`` / ``y_train``
    and returns the RMSLE of its predictions on ``X_test`` / ``y_test``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``XGBRegressor``.

    Returns
    -------
    The value returned by ``rmsle`` for the validation set (lower is better).
    """
    # Train
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Predict; the row labels are reset so that features and targets are both
    # numbered 0..n-1.
    features = X_test.reset_index(drop=True)
    targets = y_test.reset_index(drop=True)
    predictions = model.predict(features)

    # Stop-gap: flip the sign of any negative prediction (in place).
    negative = predictions < 0
    predictions[negative] = -predictions[negative]

    # Prediction error
    return rmsle(targets, predictions)

# Let hyperopt pick good parameters automatically: 100 evaluations of the
# objective, guided by the TPE algorithm.
best = fmin(
    fn=learningFunction,
    space=parameters,
    algo=tpe.suggest,
    max_evals=100,
)

In [40]:
# Display the result returned by fmin (the selected hyperparameter values).
best

{'colsample_bytree': 0.9500000000000001,
 'gamma': 0.9500000000000001,
 'min_child_weight': 4.0,
 'subsample': 0.6000000000000001}

In [41]:
# Build a model from the best parameters found and train it on all of the
# training data (training + validation rows).
xgbr = XGBRegressor(**best)
xgbr.fit(X_selected, Y)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.95,
       gamma=0.95, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=4.0, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.6)

## 予測

In [42]:
# Load the test data from the local CSV file into a DataFrame.
test_data = pd.read_csv("./data/test.csv")

In [43]:
# Select the same explanatory variables that were used for training.
test_data_target = test_data.loc[:,['passenger_count','pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
oneHotVendorIdDataFrame_test_data = toOneHotDataFrame(test_data[label_prefix], label_prefix)
# Append the one-hot columns, then align to the training feature columns.
# The one-hot width is derived from the labels present in test.csv, so it can
# differ from training: columns missing here are added as all-zero, columns
# the model never saw are dropped, and the order is made identical.
test_data_selected = pd.concat(
    [test_data_target, oneHotVendorIdDataFrame_test_data], axis=1
).reindex(columns=X_selected.columns, fill_value=0.0)

In [44]:
# Predict.
# Negative durations are sign-flipped exactly as in learningFunction, so the
# submitted values are handled the same way as those scored during validation
# (and log(duration + 1) stays defined for the RMSLE metric).
result = np.abs(xgbr.predict(test_data_selected))

In [56]:
# Convert to the submission format: one row per trip with its id and prediction.
# The frame is built column-wise so each column keeps its own dtype; building
# it from a list of rows and transposing would turn both columns into object dtype.
id_list = test_data['id']
result_dataFrame = pd.DataFrame(
    {'id': id_list, 'trip_duration': result},
    columns=['id', 'trip_duration'],
)

In [58]:
# Write the submission file, named with the current timestamp.
from datetime import datetime as dt

tdatetime = dt.now()
tstr = tdatetime.strftime('%Y%m%d_%H%M')
# index=False: the file must contain only the id and trip_duration columns;
# without it pandas prepends an unnamed row-index column.
result_dataFrame.to_csv('submission_{}.csv'.format(tstr), index=False)