## New York City Taxi Trip Duration
https://www.kaggle.com/c/nyc-taxi-trip-duration/data

## 学習

In [1]:
# Load the training data
import pandas as pd

train_data = pd.read_csv("./data/train.csv")

In [2]:
# Split into explanatory variables (X) and the target variable (Y).
# The target is removed from X by name instead of by position, so X can
# never contain trip_duration even if the CSV column order changes.
X = train_data.drop('trip_duration', axis=1)
Y = train_data['trip_duration']

In [3]:
# vendor_idをワンホット化
import numpy as np

def toOneHot(target_data):
    """One-hot encode integer labels, using each label value as its column index.

    Parameters
    ----------
    target_data : array-like of int
        Label values; each value selects the column that is set to 1.

    Returns
    -------
    numpy.ndarray
        Float array with one row per input label. The number of columns is
        ``max(number of distinct labels + 1, largest label + 1)``; the first
        term keeps the historical width, the second guarantees every label
        value is a valid column index.
    """
    labels = np.asarray(target_data)
    n_labels = len(np.unique(labels))
    width = n_labels + 1
    if labels.size:
        # Non-contiguous labels (e.g. [5, 7]) have values larger than the
        # number of distinct labels; widen the matrix so indexing cannot fail.
        width = max(width, int(labels.max()) + 1)
    return np.eye(width)[labels]

def toOneHotDataFrame(target_data, label_prefix=''):
    """One-hot encode ``target_data`` and return the result as a DataFrame.

    Parameters
    ----------
    target_data : array-like of int
        Labels to encode; passed straight to ``toOneHot``.
    label_prefix : str, optional
        When non-empty, every column is renamed to ``'<label_prefix>_<n>'``
        where ``n`` is the original integer column label.

    Returns
    -------
    pandas.DataFrame
        One row per input label. If ``target_data`` is a pandas Series its
        index is reused, so the result lines up row-for-row when it is
        concatenated column-wise with the frame the Series came from.
    """
    oneHotData = toOneHot(target_data)
    # Only a Series carries row labels worth keeping; anything else gets the
    # default 0..n-1 index, as before.
    row_index = target_data.index if isinstance(target_data, pd.Series) else None
    oneHotDataFrame = pd.DataFrame(oneHotData, index=row_index)
    # Rename the columns
    if label_prefix != '':
        oneHotDataFrame.columns = [
            '{}_{}'.format(label_prefix, column) for column in oneHotDataFrame.columns
        ]
    return oneHotDataFrame

In [4]:
# Column that gets one-hot encoded
label_prefix = 'vendor_id'

# Explanatory variables that are passed to the model unchanged
X_target = X.loc[
    :,
    [
        'passenger_count',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ],
]

# One-hot encode vendor_id, then append the indicator columns to the
# selected features
oneHotVendorIdDataFrame = toOneHotDataFrame(X[label_prefix], label_prefix)
X_selected = pd.concat(
    [X_target, oneHotVendorIdDataFrame],
    axis=1,
)

In [5]:
# Split into training data and validation data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=0)

In [6]:
# 予測誤差の計算用
# 　https://www.kaggle.com/marknagelberg/rmsle-function
import math

# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between ``y`` and ``y_pred``.

    Both inputs are flattened and compared position by position, so a pandas
    Series is handled correctly whatever its index is, and an ``(n, 1)``
    prediction array is accepted as well as a 1-D one.

    Parameters
    ----------
    y : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values; must contain as many elements as ``y``.

    Returns
    -------
    float
        ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))``.

    Raises
    ------
    AssertionError
        If the inputs do not contain the same number of elements.
    ValueError
        If the inputs are empty, or any value is <= -1 (the logarithm is
        undefined there).
    """
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    assert actual.shape == predicted.shape
    if actual.size == 0:
        raise ValueError("rmsle() requires at least one sample")
    # Fail loudly instead of letting log1p return nan / -inf.
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("math domain error")
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [12]:
# hyperoptのドキュメント
# https://github.com/hyperopt/hyperopt/wiki/FMin
from hyperopt import fmin, tpe, hp, rand
import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Hyperparameter search space explored by hyperopt
parameters = dict(
    choice=hp.choice(
        'num_layers',
        [
            dict(layers='two'),
            dict(
                layers='three',
                units3=hp.uniform('units3', 64, 1024),
                dropout3=hp.uniform('dropout3', .25, .75),
            ),
        ],
    ),
    units1=hp.uniform('units1', 64, 1024),
    units2=hp.uniform('units2', 64, 1024),
    dropout1=hp.uniform('dropout1', .25, .75),
    dropout2=hp.uniform('dropout2', .25, .75),
    batch_size=hp.randint('batch_size', 8, 10),
    nb_epochs=100,
    optimizer=hp.choice('optimizer', ['adadelta', 'adam', 'rmsprop', 'sgd']),
    activation='relu',
)

def regression_model(params, input_dim=8):
    """Build and compile a Keras regression network from a hyperparameter dict.

    Parameters
    ----------
    params : dict or None
        Hyperparameters. Recognised keys (all optional):
        ``units1`` / ``units2`` (hidden layer widths, cast to int),
        ``dropout1`` / ``dropout2`` (dropout rate after the matching layer),
        ``choice`` (dict; when ``choice['layers'] == 'three'`` a third hidden
        layer is added from ``choice['units3']`` / ``choice['dropout3']``),
        ``activation`` and ``optimizer``.
        With no keys present the result is the previous fixed network: one
        13-unit relu layer trained with SGD.
    input_dim : int, optional
        Number of input features. Defaults to 8.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the mean squared logarithmic error loss.
    """
    # Local import: the notebook's import cell only brings in Dense.
    from keras.layers import Dropout

    params = params or {}
    activation = params.get('activation', 'relu')

    # (width, dropout rate) for each hidden layer, in order.
    hidden_layers = [(params.get('units1', 13), params.get('dropout1'))]
    if 'units2' in params:
        hidden_layers.append((params['units2'], params.get('dropout2')))
    layer_choice = params.get('choice') or {}
    if layer_choice.get('layers') == 'three':
        hidden_layers.append((layer_choice['units3'], layer_choice.get('dropout3')))

    # create model
    model = Sequential()
    for position, (units, dropout_rate) in enumerate(hidden_layers):
        # Only the first layer declares the input size.
        extra = {'input_dim': input_dim} if position == 0 else {}
        # hp.uniform yields floats; a layer width has to be an integer.
        model.add(Dense(int(units), kernel_initializer='normal', activation=activation, **extra))
        if dropout_rate is not None:
            model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_logarithmic_error', optimizer=params.get('optimizer', 'sgd'))
    return model

def learningFunction(params):
    """Objective function for ``hyperopt.fmin``.

    Trains a model built by ``regression_model(params)`` on the training
    split and returns its RMSLE on the validation split (lower is better).

    Parameters
    ----------
    params : dict
        One sample from the search space; must contain ``nb_epochs`` and
        ``batch_size``.

    Returns
    -------
    float
        Validation RMSLE of the freshly trained model.
    """
    print(params)
    # Train
    estimator = regression_model(params)
    estimator.fit(
        X_train.values,
        y_train.values,
        epochs=params['nb_epochs'],
        # The sampled value is not guaranteed to be a plain int (a recorded
        # run shows a float), so coerce it before handing it to Keras.
        batch_size=int(params['batch_size']),
        verbose=0,
        callbacks=[],
    )

    # Predict with the model that was just trained, flattened to 1-D
    y_pred = np.ravel(estimator.predict(X_test.values))

    # Negative predictions are sign-flipped for now
    y_pred = np.abs(y_pred)

    # Compute the prediction error; the index is reset so the target lines up
    # positionally with the predictions
    return rmsle(y_test.reset_index(drop=True), y_pred)

# Let hyperopt pick the best hyperparameters automatically
best = fmin(
    fn=learningFunction,
    space=parameters,
    algo=tpe.suggest,
    max_evals=200,
)

{'units1': 507.5217219183247, 'units2': 933.761473186711, 'optimizer': 'sgd', 'activation': 'relu', 'batch_size': 8.841005140854715, 'choice': {'layers': 'two'}, 'nb_epochs': 100, 'dropout2': 0.3487734566527282, 'dropout1': 0.4870233754154215}


KeyboardInterrupt: 

In [None]:
# Display the result returned by fmin
best

In [None]:
# Retrain the best model on the full training set.
# fmin reports hp.choice parameters as indices into their option lists, so
# space_eval is used to turn `best` back into concrete parameter values.
from hyperopt import space_eval

best_params = space_eval(parameters, best)
estimator = regression_model(best_params)
estimator.fit(
    X_selected.values,
    Y.values,
    epochs=best_params['nb_epochs'],
    batch_size=int(best_params['batch_size']),
    verbose=0,
)

## 予測

In [29]:
# Load the test data
test_data = pd.read_csv("./data/test.csv")

In [30]:
# Explanatory variables that are passed to the model unchanged
test_data_target = test_data.loc[
    :,
    [
        'passenger_count',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ],
]

# One-hot encode vendor_id, then append the indicator columns to the
# selected features
oneHotVendorIdDataFrame_test_data = toOneHotDataFrame(test_data[label_prefix], label_prefix)
test_data_selected = pd.concat(
    [test_data_target, oneHotVendorIdDataFrame_test_data],
    axis=1,
)

In [33]:
# Predict. `.values` is used because DataFrame.as_matrix() no longer exists
# in current pandas releases.
result = estimator.predict(test_data_selected.values)

In [36]:
# Negative predictions are sign-flipped for now. Done in one vectorised call
# instead of a Python loop over every prediction.
result = np.abs(result)

In [37]:
# Convert to the submission format: one `id` column and one `trip_duration`
# column. Building the frame column-wise keeps each column's own dtype, and
# ravel() accepts predictions shaped either (n,) or (n, 1).
id_list = test_data['id']
result_dataFrame = pd.DataFrame(
    {'id': id_list.values, 'trip_duration': np.ravel(result)},
    columns=['id', 'trip_duration'],
)

In [38]:
# Write the submission file; the current timestamp is embedded in its name
from datetime import datetime as dt

tdatetime = dt.now()
tstr = tdatetime.strftime('%Y%m%d_%H%M')
submission_path = '{}_submission_{}.csv'.format('keras_regressor', tstr)
result_dataFrame.to_csv(submission_path, index=False)