## New York City Taxi Trip Duration
https://www.kaggle.com/c/nyc-taxi-trip-duration/data

## 学習

In [1]:
# Load the training data (expects ./data/train.csv relative to the notebook).
import pandas as pd

train_data = pd.read_csv("./data/train.csv")

In [2]:
# Split into explanatory variables (X) and the target variable (Y).
# The target is removed by name instead of by position, so a change in the
# CSV column order cannot silently leave trip_duration inside X.
X = train_data.drop('trip_duration', axis=1)
Y = train_data['trip_duration']

In [3]:
# vendor_idをワンホット化
import numpy as np

def toOneHot(target_data):
    """One-hot encode integer labels, using the label value as column position.

    Row ``i`` of the result contains a single 1.0 in column ``target_data[i]``.

    The matrix is at least ``n_unique + 1`` columns wide, which keeps the
    layout for labels numbered ``1..n`` (column 0 is then always zero). It is
    widened when the largest label would otherwise fall outside the matrix,
    e.g. for a sparse label set such as ``{0, 5}``, which previously raised
    ``IndexError``.

    Args:
        target_data: 1-D array-like of non-negative integer labels.

    Returns:
        numpy.ndarray of floats with shape ``(len(target_data), width)``.
        Empty input yields an array of shape ``(0, 1)``.
    """
    labels = np.asarray(target_data)
    if labels.size == 0:
        # np.asarray([]) is float-typed and cannot be used as an index.
        return np.zeros((0, 1))
    n_labels = len(np.unique(labels))
    # Every label value must be a valid row of the identity matrix.
    width = max(n_labels + 1, int(labels.max()) + 1)
    return np.eye(width)[labels]

def toOneHotDataFrame(target_data, label_prefix=''):
    """Return the one-hot encoding of ``target_data`` as a DataFrame.

    Args:
        target_data: 1-D labels, forwarded unchanged to ``toOneHot``.
        label_prefix: when non-empty, each column is renamed to
            ``'<label_prefix>_<position>'``; otherwise the integer column
            positions are kept as names.

    Returns:
        pandas.DataFrame with one row per input label. When ``target_data``
        is a pandas Series (or DataFrame) its index is reused for the rows,
        so a later ``pd.concat(..., axis=1)`` with the source frame lines up
        row by row even if that frame does not have a default RangeIndex.
        For inputs with a default RangeIndex the result is unchanged.
    """
    oneHotData = toOneHot(target_data)
    # Only pandas objects carry row labels; a plain list's `.index` is a method.
    if isinstance(target_data, (pd.Series, pd.DataFrame)):
        row_index = target_data.index
    else:
        row_index = None
    oneHotDataFrame = pd.DataFrame(oneHotData, index=row_index)
    # Rename the columns; a concrete list is assigned rather than a lazy map.
    if label_prefix != '':
        oneHotDataFrame.columns = [
            '{}_{}'.format(label_prefix, position)
            for position in oneHotDataFrame.columns
        ]
    return oneHotDataFrame

In [4]:
# Column that gets one-hot encoded; its name is also used as the prefix of
# the generated indicator columns.
label_prefix = 'vendor_id'

# Select the explanatory variables that are used as-is.
X_target = X[[
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]]

# One-hot encode vendor_id and append the indicator columns on the right.
oneHotVendorIdDataFrame = toOneHotDataFrame(X[label_prefix], label_prefix=label_prefix)
X_selected = pd.concat([X_target, oneHotVendorIdDataFrame], axis=1)

In [5]:
# Split into training data and validation data (20% held out, fixed random_state=0).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_selected, Y, test_size=0.2, random_state=0)

In [6]:
# 予測誤差の計算用
# 　https://www.kaggle.com/marknagelberg/rmsle-function
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    """Root Mean Squared Logarithmic Error between targets and predictions.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))``.

    Both inputs are converted to flat float arrays and compared by position.
    A pandas Series with a non-default index and an ``(n, 1)``-shaped
    prediction array are therefore handled the same way as plain lists.

    Args:
        y: array-like of true values, each greater than -1.
        y_pred: array-like of predicted values, each greater than -1.

    Returns:
        The error as a Python float.

    Raises:
        AssertionError: if the two inputs differ in length.
        ValueError: if the inputs are empty, or if any value is <= -1
            (the logarithm would be undefined).
    """
    assert len(y) == len(y_pred)
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError('rmsle() requires at least one sample')
    # Fail loudly instead of letting log1p return nan/-inf.
    if np.any(actual <= -1) or np.any(predicted <= -1):
        raise ValueError('math domain error')
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [7]:
# Callback configuration.
# `filename` names this experiment; the log directory below is derived from it.
filename = 'keras_regressor'
log_filepath = './log_files/{}'.format(filename)

import os

def make_my_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the directory between the
    check and the creation.

    Args:
        directory: path of the directory to create.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

from keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
import numpy as np
# TensorBoard: logs are written to <log_filepath>/tensor_board.
tensor_board_path = os.path.join(log_filepath,'tensor_board')
make_my_dir(tensor_board_path)
# NOTE(review): histogram_freq=1 enables histogram logging; some Keras versions
# reportedly require validation data in fit() for this - confirm against the
# installed version.
tb_cb = TensorBoard(log_dir=tensor_board_path, histogram_freq=1)

# ModelCheckpoint: weight files go to <log_filepath>/weights; the file name
# template embeds the epoch number and the loss value.
weights_path = os.path.join(log_filepath,'weights')
make_my_dir(weights_path)
weights_file_name = os.path.join(weights_path,'weights.{epoch:02d}-{loss:.2f}.hdf5')
# Monitors the training loss ('loss') with save_best_only=True.
# NOTE(review): this one callback instance is shared by every training run in
# the notebook; presumably its record of the best loss carries over between
# runs - confirm.
cp_cb = ModelCheckpoint(filepath = weights_file_name, monitor='loss', verbose=0, save_best_only=True, mode='auto')

# LearningRateScheduler
def make_lr_cb(nb_epoch = 200):
    """Build a LearningRateScheduler that decays the rate from 1e-2 to 1e-4.

    The schedule holds ``nb_epoch`` logarithmically spaced values; epoch ``k``
    (0-based) uses the ``k``-th value. Epochs at or beyond ``nb_epoch`` reuse
    the final value instead of raising ``IndexError``, so a training run that
    is longer than the schedule keeps going at the lowest rate.

    Args:
        nb_epoch: number of epochs the decay is spread over (must be >= 1).

    Returns:
        A ``LearningRateScheduler`` callback.

    Raises:
        ValueError: if ``nb_epoch`` is smaller than 1.
    """
    if nb_epoch < 1:
        raise ValueError('nb_epoch must be >= 1, got {}'.format(nb_epoch))
    learning_rates = np.logspace(-2, -4, nb_epoch)
    last_epoch = nb_epoch - 1

    def schedule(epoch):
        # Clamp so that epochs past the end of the table stay valid.
        return float(learning_rates[min(epoch, last_epoch)])

    return LearningRateScheduler(schedule)

Using TensorFlow backend.


In [10]:
# hyperoptのドキュメント
# https://github.com/hyperopt/hyperopt/wiki/FMin
from hyperopt import fmin, tpe, hp, rand
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers.normalization import BatchNormalization

# Hyperparameter search space.
# Entries built with hp.choice / hp.uniform are searched; plain values
# ('nb_epochs', 'activation') are fixed.
parameters = {
            # Candidate integers from 64 to 1024 inclusive.
            'units1': hp.choice('units1', np.arange(64, 1024+1, dtype=int)),
            'units2': hp.choice('units2', np.arange(64, 1024+1, dtype=int)),

            # Sampled uniformly between 0.25 and 0.75.
            'dropout1': hp.uniform('dropout1', .25,.75),
            'dropout2': hp.uniform('dropout2',  .25,.75),

            # Candidate integers from 8 to 128 inclusive.
            'batch_size' : hp.choice('batch_size', np.arange(8, 128+1, dtype=int)),

            'nb_epochs' :  100,
            'optimizer': hp.choice('optimizer',['adadelta','adam','rmsprop','sgd']),
            'activation': 'relu'
        }

def regression_model(params):
    """Build and compile the two-hidden-layer Keras regression network.

    Args:
        params: dict of hyperparameters.
            'units1', 'units2': widths of the two hidden Dense layers.
            'activation': activation of the hidden layers.
            'optimizer': optimizer name passed to ``compile``.
            'dropout1', 'dropout2' (optional): dropout rate applied after the
                first / second hidden block; no Dropout layer is added when
                the key is absent.
            'input_dim' (optional): number of input features, default 8
                (the five numeric columns plus three vendor_id indicators).

    Returns:
        A compiled ``Sequential`` model with mean squared logarithmic error
        as its loss.
    """
    model = Sequential()

    # First hidden block.
    model.add(Dense(int(params['units1']), input_dim=params.get('input_dim', 8),
                    kernel_initializer='normal', activation=params['activation']))
    model.add(BatchNormalization())
    # The search space samples dropout rates; apply them when supplied.
    if 'dropout1' in params:
        model.add(Dropout(params['dropout1']))

    # Second hidden block.
    model.add(Dense(int(params['units2']), kernel_initializer='normal', activation=params['activation']))
    model.add(BatchNormalization())
    if 'dropout2' in params:
        model.add(Dropout(params['dropout2']))

    # Linear output: trip_duration is an unbounded value in seconds, so a
    # sigmoid (range 0..1) could never reach the targets.
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))

    model.compile(loss='mean_squared_logarithmic_error', optimizer=params['optimizer'])
    return model

def learningFunction(params):
    """Objective for the hyperparameter search: train once, return hold-out RMSLE.

    Trains a fresh model from ``regression_model(params)`` on the module-level
    ``X_train`` / ``y_train`` and scores it on ``X_test`` / ``y_test``.

    Args:
        params: dict with at least 'nb_epochs' and 'batch_size', plus the
            keys ``regression_model`` reads.

    Returns:
        The RMSLE of the predictions on the hold-out set (lower is better).
    """
    # Callback that changes the learning rate per epoch.
    lr_cb = make_lr_cb(params['nb_epochs'])

    # Training. `.values` replaces DataFrame.as_matrix(), which was deprecated
    # in pandas 0.23 and removed in pandas 1.0.
    estimator = regression_model(params)
    estimator.fit(
        X_train.values,
        y_train.values,
        epochs=int(params['nb_epochs']),
        batch_size=int(params['batch_size']),
        verbose=1,
        callbacks=[tb_cb, cp_cb, lr_cb],
    )

    # Prediction. The model output is flattened to 1-D so the metric receives
    # plain scalars; negative values are flipped to positive for now.
    y_pred = np.abs(estimator.predict(X_test.values)).ravel()

    # Prediction error; plain arrays make the comparison positional.
    return rmsle(y_test.values, y_pred)

# Automatically select good parameters: minimise learningFunction over
# `parameters` with the TPE algorithm, for at most 200 evaluations.
# NOTE(review): for hp.choice entries fmin is documented to return the index of
# the chosen option rather than its value - confirm, and consider
# hyperopt.space_eval(parameters, best) before using `best` as model settings.
best = fmin(learningFunction,parameters,algo=tpe.suggest,max_evals=200)

Epoch 1/100
  62891/1166915 [>.............................] - ETA: 348s - loss: 35.6605

KeyboardInterrupt: 

In [9]:
# Show the result of the hyperparameter search.
best

NameError: name 'best' is not defined

In [None]:
# Train the best model on all of the training data.
# The previous code built an XGBRegressor, which this notebook never imports,
# from `best`, which holds the Keras hyperparameters found by fmin. The model
# is now built with regression_model and stored as `estimator`, the name the
# prediction cell uses.
from hyperopt import space_eval

# Map fmin's result back to concrete values of the search space (this also
# brings back the fixed entries such as 'nb_epochs' and 'activation').
best_params = space_eval(parameters, best)

estimator = regression_model(best_params)
estimator.fit(
    X_selected.values,
    Y.values,
    epochs=int(best_params['nb_epochs']),
    batch_size=int(best_params['batch_size']),
    verbose=1,
    callbacks=[make_lr_cb(best_params['nb_epochs'])],
)

## 予測

In [None]:
# Load the test data (expects ./data/test.csv relative to the notebook).
test_data = pd.read_csv("./data/test.csv")

In [None]:
# Select the explanatory variables that are used as-is.
test_data_target = test_data[[
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]]

# One-hot encode the label column and append the indicator columns on the right.
oneHotVendorIdDataFrame_test_data = toOneHotDataFrame(test_data[label_prefix], label_prefix=label_prefix)
test_data_selected = pd.concat([test_data_target, oneHotVendorIdDataFrame_test_data], axis=1)

In [None]:
# Prediction.
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
result = estimator.predict(test_data_selected.values)

In [None]:
# Predictions that came out negative get their sign flipped for now
# (in place; all other entries are left untouched).
negative_mask = result < 0
result[negative_mask] = abs(result[negative_mask])

In [None]:
# Convert to the submission format: one row per test id with its prediction.
# The frame is built column by column so each column keeps its own dtype
# (a row-wise list followed by .T coerces everything to object). The
# predictions are flattened first because they may be shaped (n, 1).
id_list = test_data['id']
result_dataFrame = pd.DataFrame(
    {'id': id_list, 'trip_duration': np.ravel(result)},
    columns=['id', 'trip_duration'],
)

In [None]:
# Write the submission file; its name is built from `filename` and the
# current time formatted as YYYYMMDD_HHMM. The frame's index is not written.
from datetime import datetime as dt

tdatetime = dt.now()
tstr = tdatetime.strftime('%Y%m%d_%H%M')
result_dataFrame.to_csv('{}_submission_{}.csv'.format(filename,tstr), index=False)