In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

pd.set_option('display.max_rows', 1100)
pd.set_option('display.max_columns', 1100)
warnings.filterwarnings('ignore')

train_df = pd.read_csv("../../input/train.csv")
test_df = pd.read_csv("../../input/test.csv")

# 特徴量作る

In [11]:
# ベース特徴りょう
print("base")
for df in [train_df, test_df]:
    # hour : hh:01 ~ hh+1:00
    df["dayofWeek"] = df["date"].map(lambda x : datetime.datetime(x//10000, (x%10000)//100, (x%100)).strftime('%A'))
    df["hour"] = df["planArrival"].map(lambda x : int(x[:2].replace(":", "")) if int(x[3:]) != 0 else int(x[:2].replace(":", ""))-1  )
    df["planArrival_int"] = df["planArrival"].map(lambda x : int(x.replace(":", "")))
    df["ampm"] = df["hour"].map(lambda x : "am" if x <= 14 else "pm")
    df["date_trainNo"] = df["date"].astype("str") + " * " + df["trainNo"]
    
 # 前の1時間を使う特徴量 
print("rug")
for df in [train_df, test_df]:
    df["hour-1"] = df["hour"] - 1
    # その日の前の1時間の遅延平均
    mean_df = df.groupby(["date", "hour-1"])["delayTime"].mean().reset_index().rename(columns={"delayTime" : "hour-1_targetMean"})
    df["hour-1_targetMean"] = pd.merge(df, mean_df, left_on=["date", "hour"], right_on=["date", "hour-1"])["hour-1_targetMean"]

base
rug


# 入力作る

In [13]:
train_df[:3]

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,dayofWeek,hour,planArrival_int,ampm,date_trainNo,hour-1,hour-1_targetMean
0,0,20191204,A,1,AFGKvJ,Rlfq,06:11,0.0,Wednesday,6,611,am,20191204 * AFGKvJ,5,0.121468
1,1,20191204,A,1,AFGKvJ,coZB,06:13,0.0,Wednesday,6,613,am,20191204 * AFGKvJ,5,0.121468
2,2,20191204,A,1,AFGKvJ,LMww,06:16,0.0,Wednesday,6,616,am,20191204 * AFGKvJ,5,0.121468


In [16]:
#朝8時を訓練、朝9時を予測
# 日付は全て入れる
from sklearn.preprocessing import LabelEncoder
cv_col = "hour"
drop_cols = ["id", "planArrival", "ampm", "date_trainNo", "hour-1"]
lbl_cols = ["lineName", "trainNo", "stopStation", "dayofWeek"]
target_col = "delayTime"

train_X = train_df.copy()
for col in lbl_cols:
    train_X[col] = LabelEncoder().fit_transform(train_X[col])
    
train_X = train_X.drop(columns=drop_cols)

valid_y = train_X[train_X["hour"] == 9][target_col]
valid_X = train_X[train_X["hour"] == 9].drop(columns=[target_col, cv_col])
train_y = train_X[train_X["hour"] == 8][target_col]
train_X = train_X[train_X["hour"] == 8].drop(columns=[target_col, cv_col])

train_X[:5]

Unnamed: 0,date,lineName,directionCode,trainNo,stopStation,dayofWeek,planArrival_int,hour-1_targetMean
67,20191204,0,1,37,71,6,801,0.121468
68,20191204,0,1,37,55,6,804,0.121468
69,20191204,0,1,37,9,6,806,0.121468
153,20191204,0,1,116,37,6,831,0.121468
154,20191204,0,1,116,53,6,833,0.121468


# モデル定義・学習

In [None]:
import lightgbm as lgb

md = 39
model_param = {
    "max_depth": md,
    "n_estimators": 10000,
    "colsample_bytree": 0.8,
    "num_leaves": int(.7 * md ** 2),
    "learning_rate": 0.1,
    "objective": "mae",
    "verbose": -1,
    "min_child_samples": 120,
    "early_stopping_rounds": 100,
}

train = lgb.Dataset(train_X, train_y)
valid = lgb.Dataset(valid_X, valid_y)

model = lgb.train(
    model_param,
    train, 
    valid_sets=valid, 
    verbose_eval = 200
)

Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 1.33877
[400]	valid_0's l1: 1.3287
[600]	valid_0's l1: 1.3224
[800]	valid_0's l1: 1.31866
[1000]	valid_0's l1: 1.31597
[1200]	valid_0's l1: 1.31345
[1400]	valid_0's l1: 1.31168
[1600]	valid_0's l1: 1.30963
[1800]	valid_0's l1: 1.30866
[2000]	valid_0's l1: 1.30727
[2200]	valid_0's l1: 1.30616
[2400]	valid_0's l1: 1.30544
[2600]	valid_0's l1: 1.30487


md=15
md=16 1.33761
md=17 1.34
md=18 1.34
md=19 1.34
md=20 1.339
md=21 1.34
md=22 1.310
md=23 1.314
md=24 1.326
md=25 1.
md=26 1.
md=27 1.315
md=28 1.
md=29 1.
md=30 1.30244
md=31 1.
md=32 1.
md=33 1.305
md=34 1.
md=35 1.
md=36 1.
md=37 1.
md=38 1.
md=39 1.2999
md=40 1.
md=41 1.
md=42 1.
md=43 1.
md=44 1.
md=45 1.301
md=46 1.
md=47 1.
md=48 1.
md=49 1.
md=50 1.
md=51 1.301