In [1]:
#!pip install iterative-stratification
#!pip install jpholiday
#!pip install lightgbm


import pandas as pd
import pandas.tseries.offsets as offsets

import numpy as np
import datetime
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error as mae 
from sklearn.metrics import mean_squared_error as mse 


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, TimeSeriesSplit

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import time
import pickle
import logging
from contextlib import contextmanager
import jpholiday



import utils

In [2]:
@contextmanager
def timer(name, logger=None, level=logging.DEBUG):
    """Context manager that reports how long the enclosed block took.

    Args:
        name: Label shown in square brackets at the start of every message.
        logger: Optional ``logging.Logger``. When ``None`` (default) messages
            are written with ``print``; otherwise with ``logger.log``.
        level: Logging level used when ``logger`` is given.

    Emits ``[name] start`` on entry and ``[name] done in N s`` on a normal
    exit. If the block raises, ``[name] failed after N s`` is emitted and the
    exception is re-raised unchanged, so an aborted step still leaves a trace.
    """
    print_ = print if logger is None else lambda msg: logger.log(level, msg)
    # perf_counter is monotonic, so the elapsed time is not affected by
    # system clock adjustments while the block runs.
    t0 = time.perf_counter()
    print_(f'[{name}] start')
    try:
        yield
    except BaseException:
        # BaseException so that an interrupted (KeyboardInterrupt) step is
        # reported too; the exception itself is never swallowed.
        print_(f'[{name}] failed after {time.perf_counter() - t0:.0f} s')
        raise
    print_(f'[{name}] done in {time.perf_counter() - t0:.0f} s')

In [3]:
def data_loader(data_dir="JR"):
    """Read the four competition CSV files from ``data_dir``.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv``,
            ``info.csv`` and ``network.csv``. Defaults to ``"JR"``, the
            location that used to be hard-coded.

    Returns:
        Tuple ``(train, test, info, network)`` of DataFrames, in that order.
    """
    def _read(file_name):
        return pd.read_csv(os.path.join(data_dir, file_name))

    train = _read("train.csv")
    test = _read("test.csv")
    info = _read("info.csv")
    network = _read("network.csv")
    return train, test, info, network

def LabelEncoders(df, info):
    """Replace the categorical columns of ``df`` and ``info`` by integer codes.

    Both frames are modified in place and also returned.

    * ``df["stop_and_direction"]`` is created by concatenating the raw
      ``stopStation`` value with ``directionCode`` (as text) and then encoded.
    * ``df["stopStation"]`` and ``df["trainNo"]`` each get their own encoder.
    * ``lineName`` uses one encoder fitted on ``df`` and re-used for ``info``,
      so both frames share the same codes.
    * ``info["cse"]`` gets its own encoder.

    Returns:
        Tuple ``(df, info)``.
    """
    # Must run before stopStation is encoded: it concatenates the raw value.
    combined = df["stopStation"] + df["directionCode"].astype(str)
    df["stop_and_direction"] = combined
    df["stop_and_direction"] = LabelEncoder().fit_transform(df["stop_and_direction"])

    for column in ("stopStation", "trainNo"):
        df[column] = LabelEncoder().fit_transform(df[column])

    line_encoder = LabelEncoder()
    df["lineName"] = line_encoder.fit_transform(df["lineName"])
    # Only `transform` here: info must use the codes learned from df.
    info["lineName"] = line_encoder.transform(info["lineName"])

    info["cse"] = LabelEncoder().fit_transform(info["cse"])

    # Second pass over the already encoded column, kept from the original.
    df["stop_and_direction"] = LabelEncoder().fit_transform(df["stop_and_direction"])
    return df, info

def time_feature(df, info):
    """Derive timestamp based columns for the timetable and the event log.

    Columns added to ``df``: ``date_and_time`` (parsed from ``date`` and
    ``planArrival``), ``hour``, ``minute``, ``minute_10`` (``minute // 10``),
    ``dayofweek``, ``am_pm``, ``is_holiday`` (0/1 from ``jpholiday``) and the
    text columns ``planHour`` / ``planMinute`` split out of ``planArrival``.

    Columns added to ``info``: ``date_and_time`` (parsed from ``date`` and
    ``time``), ``hour``, ``minute`` and ``am_pm``.

    ``am_pm`` is 1 when ``hour > 12`` and 0 otherwise, so 12:xx counts as 0.

    Args:
        df: Timetable rows with ``date`` (YYYYMMDD) and ``planArrival`` (HH:MM).
        info: Event rows with ``date`` (YYYYMMDD) and ``time`` (HH:MM).

    Returns:
        Tuple ``(df, info)``. ``info`` is modified in place; the returned
        ``df`` is a new frame because of the final ``pd.concat``.
    """
    timestamp_format = '%Y%m%d:%H:%M'
    df["date_and_time"] = pd.to_datetime(
        df.date.astype(str) + ":" + df.planArrival.astype(str),
        format=timestamp_format)
    info["date_and_time"] = pd.to_datetime(
        info.date.astype(str) + ":" + info.time.astype(str),
        format=timestamp_format)

    for frame in (df, info):
        frame["hour"] = frame.date_and_time.dt.hour
        frame["minute"] = frame.date_and_time.dt.minute

    df["minute_10"] = df.minute // 10
    df["dayofweek"] = df.date_and_time.dt.dayofweek

    # Direct column assignment instead of the chained
    # `frame["am_pm"][mask] = 1`, which writes through an intermediate object
    # and is not guaranteed to update the frame.
    for frame in (df, info):
        frame["am_pm"] = (frame["hour"] > 12).to_numpy().astype("int64")

    # A holiday is a property of the calendar day, so look it up once per day
    # rather than once per distinct timestamp.
    day = df["date_and_time"].dt.normalize()
    holiday_by_day = {d: jpholiday.is_holiday(d.date())
                      for d in day.drop_duplicates().dropna()}
    df["is_holiday"] = day.map(holiday_by_day) * 1

    plan_parts = df.planArrival.str.split(":", expand=True).rename(
        columns={0: "planHour", 1: "planMinute"})
    df = pd.concat([df, plan_parts], axis=1)
    return df, info

def set_id(df):
    """Add ``date_id``: one integer per distinct ``(date, am_pm)`` pair.

    The ids come from ``GroupBy.ngroup()``. ``df`` is modified in place and
    returned.
    """
    half_day_groups = df.groupby(["date", "am_pm"])
    df["date_id"] = half_day_groups.ngroup()
    return df

def target_feature(df):
    """Attach the mean ``delayTime`` of the preceding hour as features.

    For each grouping the hourly mean of ``delayTime`` is computed, its
    ``hour`` key is increased by one and the result is left-merged back, so a
    row at hour ``h`` receives the mean observed at hour ``h - 1`` of the same
    group (NaN when that group has no rows in the previous hour).

    Added columns:
        * ``mean_suf1hour_delayTime`` - per ``date_id`` / ``lineName``
        * ``mean_suf1hour_direction_delayTime`` - additionally per
          ``directionCode``

    Returns:
        A new DataFrame (the result of the merges) with both columns appended.
    """
    group_specs = (
        (["date_id", "lineName", "hour"],
         "mean_suf1hour_delayTime"),
        (["date_id", "lineName", "directionCode", "hour"],
         "mean_suf1hour_direction_delayTime"),
    )
    for keys, feature_name in group_specs:
        hourly_mean = df.groupby(keys)["delayTime"].mean().reset_index()
        # Moving the key forward one hour makes the merge pick up the
        # previous hour's mean.
        hourly_mean["hour"] = hourly_mean["hour"] + 1
        hourly_mean = hourly_mean.rename(columns={"delayTime": feature_name})
        df = df.merge(hourly_mean, on=keys, how="left")
    return df



def target_feature2(df, train_num):
    """Add ``tgt_DLH``: mean ``delayTime`` per (dayofweek, lineName, hour).

    * Rows from position ``train_num`` onward (the test part) receive the
      mean computed over all training rows.
    * The first ``train_num`` rows (the training part) receive out-of-fold
      means: for each of 4 folds the mean is computed on the other folds
      only, so a row's own target never contributes to its feature.

    Groups that are absent from the rows used for the mean yield NaN.

    Args:
        df: Combined frame, training rows first. Needs ``dayofweek``,
            ``lineName``, ``hour`` and ``delayTime``.
        train_num: Number of leading rows that belong to the training set.

    Returns:
        ``df`` with the ``tgt_DLH`` column added (modified in place).
    """
    tgt_col = ["dayofweek", "lineName", "hour"]
    # Purely positional slicing; the original mixed `df[:n]` (positional)
    # with `df.loc[:n]` (label based, inclusive).
    train_part = df.iloc[:train_num]
    test_part = df.iloc[train_num:]

    full_means = train_part.groupby(tgt_col).delayTime.mean().reset_index().rename(
        columns={"delayTime": "DLH"})
    test_values = test_part[tgt_col].merge(
        full_means, how="left", on=tgt_col)["DLH"].to_numpy()

    # random_state is combined with shuffle=True because scikit-learn's
    # k-fold base class rejects a seed when shuffling is off.
    # NOTE(review): this changes fold membership relative to runs made with
    # old library versions that silently accepted the seed.
    kf = MultilabelStratifiedKFold(n_splits=4, shuffle=True, random_state=14)
    oof_values = np.full(train_num, np.nan)
    for fit_idx, holdout_idx in kf.split(train_part, train_part[tgt_col]):
        fold_means = train_part.iloc[fit_idx].groupby(tgt_col)[
            "delayTime"].mean().reset_index().rename(columns={"delayTime": "DLH"})
        oof_values[holdout_idx] = train_part.iloc[holdout_idx][tgt_col].merge(
            fold_means, how="left", on=tgt_col)["DLH"].to_numpy()

    # One whole-column assignment. The original `df[:train_num]["tgt_DLH"] = tmp`
    # wrote into a temporary slice, which does not reliably update `df`.
    df["tgt_DLH"] = np.concatenate([oof_values, test_values])
    return df


def cse_feature(df, info, n=23,
                dropcse=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 17, 19, 11, 22)):
    """Add, per retained event code, the signed minutes between row and event.

    For every row of ``info`` whose ``cse`` code is not in ``dropcse``, all
    rows of ``df`` with the same ``date``, ``am_pm`` and ``lineName`` get
    ``cse_<code> = df.date_and_time - info.date_and_time`` in whole minutes
    (negative when the row is scheduled before the event). When several
    events of one code match a row, the last one in ``info`` order wins.
    ``cse_<code>_abs`` holds the absolute value. Rows without a matching
    event stay NaN.

    Args:
        df: Timetable frame with ``date``, ``am_pm``, ``lineName`` and
            ``date_and_time``.
        info: Event frame with ``cse``, ``date``, ``am_pm``, ``lineName`` and
            ``date_and_time``.
        n: Codes ``0 .. n-1`` are candidates for a feature column.
        dropcse: Codes to ignore. The defaults reproduce the values that used
            to be hard-coded.

    Returns:
        Tuple ``(df, info)``: ``df`` modified in place, ``info`` filtered to
        the retained codes and re-indexed (the old index is kept as an
        ``index`` column, as before).
    """
    dropcse = list(dropcse)
    info = info[~(info.cse.isin(dropcse))].reset_index()

    kept_codes = [code for code in range(n) if code not in dropcse]
    for code in kept_codes:
        df["cse_" + str(code)] = np.nan

    one_minute = pd.Timedelta(minutes=1)
    events = zip(info.cse, info.am_pm, info.date, info.lineName, info.date_and_time)
    for event_code, event_am_pm, event_date, event_line, event_time in events:
        # Build the row mask once per event (it used to be evaluated twice).
        same_slot = (
            (df.am_pm == event_am_pm) &
            (df.date == event_date) &
            (df.lineName == event_line))
        delta = df.loc[same_slot, "date_and_time"] - event_time
        # Convert to minutes explicitly. The original stored the raw
        # timedelta and later divided by 60000000000, which only works when
        # pandas happens to coerce timedeltas to integer nanoseconds.
        # to_numpy() keeps the write positional, so duplicate index labels
        # in df cannot break alignment.
        df.loc[same_slot, "cse_" + str(event_code)] = (
            delta // one_minute).to_numpy(dtype=float)

    for code in kept_codes:
        df["cse_" + str(code) + "_abs"] = df["cse_" + str(code)].abs()

    return df, info

def feature_value_counts(df):
    """Add count-encoding columns for a fixed list of key combinations.

    For every key list a column ``cnt_<key1>_<key2>...`` is added holding the
    number of non-null ``id`` values in the row's group. ``df`` is modified
    in place and returned.
    """
    key_combinations = (
        ["date", "lineName"],
        ["date", "trainNo"],
        ["date", "trainNo", "directionCode"],
        ["date", "trainNo", "hour"],
        ["date", "trainNo", "hour", "minute_10"],
        ["date", "trainNo", "am_pm"],
        ["trainNo"],
        ["date"],
        ["stopStation"],
        ["date", "stopStation"],
        ["date", "hour", "stopStation"],
    )
    for keys in key_combinations:
        column_name = "cnt_" + "_".join(keys)
        group_sizes = df.groupby(keys)["id"].transform("count")
        df[column_name] = group_sizes
    return df

def model_output_feature(_df, train_num):
    """Add ``pred_delayTime``: LightGBM regression predictions of ``delayTime``.

    The first ``train_num`` rows receive out-of-fold predictions from a
    4-fold split over the distinct ``date_id`` values, so all rows sharing a
    ``date_id`` are in the same fold. The remaining rows receive the average
    prediction of the four fold models. Negative predictions are clipped to 0.

    Args:
        _df: Combined frame, training rows first. Columns listed in
            ``drop_col`` are excluded from the model inputs; every other
            column must be convertible to float.
        train_num: Number of leading rows that belong to the training set.

    Returns:
        ``_df`` with the ``pred_delayTime`` column added (modified in place).
    """
    target_col = "delayTime"
    feature_name = "pred_delayTime"
    drop_col = ["planArrival", "date_and_time", "delayTime", "id", "is_holiday",
                "date_id", "target", "am_pm", "pred_delayTime"]
    params = {
        'max_depth': 3,
    }

    feature_cols = [c for c in _df.columns if c not in drop_col]
    train_df = _df.iloc[:train_num]
    test_df = _df.iloc[train_num:]
    X_train = train_df[feature_cols].astype(float)
    y_train = train_df[target_col]
    X_test = test_df[feature_cols].astype(float)

    # Computed once; the original re-evaluated `unique()` several times per fold.
    row_date_ids = train_df["date_id"].to_numpy()
    date_ids = train_df["date_id"].unique()

    # No random_state: KFold does not shuffle by default, so a seed has no
    # effect and current scikit-learn raises when one is passed without
    # shuffle=True. The folds stay the contiguous blocks used before.
    kf = KFold(n_splits=4)
    oof_pred = np.full(len(train_df), -1.0)
    models = []
    for train_index, valid_index in kf.split(date_ids):
        fit_rows = np.isin(row_date_ids, date_ids[train_index])
        valid_rows = np.isin(row_date_ids, date_ids[valid_index])

        model = lgb.LGBMRegressor(**params)
        with timer('feature_training'):
            model.fit(X_train[fit_rows], y_train[fit_rows])

        oof_pred[valid_rows] = np.clip(model.predict(X_train[valid_rows]), 0, None)
        models.append(model)

    test_pred = np.mean([model.predict(X_test) for model in models], axis=0)
    test_pred = np.clip(test_pred, 0, None)

    # Positional write-back of the whole column. The original used
    # `.loc[:train_num]`, a label-based slice that includes row `train_num`.
    _df[feature_name] = np.concatenate([oof_pred, test_pred])
    return _df



def feature_preddelayTime(df, windows=(5,)):
    """Add rolling means of ``pred_delayTime`` within date / line groups.

    For every window size ``w`` and each grouping (``date`` and
    ``date`` + ``lineName``) two columns are added:

    * ``rolled_<w>feature_<keys>`` - trailing mean over the row and the
      ``w - 1`` rows before it inside the group;
    * ``rolled_center_<w>feature_<keys>`` - mean over a window centred on
      the row.

    Rows are taken in the order in which they appear in ``df``. Positions
    without a full window are NaN.

    Args:
        df: Frame with ``date``, ``lineName`` and ``pred_delayTime``.
        windows: Window sizes; the default ``(5,)`` matches the value that
            used to be hard-coded.

    Returns:
        A new frame with a fresh ``RangeIndex`` and the added columns; the
        input frame is left untouched.
    """
    feature_cols = [
        ["date"],
        ["date", "lineName"],
    ]
    df = df.reset_index(drop=True)
    for window in windows:
        for keys in feature_cols:
            suffix = str(window) + "feature_" + "_".join(keys)
            # `transform` returns values aligned with df's own rows. The
            # original reset the index of the grouped result and assigned it
            # by label to a re-sorted copy, which put the rolling means on
            # the wrong rows unless the frame was already sorted by the keys.
            df["rolled_" + suffix] = df.groupby(keys)["pred_delayTime"].transform(
                lambda s: s.rolling(window).mean())
            df["rolled_center_" + suffix] = df.groupby(keys)["pred_delayTime"].transform(
                lambda s: s.rolling(window, center=True).mean())
    return df

def clf_model_output_feature(_df, train_num):
    """Add "is the train delayed at all" classifier outputs as features.

    A default ``LGBMClassifier`` is trained on the binary target
    ``delayTime > 0``. Three columns are added:

    * ``clf_predict`` - predicted class (0/1)
    * ``clf_predict_prob0`` / ``clf_predict_prob1`` - class probabilities

    The first ``train_num`` rows receive out-of-fold outputs from a 4-fold
    split over the distinct ``date_id`` values. The remaining rows receive
    the fold models' averaged probabilities and their majority vote for the
    class (a tie counts as class 1). The per-fold accuracy is printed.

    Args:
        _df: Combined frame, training rows first. Columns listed in
            ``drop_col`` are excluded from the model inputs; every other
            column must be convertible to float.
        train_num: Number of leading rows that belong to the training set.

    Returns:
        ``_df`` with the three columns added (modified in place).
    """
    target_col = "delayTime"
    feature_name = "clf_predict"
    prob0_name = feature_name + "_prob0"
    prob1_name = feature_name + "_prob1"
    drop_col = ["planArrival", "date_and_time", "delayTime", "id",
                "is_holiday", "date_id", "target", "am_pm",
                feature_name, prob0_name, prob1_name]

    feature_cols = [c for c in _df.columns if c not in drop_col]
    train_df = _df.iloc[:train_num]
    test_df = _df.iloc[train_num:]
    X_train = train_df[feature_cols].astype(float)
    X_test = test_df[feature_cols].astype(float)

    # Every positive delay becomes 1; other values are kept as they are,
    # which is what the original in-place masking did.
    delay = train_df[target_col]
    y_binary = delay.where(~(delay > 0), 1)

    row_date_ids = train_df["date_id"].to_numpy()
    date_ids = train_df["date_id"].unique()

    # No random_state: KFold does not shuffle by default, so a seed has no
    # effect and current scikit-learn raises when one is passed without
    # shuffle=True.
    kf = KFold(n_splits=4)
    oof_label = np.full(len(train_df), -1.0)
    oof_prob0 = np.full(len(train_df), -1.0)
    oof_prob1 = np.full(len(train_df), -1.0)
    models = []
    counter = 1
    for train_index, valid_index in kf.split(date_ids):
        fit_rows = np.isin(row_date_ids, date_ids[train_index])
        valid_rows = np.isin(row_date_ids, date_ids[valid_index])

        model = lgb.LGBMClassifier()
        with timer('feature_training'):
            model.fit(X_train[fit_rows], y_binary[fit_rows])

        preds = model.predict(X_train[valid_rows])
        print(counter, "acc:", acc(y_binary[valid_rows], preds))
        counter += 1
        oof_label[valid_rows] = preds

        # Columns 0 and 1 assume both classes occur in the fitting rows.
        preds_proba = model.predict_proba(X_train[valid_rows])
        oof_prob0[valid_rows] = preds_proba[:, 0]
        oof_prob1[valid_rows] = preds_proba[:, 1]

        models.append(model)

    # Majority vote over the fold models.
    vote = np.mean([model.predict(X_test) for model in models], axis=0)
    test_label = (vote >= 0.5).astype(float)
    test_proba = np.mean([model.predict_proba(X_test) for model in models], axis=0)

    # Whole-column, positional write-back. The original wrote through
    # `.loc[:train_num, [...]]` into columns that did not yet exist on `_df`
    # and relied on an inclusive label slice.
    _df[feature_name] = np.concatenate([oof_label, test_label])
    _df[prob0_name] = np.concatenate([oof_prob0, test_proba[:, 0]])
    _df[prob1_name] = np.concatenate([oof_prob1, test_proba[:, 1]])
    return _df

def feature_pre_delay(tmp):
    """Attach delay statistics of the latest logged arrival per key and hour.

    For each of ``lineName``, ``trainNo`` and ``stopStation`` four columns
    ``<key>_last_delay_{mean,sum,max,min}`` are appended. They summarise
    ``delayTime`` at the latest ``date_and_time`` found for the row's
    ``(date_id, key)``:

    * ``lineName`` - latest timestamp within the row's own hour;
    * ``trainNo`` and ``stopStation`` - latest timestamp within the
      previous hour.

    NOTE(review): the ``lineName`` variant does not shift the hour (the shift
    is commented out in the original code) - confirm this is intended.

    Returns:
        A new DataFrame with the twelve columns appended.
    """
    tmp = _attach_last_log_delay(tmp, "lineName", shift_hour=False)
    tmp = _attach_last_log_delay(tmp, "trainNo", shift_hour=True)
    tmp = _attach_last_log_delay(tmp, "stopStation", shift_hour=True)
    return tmp


def _attach_last_log_delay(frame, key, shift_hour):
    """Append ``<key>_last_delay_*`` columns for one grouping key."""
    hourly = frame.copy()
    if shift_hour:
        # Moving the key forward one hour makes the merge below pick up the
        # previous hour's last timestamp.
        hourly["hour"] = hourly["hour"] + 1
    last_log = (
        hourly.groupby(["date_id", key, "hour"])["date_and_time"]
        .max()
        .reset_index()
        .rename(columns={"date_and_time": "last_hour_log"})
    )
    frame = frame.merge(last_log, how="left", on=["date_id", key, "hour"])

    # Only rows logged at one of the "last" timestamps can contribute.
    last_timestamps = frame["last_hour_log"].unique()
    candidates = frame[frame["date_and_time"].isin(last_timestamps)]
    stats = (
        candidates.groupby(["date_id", key, "date_and_time"])["delayTime"]
        .agg(["mean", "sum", "max", "min"])
        .add_prefix(key + "_last_delay_")
        .reset_index()
        .rename(columns={"date_and_time": "tmpdate"})
    )

    frame = frame.merge(
        stats,
        how="left",
        left_on=["date_id", key, "last_hour_log"],
        right_on=["date_id", key, "tmpdate"],
    )
    return frame.drop(["last_hour_log", "tmpdate"], axis=1)

def preprocess(train, test, info, network):
    """Run the feature pipeline on train and test together, then split again.

    ``train`` and ``test`` are stacked (train first), passed through the
    feature builders in a fixed order, and cut back at ``len(train)``. Each
    stage is wrapped in ``timer`` so its duration is printed. The ``tgt2``
    and ``learned_feature`` stages are currently disabled and only emit
    their timer messages. ``network`` is accepted but not used.

    Returns:
        Tuple ``(train_features, test_features, info)``.
    """
    n_train_rows = len(train)
    combined = pd.concat([train, test])

    with timer('Label Encoding'):
        combined, info = LabelEncoders(combined, info)

    with timer('time feature'):
        combined, info = time_feature(combined, info)

    with timer('cse combined'):
        combined, info = cse_feature(combined, info)

    with timer("count encoding"):
        combined = feature_value_counts(combined)

    combined = set_id(combined)

    with timer("tgt1"):
        combined = target_feature(combined)

    with timer("tgt2"):
        # disabled: combined = target_feature2(combined, len(train))
        pass

    with timer("pre_info"):
        combined = feature_pre_delay(combined)

    with timer("learned_feature"):
        # disabled: clf_model_output_feature / model_output_feature /
        # feature_preddelayTime
        pass

    train_features = combined[:n_train_rows]
    test_features = combined[n_train_rows:]
    return train_features, test_features, info

# Load the raw CSV files and build the feature frames; `train`, `test` and
# `info` are rebound to the processed versions returned by `preprocess`.
train, test, info, network = data_loader()
train, test, info = preprocess(train, test, info, network)
print("finish")

[Label Encoding] start
[Label Encoding] done in 4 s
[time feature] start
[time feature] done in 25 s
[cse combined] start
[cse combined] done in 34 s
[count encoding] start
[count encoding] done in 1 s
[tgt1] start
[tgt1] done in 4 s
[tgt2] start
[tgt2] done in 0 s
[pre_info] start
[pre_info] done in 27 s
[learned_feature] start
[learned_feature] done in 0 s
finish


In [4]:
# LightGBM hyper-parameters passed to `lgb.LGBMRegressor(**params)` in the
# cross-validation cell. "regression_l1" optimises absolute error, matching
# the mean-absolute-error score computed there.
# An earlier dict (n_estimators=200, max_depth=3, num_leaves=30,
# learning_rate=0.05) used to be assigned here and was immediately
# overwritten by this one; that dead assignment has been removed.
params = {
    'num_leaves': 100,
    "max_depth": 10,
    "objective": "regression_l1",
}

In [5]:
#train["diff_delay"] = train.delayTime - train.lineName_last_delay_mean


In [6]:
n_splits = 5
# Folds are built over the distinct date_id values (not over rows), so all
# rows of one date_id are either in the fitting part or in the validation part.
# No random_state is passed: KFold does not shuffle by default, so a seed has
# no effect, and current scikit-learn raises a ValueError when one is given
# without shuffle=True. The folds stay the contiguous blocks used so far.
kf = KFold(n_splits = n_splits)
score = 0
counter = 1
models = []
target_col = "delayTime"
eval_col = "delayTime"

# Columns that are never fed to the model (identifiers, raw timestamps, the
# target itself and helper columns).
drop_col = ["planArrival", "date_and_time","delayTime","id","is_holiday",
            "date_id","target", "am_pm","minute","minute_10","planMinute","date",
           ]

# Only rows scheduled in these hours are used for fitting and scoring.
# NOTE(review): presumably the hours that are evaluated - confirm.
eval_hours = [8, 9, 18, 19]

# Computed once; the original re-evaluated `unique()` twice per fold.
date_ids = train.date_id.unique()

for train_index, valid_index in kf.split(date_ids):

    cv_train_data = train[train.date_id.isin(date_ids[train_index])].copy()
    cv_valid_data = train[train.date_id.isin(date_ids[valid_index])].copy()

    cv_train_data = cv_train_data[cv_train_data.hour.isin(eval_hours)]
    cv_valid_data = cv_valid_data[cv_valid_data.hour.isin(eval_hours)]

    cv_y_valid_evalcol = cv_valid_data[eval_col]

    cv_y_train = cv_train_data[target_col]
    cv_X_train = cv_train_data.drop(drop_col, axis = 1)
    cv_y_valid = cv_valid_data[target_col]
    cv_X_valid = cv_valid_data.drop(drop_col, axis = 1)

    model = lgb.LGBMRegressor(**params)
    with timer('training'):
        model.fit(cv_X_train.astype(float), cv_y_train)

    preds = model.predict(cv_X_valid.astype(float))
    # Mean of the raw predictions, printed before clipping as a sanity check.
    print(np.mean(preds))

    # A delay cannot be negative.
    preds[preds < 0] = 0

    acc_score = mae(cv_y_valid,preds)

    print(counter,"fold score is :",acc_score)
    score += acc_score
    counter += 1

    models.append(model)
print("average:", score/n_splits)

[training] start
[training] done in 4 s
1.202885135593169
1 fold score is : 0.8647431553257428
[training] start
[training] done in 5 s
1.6925833453520636
2 fold score is : 1.177060120122127
[training] start
[training] done in 5 s
1.5222319955678
3 fold score is : 1.3292935244002397
[training] start
[training] done in 5 s
1.0650516662153466
4 fold score is : 0.9098982524095796
[training] start
[training] done in 5 s
0.7790318115318219
5 fold score is : 0.6666878670278017
average: 0.9895365838570982


In [7]:
# Mean fold score from the cross-validation cell, followed by the gain-based
# feature importance of the first fold's model (top 30 and bottom 5).
# `cv_X_train` is the feature frame left over from the last CV fold; it is
# used here only for its column names.
print("baseline",score/n_splits)
importance = pd.DataFrame(models[0].booster_.feature_importance(importance_type='gain'), index=cv_X_train.columns, columns = ["f"])
print(importance.sort_values("f", ascending = False).head(30))
print(importance.sort_values("f", ascending = False).tail(5))

baseline 0.9895365838570982
                                               f
trainNo_last_delay_mean            556092.145243
mean_suf1hour_direction_delayTime  201634.276176
mean_suf1hour_delayTime            116022.068295
lineName_last_delay_mean           114235.352562
cnt_date_trainNo_hour               98461.315535
cnt_date_lineName                   76967.275871
lineName_last_delay_sum             57965.007769
cnt_date                            55750.808960
cnt_date_hour_stopStation           51046.362083
cnt_trainNo                         48134.565344
cnt_date_stopStation                47108.680132
directionCode                       42684.188866
cnt_stopStation                     41171.600994
dayofweek                           35235.516294
cnt_date_trainNo                    35006.168430
lineName_last_delay_max             33568.535755
hour                                30346.394386
lineName                            29597.278408
trainNo                             18504

In [8]:
test[test.target == 1].head(5).T

Unnamed: 0,1488935,1488936,1488937,1488938,1488939
id,1488935,1488936,1488937,1488938,1488939
date,20191201,20191201,20191201,20191201,20191201
lineName,0,0,0,0,0
directionCode,1,1,1,1,1
trainNo,177,177,177,177,177
stopStation,61,34,51,39,10
planArrival,18:02,18:04,18:06,18:08,18:10
delayTime,,,,,
target,1,1,1,1,1
stop_and_direction,121,67,101,77,20


In [9]:
# Average the predictions of all fold models on the test rows, then clip
# negative values to 0.
y_pred = [model.predict(test.drop(drop_col, axis = 1).astype(float)) for model in models]
y_pred = np.mean(y_pred, axis = 0)
y_pred[y_pred < 0] = 0


In [10]:
#y_pred =  y_pred * test.clf_predict
# Keep only the predictions for rows flagged target == 1 and write them as a
# header-less two-column CSV (id, prediction).
# NOTE(review): this rebinds `y_pred` to the filtered array, so the cell is
# not idempotent - re-running it without re-running the prediction cell
# applies a full-length mask to the already shortened array.
y_pred = y_pred[test.target == 1]
pd.DataFrame({"id":test[test.target == 1].id, target_col: y_pred }).to_csv("submission.csv", index=False, header = False)
    # pd.DataFrame({"id": range(len(pred_test)), target_col: pred_test }).to_csv("submission.csv", index=False)


In [11]:
y_pred

array([2.31659508, 1.95498866, 2.10939284, ..., 0.        , 0.        ,
       0.        ])

In [12]:
np.mean(y_pred)

0.6141207992048665

In [13]:
train.delayTime.mean()

1.2113521192032963

In [16]:
train[target_col].value_counts()

0.0      1170544
1.0        79024
2.0        50186
3.0        36197
4.0        28288
          ...   
96.0           4
98.0           2
104.0          2
111.0          1
120.0          1
Name: delayTime, Length: 103, dtype: int64

In [21]:
print(sum(cv_X_train.trainNo_last_delay_mean.isna()), len(cv_X_train))

265356 646197


In [16]:
test

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,target,stop_and_direction,...,lineName_last_delay_max,lineName_last_delay_min,trainNo_last_delay_mean,trainNo_last_delay_sum,trainNo_last_delay_max,trainNo_last_delay_min,stopStation_last_delay_mean,stopStation_last_delay_sum,stopStation_last_delay_max,stopStation_last_delay_min
1488885,1488885,20191201,0,1,55,37,17:32,0.0,0.0,73,...,8.0,0.0,,,,,,,,
1488886,1488886,20191201,0,1,55,53,17:34,0.0,0.0,105,...,8.0,0.0,,,,,,,,
1488887,1488887,20191201,0,1,55,22,17:37,0.0,0.0,43,...,8.0,0.0,,,,,,,,
1488888,1488888,20191201,0,1,55,44,17:39,0.0,0.0,87,...,8.0,0.0,,,,,,,,
1488889,1488889,20191201,0,1,55,72,17:41,0.0,0.0,143,...,8.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748109,2748109,20201021,3,2,2969,3,18:47,0.0,0.0,7,...,5.0,0.0,,,,,,,,
2748110,2748110,20201021,3,2,2969,2,18:51,0.0,0.0,5,...,5.0,0.0,,,,,,,,
2748111,2748111,20201021,3,2,2969,95,18:53,0.0,0.0,190,...,5.0,0.0,,,,,,,,
2748112,2748112,20201021,3,2,2969,101,18:59,0.0,0.0,202,...,5.0,0.0,,,,,,,,


In [15]:
def get_resample_index(y_train, rate = 0.1):
    """Print the class balance of ``y_train``.

    Prints three space-separated numbers: the total number of entries, the
    number equal to 0, and the number of remaining entries. ``rate`` is
    accepted but not used, and nothing is returned.
    """
    total = len(y_train)
    zero_count = sum(y_train == 0)
    nonzero_count = total - zero_count
    print(f"{total} {zero_count} {nonzero_count}")
get_resample_index(cv_y_train)

586188 429271 156917


In [47]:
test.groupby("date_id").target.value_counts().sort_values()

date_id  target
96       1.0         596
72       1.0         651
118      1.0         671
22       1.0         708
10       1.0         784
                   ...  
82       0.0       14382
192      0.0       14382
131      0.0       14386
62       0.0       14388
153      0.0       14443
Name: target, Length: 196, dtype: int64

In [154]:
importance = pd.DataFrame(models[0].booster_.feature_importance(importance_type='gain'), index=cv_X_train.columns, columns = ["f"])
print(importance.sort_values("f", ascending = False).head(15))
print(importance.sort_values("f", ascending = False).tail(15))

                                              f
mean_suf1hour_delayTime            1.892167e+07
mean_suf1hour_direction_delayTime  1.679876e+07
tgt_DLH                            1.108608e+07
cnt_trainNo                        4.420083e+06
hour                               4.160205e+06
cnt_date_trainNo                   3.335105e+06
cnt_date_lineName                  3.174944e+06
cnt_date_trainNo_hour              2.938999e+06
cnt_date                           2.457114e+06
cnt_stopStation                    1.974337e+06
lineName                           1.716166e+06
trainNo                            1.461470e+06
cnt_date_hour_stopStation          1.120446e+06
dayofweek                          1.089709e+06
cnt_date_stopStation               1.076011e+06
                                            f
cnt_date_trainNo_hour            2.938999e+06
cnt_date                         2.457114e+06
cnt_stopStation                  1.974337e+06
lineName                         1.716166e+06
tr

In [34]:
dates =  pd.to_datetime(train.date.unique(), format='%Y%m%d').dayofweek
dates.value_counts()

2    16
0    16
3    15
1    13
4    12
5     7
6     4
dtype: int64

In [102]:
_calc_df = train.groupby(["dayofweek","lineName","hour"]).delayTime.mean().reset_index()


Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,target,stop_and_direction,...,cse_21,cse_22,date_id,mean_pre1hour_delayTime,mean_suf1hour_delayTime,mean_pre1hour_direction_delayTime,mean_suf1hour_direction_delayTime,diff_pre_suf,diff_pre_suf_direction,tgt_DLH
0,0,20191204,0,1,11,37,06:11,0.0,,73,...,,,4,0.304983,,0.102249,,,,0.067941
1,1,20191204,0,1,11,53,06:13,0.0,,105,...,,,4,0.304983,,0.102249,,,,0.061702
2,2,20191204,0,1,11,22,06:16,0.0,,43,...,,,4,0.304983,,0.102249,,,,0.061702
3,3,20191204,0,1,11,44,06:18,0.0,,87,...,,,4,0.304983,,0.102249,,,,0.067941
4,4,20191204,0,1,11,72,06:20,0.0,,143,...,,,4,0.304983,,0.102249,,,,0.065076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,20191204,1,1,1003,104,19:43,0.0,,207,...,,,5,0.003012,0.246667,0.004040,0.350801,0.243655,0.34676,0.011561
9996,9996,20191204,1,1,1003,41,19:46,0.0,,81,...,,,5,0.003012,0.246667,0.004040,0.350801,0.243655,0.34676,0.023529
9997,9997,20191204,1,1,1003,20,19:48,0.0,,39,...,,,5,0.003012,0.246667,0.004040,0.350801,0.243655,0.34676,0.011429
9998,9998,20191204,1,1,1003,36,19:50,0.0,,71,...,,,5,0.003012,0.246667,0.004040,0.350801,0.243655,0.34676,0.011429


In [24]:
test[test.date_id == 0]

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,target,stop_and_direction,...,lineName_last_delay_max,lineName_last_delay_min,trainNo_last_delay_mean,trainNo_last_delay_sum,trainNo_last_delay_max,trainNo_last_delay_min,stopStation_last_delay_mean,stopStation_last_delay_sum,stopStation_last_delay_max,stopStation_last_delay_min
1488885,1488885,20191201,0,1,55,37,17:32,0.0,0.0,73,...,8.0,0.0,,,,,,,,
1488886,1488886,20191201,0,1,55,53,17:34,0.0,0.0,105,...,8.0,0.0,,,,,,,,
1488887,1488887,20191201,0,1,55,22,17:37,0.0,0.0,43,...,8.0,0.0,,,,,,,,
1488888,1488888,20191201,0,1,55,44,17:39,0.0,0.0,87,...,8.0,0.0,,,,,,,,
1488889,1488889,20191201,0,1,55,72,17:41,0.0,0.0,143,...,8.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1501476,1501476,20191201,3,2,2969,3,18:36,,0.0,7,...,,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1501477,1501477,20191201,3,2,2969,2,18:40,,0.0,5,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501478,1501478,20191201,3,2,2969,95,18:43,,0.0,190,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501479,1501479,20191201,3,2,2969,101,18:48,,0.0,202,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
train[train.date_id == 2]

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,target,stop_and_direction,...,cnt_date_stopStation,cnt_date_hour_stopStation,date_id,mean_suf1hour_delayTime,mean_suf1hour_direction_delayTime,lastLineNameDelay,lastTrainNoDelay,date_and_time,lastStationDelay,diff_delay


In [118]:
t.stopStation_last_delay_max.value_counts()

0.0      1162465
1.0       110990
2.0        74819
3.0        54495
4.0        43386
          ...   
93.0          16
101.0         15
91.0          13
120.0         11
98.0           9
Name: stopStation_last_delay_max, Length: 97, dtype: int64

In [117]:
t.columns

Index(['id', 'date', 'lineName', 'directionCode', 'trainNo', 'stopStation',
       'planArrival', 'delayTime', 'target', 'stop_and_direction',
       'date_and_time', 'hour', 'minute', 'minute_10', 'dayofweek', 'am_pm',
       'is_holiday', 'planHour', 'planMinute', 'cse_12', 'cse_13', 'cse_15',
       'cse_16', 'cse_18', 'cse_20', 'cse_21', 'cse_12_abs', 'cse_13_abs',
       'cse_15_abs', 'cse_16_abs', 'cse_18_abs', 'cse_20_abs', 'cse_21_abs',
       'cnt_date_lineName', 'cnt_date_trainNo',
       'cnt_date_trainNo_directionCode', 'cnt_date_trainNo_hour',
       'cnt_date_trainNo_hour_minute_10', 'cnt_date_trainNo_am_pm',
       'cnt_trainNo', 'cnt_date', 'cnt_stopStation', 'cnt_date_stopStation',
       'cnt_date_hour_stopStation', 'date_id', 'mean_suf1hour_delayTime',
       'mean_suf1hour_direction_delayTime', 'lineName_last_delay_mean',
       'lineName_last_delay_sum', 'lineName_last_delay_max',
       'lineName_last_delay_min', 'trainNo_last_delay_mean',
       'trainNo_last_del

In [81]:
len(t[t.lastTrainNoDelay.isna()])

2197713

In [70]:
len(t)

2748114

In [110]:
pd.concat([train, test]).groupby(["date_id",
                         "lineName",
                         "hour"]
                       ).date_and_time.max()

date_id  lineName  hour
0        0         17                     NaT
                   18                     NaT
                   19                     NaT
                   20     2019-12-01 17:59:00
         1         17                     NaT
                                  ...        
194      2         22     2020-10-22 19:59:00
         3         17                     NaT
                   18                     NaT
                   19                     NaT
                   20     2020-10-22 17:59:00
Name: date_and_time, Length: 3494, dtype: datetime64[ns]

In [112]:
train.date_and_time

0         NaT
1         NaT
2         NaT
3         NaT
4         NaT
           ..
1488880   NaT
1488881   NaT
1488882   NaT
1488883   NaT
1488884   NaT
Name: date_and_time, Length: 1488885, dtype: datetime64[ns]

In [59]:
train.columns

Index(['id', 'date', 'lineName', 'directionCode', 'trainNo', 'stopStation',
       'planArrival', 'delayTime', 'target', 'stop_and_direction', 'hour',
       'minute', 'minute_10', 'dayofweek', 'am_pm', 'is_holiday', 'planHour',
       'planMinute', 'cse_12', 'cse_13', 'cse_15', 'cse_16', 'cse_18',
       'cse_20', 'cse_21', 'cse_12_abs', 'cse_13_abs', 'cse_15_abs',
       'cse_16_abs', 'cse_18_abs', 'cse_20_abs', 'cse_21_abs',
       'cnt_date_lineName', 'cnt_date_trainNo',
       'cnt_date_trainNo_directionCode', 'cnt_date_trainNo_hour',
       'cnt_date_trainNo_hour_minute_10', 'cnt_date_trainNo_am_pm',
       'cnt_trainNo', 'cnt_date', 'cnt_stopStation', 'cnt_date_stopStation',
       'cnt_date_hour_stopStation', 'date_id', 'mean_suf1hour_delayTime',
       'mean_suf1hour_direction_delayTime', 'lastLineNameDelay',
       'lastTrainNoDelay', 'date_and_time', 'lastStationDelay', 'diff_delay'],
      dtype='object')

In [61]:
train.lastLineNameDelay.value_counts()

0.000000     483881
1.000000      22326
2.000000      16641
0.133333      15908
0.030303      15836
              ...  
8.000000         34
23.000000        30
18.000000        29
8.500000         29
13.000000        26
Name: lastLineNameDelay, Length: 433, dtype: int64

In [62]:
test.lastLineNameDelay.value_counts()

0.000000     130443
0.045455       6607
0.090909       6014
0.080000       5204
0.400000       4219
              ...  
7.200000        590
13.900000       574
17.750000       392
16.666667       342
41.916667       266
Name: lastLineNameDelay, Length: 203, dtype: int64

In [133]:
# Baseline: within each (date, trainNo) group, carry the last known delayTime
# forward onto the following rows; rows with no earlier value in their group
# get 0. The result is joined onto the ids of the sample submission.
tst = pd.read_csv("JR/test.csv")
ss = pd.read_csv("JR/sample_submit.csv", usecols=[0], names=["id"])
# `.ffill()` replaces `fillna(method="ffill")`, which pandas has deprecated.
tst["delayTime"] = tst.groupby(["date", "trainNo"]).delayTime.transform(
    lambda x: x.ffill().fillna(0)
)
submit = ss.merge(tst[["id", "delayTime"]], on="id", how="left")


In [138]:
tst[tst.target == 1].delayTime.mean()

0.6901155952139525

In [153]:
tst[tst.target == 1].delayTime

50         3.0
51         3.0
52         3.0
53         3.0
54         3.0
          ... 
1259125    0.0
1259126    0.0
1259127    0.0
1259128    0.0
1259228    0.0
Name: delayTime, Length: 123275, dtype: float64