In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

# Read the train and test CSVs in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Add a combined string key "<date> * <trainNo>" to both frames.
for df in (train_df, test_df):
    df["date * trainNo"] = df["date"].astype("str") + " * " + df["trainNo"]

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1488885 entries, 0 to 1488884
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   id              1488885 non-null  int64  
 1   date            1488885 non-null  int64  
 2   lineName        1488885 non-null  object 
 3   directionCode   1488885 non-null  int64  
 4   trainNo         1488885 non-null  object 
 5   stopStation     1488885 non-null  object 
 6   planArrival     1488885 non-null  object 
 7   delayTime       1488885 non-null  float64
 8   date * trainNo  1488885 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 102.2+ MB


In [2]:
# Row count per date in the training set, with "date" as a regular column.
train_date = (
    train_df.groupby("date")[["id"]]
    .count()
    .sort_index()
    .reset_index()
)

# Row count and summed "target" per date in the test set ("date" stays the index).
test_date = test_df.groupby("date").agg({"id": "count", "target": "sum"})

# Outer merge keeps dates found in either set; the shared "id" column is
# disambiguated as "id_train" / "id_test".
date_df = pd.merge(
    train_date,
    test_date,
    on="date",
    how="outer",
    suffixes=["_train", "_test"],
).sort_values("date")

def func(x):
    """Label one row of the per-date count table by where that date appears.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide ``"id_train"`` and ``"id_test"``. A missing (NaN) value
        is treated as "this date has no rows in that dataset".

    Returns
    -------
    str
        ``"train_test"`` when both counts are present, ``"train"`` when only
        ``id_train`` is present, ``"test"`` when only ``id_test`` is present.

    Raises
    ------
    ValueError
        If both counts are missing. The previous ``assert("error")`` asserted
        a non-empty string, which is always truthy, so this case used to fall
        through and return ``None`` silently.
    """
    is_train = pd.notna(x["id_train"])
    is_test = pd.notna(x["id_test"])

    if is_train and is_test:
        return "train_test"
    if is_train:
        return "train"
    if is_test:
        return "test"
    raise ValueError("row has neither an id_train nor an id_test count")

# Label every date row via func (applied row-wise).
date_df["appearance"] = date_df.apply(func, axis=1)
# Dates that appear only in train, only in test, or in both seem to be split at random.
# Should check whether the flag is set on dates that are in both train and test.
# Dates in both train and test don't necessarily have less train data, do they?
# Worth looking at which trains, on which lines and at which times of day, are in train vs. test.
# Roughly 10% of test rows have the flag set, and that holds for both train_test and test-only dates.
# So for the validation split: take half of each train_test date (or split by date), and split train-only dates by date.
# Want to stabilise CV by varying how the folds are cut with different random seeds.
# For some dates that are prediction targets in test, the initial delay time is known.
# For roughly 10% of date * trainNo groups, the initial delay time is available.

In [3]:
date_df

Unnamed: 0,date,id_train,id_test,target,appearance
83,20191201,,12596.0,809.0,test
84,20191202,,24606.0,2417.0,test
85,20191203,,13307.0,1327.0,test
0,20191204,31756.0,,,train
1,20191205,31679.0,,,train
2,20191206,31654.0,,,train
86,20191207,,12576.0,784.0,test
87,20191209,,24857.0,2546.0,test
88,20191210,,28787.0,2359.0,test
3,20191211,15108.0,13577.0,1447.0,train_test


In [4]:
# Per (date, trainNo) group in the test set: row count and summed "target".
test_g = (
    test_df
    .groupby(["date", "trainNo"])
    .agg({"id": "count", "target": "sum"})
)
test_g.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,target
date,trainNo,Unnamed: 2_level_1,Unnamed: 3_level_1
20191201,ABhZcD,29,12
20191201,ACHbwz,28,0
20191201,ADlAIX,10,0
20191201,AEYehL,4,0
20191201,AMXSxY,10,0
20191201,AQSTZh,14,0
20191201,AWnezd,1,0
20191201,AYCHrl,8,0
20191201,AqTAbk,5,0
20191201,AtkVng,16,0


In [None]:
# Number of distinct (date, trainNo) groups in the test set.
print(test_df["date * trainNo"].nunique())

# First row of every (date, trainNo) group, in the order rows appear in test_df.
# drop_duplicates(keep="first") is used rather than groupby().first() because
# the latter skips NaN and would return the first *non-missing* delayTime.
first_rows = test_df.drop_duplicates(subset=["date", "trainNo"], keep="first")

# Groups whose first row has a non-NaN delayTime.
known_first = first_rows.loc[first_rows["delayTime"].notna(), ["date", "trainNo"]]
known_keys = pd.MultiIndex.from_frame(known_first)

# Assign the whole column in one step. The previous per-key
# test_g["is_firstDelayTime"].loc[...] = 1 was a chained assignment, which
# does not reliably write back to test_g (and never does under Copy-on-Write),
# and it re-filtered all of test_df once per key.
test_g["is_firstDelayTime"] = test_g.index.isin(known_keys).astype("int64")
        

In [None]:
# Show the shape of test_g, then its column-wise sums, one per line.
print(test_g.shape, test_g.sum(), sep="\n")

In [5]:
# Read the decoded train/test CSVs; both files are read with the cp932 codec.
train_df_d, test_df_d = (
    pd.read_csv(csv_path, encoding="cp932")
    for csv_path in ("../input/decoded_train.csv", "../input/decoded_test.csv")
)

In [8]:
# Row count per (date, trainNo) group in the decoded training set.
train_g = train_df_d.groupby(["date", "trainNo"])[["id"]].count()


Unnamed: 0_level_0,Unnamed: 1_level_0,id
date,trainNo,Unnamed: 2_level_1


In [9]:
# Show the (date, trainNo) groups that have more than 50 rows.
train_g.loc[train_g["id"].gt(50)]

Unnamed: 0_level_0,Unnamed: 1_level_0,id
date,trainNo,Unnamed: 2_level_1
20201013,UFiyVP,62


In [11]:
# Inspect the rows of the one group found above (date 20201013, trainNo "UFiyVP").
train_df_d.loc[
    train_df_d["date"].eq(20201013) & train_df_d["trainNo"].eq("UFiyVP")
]

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime
1432921,1432921,20201013,京浜東北線,2,UFiyVP,JK46 さいたま新都心,16:14,66.0
1432922,1432922,20201013,京浜東北線,2,UFiyVP,JK45 与野,16:16,66.0
1432923,1432923,20201013,京浜東北線,2,UFiyVP,JK44 北浦和,16:18,66.0
1432924,1432924,20201013,京浜東北線,2,UFiyVP,JK43 浦和,16:21,67.0
1432925,1432925,20201013,京浜東北線,2,UFiyVP,JY05 上野,16:53,6.0
1432926,1432926,20201013,京浜東北線,2,UFiyVP,JY04 御徒町,16:55,6.0
1432927,1432927,20201013,京浜東北線,2,UFiyVP,JCxx 秋葉原,16:57,5.0
1432928,1432928,20201013,京浜東北線,2,UFiyVP,JC02 神田,16:59,5.0
1432929,1432929,20201013,京浜東北線,2,UFiyVP,JY01 東京,17:01,6.0
1432930,1432930,20201013,京浜東北線,2,UFiyVP,JY30 有楽町,17:03,5.0
