In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so wide/long frames are not elided too early.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 1000)

# Read the train and test CSVs from the input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# -1 is the placeholder fold id; later cells overwrite it with 0..4.
train_df["KFold"] = -1
train_df

Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,KFold
0,0,20191204,A,1,AFGKvJ,Rlfq,06:11,0.0,-1
1,1,20191204,A,1,AFGKvJ,coZB,06:13,0.0,-1
2,2,20191204,A,1,AFGKvJ,LMww,06:16,0.0,-1
3,3,20191204,A,1,AFGKvJ,VNyR,06:18,0.0,-1
4,4,20191204,A,1,AFGKvJ,jhlV,06:20,0.0,-1
...,...,...,...,...,...,...,...,...,...
1488880,1488880,20201022,D,2,zbKwea,BCRD,18:47,0.0,-1
1488881,1488881,20201022,D,2,zbKwea,AVjc,18:51,1.0,-1
1488882,1488882,20201022,D,2,zbKwea,uYlv,18:53,0.0,-1
1488883,1488883,20201022,D,2,zbKwea,wwYD,18:59,0.0,-1


In [2]:
# Number of rows per date in each file (non-null `id` values are counted).
train_date = (
    train_df.groupby("date")[["id"]]
    .count()
    .sort_index()
    .reset_index()
)
test_date = (
    test_df.groupby("date")[["id"]]
    .count()
    .sort_index()
    .reset_index()
)

# Outer merge keeps dates found in only one file; the missing side's count
# becomes NaN in `id_train` / `id_test`.
date_df = pd.merge(
    train_date,
    test_date,
    on="date",
    how="outer",
    suffixes=["_train", "_test"],
).sort_values("date")

def func(x):
    """Classify one date row by which files contain that date.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``"id_train"`` and ``"id_test"``; a missing (NaN) value
        means the date does not occur in that file.

    Returns
    -------
    str
        ``"train_test"`` if both counts are present, ``"train"`` if only
        ``id_train`` is present, ``"test"`` if only ``id_test`` is present.

    Raises
    ------
    ValueError
        If both counts are missing. The previous ``assert("error")`` asserted
        a non-empty string, which is always truthy, so this case silently
        returned ``None`` instead of failing.
    """
    is_train = bool(pd.notna(x["id_train"]))
    is_test = bool(pd.notna(x["id_test"]))

    if is_train and is_test:
        return "train_test"
    if is_train:
        return "train"
    if is_test:
        return "test"
    raise ValueError("error: row has neither id_train nor id_test")

# Label every date row with the file(s) it appears in, using `func` row-wise.
appearance_by_row = date_df.apply(func, axis=1)
date_df["appearance"] = appearance_by_row
date_df

Unnamed: 0,date,id_train,id_test,appearance
83,20191201,,12596.0,test
84,20191202,,24606.0,test
85,20191203,,13307.0,test
0,20191204,31756.0,,train
1,20191205,31679.0,,train
2,20191206,31654.0,,train
86,20191207,,12576.0,test
87,20191209,,24857.0,test
88,20191210,,28787.0,test
3,20191211,15108.0,13577.0,train_test


In [3]:
from sklearn.model_selection import KFold

# Dates that occur only in the training file, indexed by date. Each of these
# dates is sent to a single fold as a whole.
date_df_tr = date_df.loc[date_df["appearance"] == "train"].set_index("date")

kf = KFold(n_splits=5, random_state=0, shuffle=True)

date_df_tr["KFold"] = -1
for fold, (_, holdout_pos) in enumerate(kf.split(date_df_tr)):
    # Dates held out in this fold (positions from the splitter -> date labels).
    holdout_dates = date_df_tr.index[holdout_pos].to_list()
    print(holdout_dates)
    # Stamp the fold id on every training row belonging to those dates.
    in_holdout = train_df["date"].isin(holdout_dates)
    train_df.loc[in_holdout, "KFold"] = fold

train_df

[20191206, 20191214, 20200118, 20200306, 20200318, 20200414, 20200426, 20200818, 20200907, 20200918, 20201022]
[20200106, 20200116, 20200123, 20200226, 20200310, 20200314, 20200326, 20200622, 20200630, 20200707, 20200928]
[20191222, 20200111, 20200125, 20200129, 20200212, 20200219, 20200316, 20200508, 20200914, 20201005, 20201016]
[20191205, 20191231, 20200119, 20200122, 20200210, 20200227, 20200304, 20200305, 20200711, 20200801, 20200831]
[20191204, 20191212, 20200112, 20200217, 20200224, 20200624, 20200709, 20200903, 20200916, 20201002]


Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,KFold
0,0,20191204,A,1,AFGKvJ,Rlfq,06:11,0.0,4
1,1,20191204,A,1,AFGKvJ,coZB,06:13,0.0,4
2,2,20191204,A,1,AFGKvJ,LMww,06:16,0.0,4
3,3,20191204,A,1,AFGKvJ,VNyR,06:18,0.0,4
4,4,20191204,A,1,AFGKvJ,jhlV,06:20,0.0,4
...,...,...,...,...,...,...,...,...,...
1488880,1488880,20201022,D,2,zbKwea,BCRD,18:47,0.0,0
1488881,1488881,20201022,D,2,zbKwea,AVjc,18:51,1.0,0
1488882,1488882,20201022,D,2,zbKwea,uYlv,18:53,0.0,0
1488883,1488883,20201022,D,2,zbKwea,wwYD,18:59,0.0,0


In [4]:
# Dates present in both the train and the test file.
date_df_tr_te = date_df[date_df["appearance"] == "train_test"]

# Rows still carrying the -1 placeholder vs. rows whose date is shared with test.
unassigned_mask = train_df["KFold"] == -1
shared_date_mask = train_df["date"].isin(date_df_tr_te["date"].to_list())

print(train_df[unassigned_mask].shape)
print(train_df[shared_date_mask].shape)

# Compare the masks themselves rather than the shapes of the two selections:
# equal shapes only prove equal row counts, not that the same rows are selected.
assert unassigned_mask.equals(shared_date_mask), (
    "rows without a fold do not coincide with rows on train_test dates"
)

(455288, 9)
(455288, 9)


In [7]:
# Dates present in both files: folds are assigned per trainNo within each date,
# so all rows of one train on one date share a fold.
date_df_tr_te = date_df[date_df["appearance"] == "train_test"]

# Built once instead of once per date. With an integer random_state each
# split() call is seeded the same way, so the resulting folds are unchanged.
kf = KFold(n_splits=5, random_state=0, shuffle=True)

for date in date_df_tr_te["date"]:
    print(date)
    # Select this date's rows once per date rather than re-filtering the whole
    # training frame for every fold.
    date_rows = train_df[train_df["date"] == date]
    trainNo_df = date_rows.groupby("trainNo").count()[["id"]]
    for fold, (tr_ind, val_ind) in enumerate(kf.split(trainNo_df)):
        val_trainNo = trainNo_df.iloc[val_ind].index.to_list()
        train_index = date_rows.index[date_rows["trainNo"].isin(val_trainNo)]
        # Guard: the rows about to be overwritten must currently share a single
        # KFold value. Read from train_df (live), not the date_rows snapshot.
        if train_df.loc[train_index, "KFold"].nunique() != 1:
            raise ValueError("error!")
        train_df.loc[train_index, "KFold"] = fold


20191211
20191213
20191216
20191217
20191220
20191225
20200107
20200108
20200109
20200110
20200114
20200115
20200120
20200121
20200127
20200128
20200130
20200131
20200203
20200204
20200205
20200206
20200213
20200214
20200221
20200528
20200629
20200826
20201013


Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,KFold


Unnamed: 0,id,date,lineName,directionCode,trainNo,stopStation,planArrival,delayTime,KFold
0,0,20191204,A,1,AFGKvJ,Rlfq,06:11,0.0,4
1,1,20191204,A,1,AFGKvJ,coZB,06:13,0.0,4
2,2,20191204,A,1,AFGKvJ,LMww,06:16,0.0,4
3,3,20191204,A,1,AFGKvJ,VNyR,06:18,0.0,4
4,4,20191204,A,1,AFGKvJ,jhlV,06:20,0.0,4
5,5,20191204,A,1,AFGKvJ,efzW,06:22,0.0,4
6,6,20191204,A,1,AFGKvJ,PcxI,06:25,0.0,4
7,7,20191204,A,1,AFGKvJ,ejfb,06:27,0.0,4
8,8,20191204,A,1,AFGKvJ,RDLf,06:28,0.0,4
9,9,20191204,A,1,AFGKvJ,cRgf,06:31,0.0,4
