In [1]:
import pickle
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import pickle
import pandas as pd
from data.PurgedGroupTimeSeriesSplit import \
    PurgedGroupTimeSeriesSplit

In [2]:
# Container for results gathered later in the notebook.
scores = {}

# Load the training set; the path is relative to the notebook's
# working directory.
df = pd.read_csv("input/train.csv")

print("data loaded...")

data loaded...


In [3]:
from typing import Iterator, Tuple

import numpy as np
import pandas as pd


class GroupTimeSeriesSplit:
    """
    Walk-backwards, group-aware time series splitter.

    The unique group (time id) values are cut into consecutive chunks and
    the most recent ``n_folds`` chunks are used, newest first, as test
    folds. For every test fold the training set is made of all rows whose
    ``time_id`` is strictly smaller than the smallest id of the test fold
    (minus an optional ``gap``), so test ids never appear in the training
    data and the splits are temporal.

    Notes
    -----
    * The chunks come from ``np.array_split`` with
      ``len(unique_ids) // holdout_size`` sections, so they are *nearly*
      equal in size: every test fold holds at least ``holdout_size``
      unique ids, possibly a few more.
    * Row membership is decided from the ``'time_id'`` column of the frame
      passed to :meth:`split`; ``groups`` only supplies the set of ids.
    * The yielded indices are index *labels* of ``X`` (use ``X.loc``).
    * If ``n_folds`` reaches back to the very first chunk, that fold is
      yielded with an empty training index.

    Parameters
    ----------
    n_folds : int
        Maximum number of (train, test) pairs to yield.
    holdout_size : int
        Approximate (minimum) number of unique ids per test fold.
    groups : pd.Series or array-like
        Group id of every row, e.g. ``df['time_id']``.
    gap : int, default 0
        Number of id units to leave out between the end of the training
        data and the start of the test fold (``time_id < min_test - gap``).

    Raises
    ------
    ValueError
        If ``holdout_size`` is not positive or ``gap`` is negative.
    """
    def __init__(self, n_folds: int, holdout_size: int, groups: pd.Series,
                 gap: int = 0) -> None:
        if holdout_size < 1:
            raise ValueError(
                f"holdout_size must be a positive integer, got {holdout_size}"
            )
        if gap < 0:
            # A negative gap would let test ids leak into the training set
            raise ValueError(f"gap must be non-negative, got {gap}")

        self.n_folds = n_folds
        self.holdout_size = holdout_size
        self.groups = groups
        self.gap = gap

    def split(self, X: pd.DataFrame) -> Iterator[Tuple[pd.Index, pd.Index]]:
        """
        Yield ``(train_index, test_index)`` pairs, most recent fold first.

        Parameters
        ----------
        X : pd.DataFrame
            Frame to split; must contain a ``'time_id'`` column.

        Yields
        ------
        train_index, test_index : pd.Index
            Index labels of ``X`` for the training and test rows.

        Raises
        ------
        ValueError
            If there are fewer unique group ids than ``holdout_size``, so
            not even one test fold can be formed.
        """
        # Take the group column and get the (sorted) unique values
        unique_time_ids = np.unique(np.asarray(self.groups))

        n_chunks = len(unique_time_ids) // self.holdout_size
        if n_chunks < 1:
            # np.array_split would fail with an unhelpful message here
            raise ValueError(
                f"holdout_size={self.holdout_size} is larger than the number "
                f"of unique group ids ({len(unique_time_ids)})"
            )

        # Split the time ids into chunks of about holdout_size ids and
        # reverse so we work backwards in time, then keep the first
        # n_folds chunks (i.e. the most recent ones)
        array_split_time_ids = np.array_split(
            unique_time_ids, n_chunks
        )[::-1][:self.n_folds]

        time_id = X['time_id']

        for time_ids in array_split_time_ids:
            # Test index - rows whose time id belongs to this chunk.
            # Masking X.index directly avoids copying the whole sub-frame
            # only to read its index.
            test_index = X.index[time_id.isin(time_ids).to_numpy()]

            # Train index - every row strictly before the earliest test
            # time id, optionally leaving a gap. The subtraction is only
            # done when a gap is requested so non-numeric ids keep working.
            cutoff = np.min(time_ids)
            if self.gap:
                cutoff = cutoff - self.gap
            train_index = X.index[(time_id < cutoff).to_numpy()]

            yield train_index, test_index

In [13]:
# Sanity check of the splitter: for every fold print the train/val sizes
# and the range of time ids each side covers.
gtss = GroupTimeSeriesSplit(n_folds=5, holdout_size=150, groups=df['time_id'])
for fold, (tr, val) in enumerate(gtss.split(df)):
    # split() yields index *labels* of df, so rows are selected with .loc
    # (positional .iloc only agrees with it for a default RangeIndex).
    # Only the time_id column is pulled to avoid copying the full frame.
    train_time_ids = df.loc[tr, 'time_id']
    val_time_ids = df.loc[val, 'time_id']

    print('Sum: ', len(tr) + len(val))
    print('Frac: ', len(val)/len(tr))
    print('FOLD:', fold)
    print('Train:')
    print('Shape:', tr.shape)
    print(np.min(train_time_ids), '->', np.max(train_time_ids))
    print()

    print('Val:')
    print('Shape:', val.shape)
    print(np.min(val_time_ids), '->', np.max(val_time_ids))
    print()

Sum:  3141410
Frac:  0.18670773703480403
FOLD: 0
Train:
Shape: (2647164,)
0 -> 1068

Val:
Shape: (494246,)
1069 -> 1219

Sum:  2647164
Frac:  0.21583305124875818
FOLD: 1
Train:
Shape: (2177243,)
0 -> 917

Val:
Shape: (469921,)
918 -> 1068

Sum:  2177243
Frac:  0.2519474321266643
FOLD: 2
Train:
Shape: (1739085,)
0 -> 766

Val:
Shape: (438158,)
767 -> 917

Sum:  1739085
Frac:  0.29844865741445886
FOLD: 3
Train:
Shape: (1339356,)
0 -> 615

Val:
Shape: (399729,)
616 -> 766

Sum:  1339356
Frac:  0.3638410381580515
FOLD: 4
Train:
Shape: (982047,)
0 -> 464

Val:
Shape: (357309,)
465 -> 615



1211 time ids