In [1]:
import numpy as np
import pandas as pd
import random
import os
from sklearn.model_selection import StratifiedKFold

# Global seed for the notebook's own random number generators.
SEED = 2112

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so exporting it here
# only affects child processes launched later, not the hash seed of this kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed the stdlib and the legacy NumPy global generators.
np.random.seed(SEED)
random.seed(SEED)

***
Generate 10 different stratified 5-fold splits (one per seed) for repeated cross-validation.

In [2]:
# Load the training labels (one `customer_ID` / `target` pair per row).
train_labels = pd.read_csv("../data/raw/train_labels.csv")

# The fold files written later are keyed by customer_ID, so every customer must
# appear exactly once; otherwise a customer could end up in several folds.
assert train_labels["customer_ID"].is_unique, "duplicate customer_ID in train_labels"

# Print the schema, non-null counts and memory footprint as a quick sanity check.
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


In [3]:
#train_labels["customer_id"] = train_labels["customer_ID"].str[-16:].apply(lambda x: int(x, 16))
#assert (train_labels.groupby("customer_ID")["customer_id"].nunique() == 1).all()

In [4]:
# One `random_state` per repetition of the stratified K-fold split (10 repetitions).
super_seeds = [
    2, 7, 11, 23, 2112,
    2222, 3333, 5555, 7777, 11111111,
]

In [5]:
# Build one stratified 5-fold assignment per seed, keep it in `all_splits`,
# and persist it as ../data/processed/cv{it}.csv (columns: customer_ID, fold).
all_splits = []

# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs("../data/processed", exist_ok=True)

for it, seed in enumerate(super_seeds):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    skf_split = skf.split(train_labels, train_labels["target"].values)

    # `split` yields *positional* row indices, so collect the assignment in a
    # plain array instead of writing through the label-based `.loc` accessor
    # (which is only equivalent while the frame carries a default RangeIndex).
    fold_ids = np.full(len(train_labels), -1, dtype=np.int64)
    for fold, (_, valid_idx) in enumerate(skf_split):
        fold_ids[valid_idx] = fold

    # assert that all samples are assigned to one fold
    assert (fold_ids >= 0).all(), "some samples were not assigned to a fold"

    # Array assignment is positional, so rows stay aligned with `train_labels`.
    folds = train_labels[["customer_ID"]].copy()
    folds["fold"] = fold_ids

    print("-"*90)
    # Class balance per fold; rows are already aligned, so no merge is required.
    print(train_labels.assign(fold=fold_ids).groupby("fold")["target"].value_counts())

    all_splits.append(folds)
    folds.to_csv(f"../data/processed/cv{it}.csv", index=False)

------------------------------------------------------------------------------------------
fold  target
0     0         68017
      1         23766
1     0         68017
      1         23766
2     0         68017
      1         23766
3     0         68017
      1         23765
4     0         68017
      1         23765
Name: target, dtype: int64
------------------------------------------------------------------------------------------
fold  target
0     0         68017
      1         23766
1     0         68017
      1         23766
2     0         68017
      1         23766
3     0         68017
      1         23765
4     0         68017
      1         23765
Name: target, dtype: int64
------------------------------------------------------------------------------------------
fold  target
0     0         68017
      1         23766
1     0         68017
      1         23766
2     0         68017
      1         23766
3     0         68017
      1         23765
4     0         68

In [6]:
# True only if splits 0 and 1 assign every row to the same fold; different seeds are expected to give False.
(all_splits[0]["fold"] == all_splits[1]["fold"]).all()

False

In [7]:
# True only if splits 1 and 2 assign every row to the same fold; different seeds are expected to give False.
(all_splits[1]["fold"] == all_splits[2]["fold"]).all()

False

In [8]:
# True only if splits 2 and 3 assign every row to the same fold; different seeds are expected to give False.
(all_splits[2]["fold"] == all_splits[3]["fold"]).all()

False

In [9]:
# True only if splits 3 and 4 assign every row to the same fold; different seeds are expected to give False.
(all_splits[3]["fold"] == all_splits[4]["fold"]).all()

False

In [10]:
# True only if splits 4 and 5 assign every row to the same fold; different seeds are expected to give False.
(all_splits[4]["fold"] == all_splits[5]["fold"]).all()

False

In [11]:
# True only if splits 5 and 6 assign every row to the same fold; different seeds are expected to give False.
(all_splits[5]["fold"] == all_splits[6]["fold"]).all()

False

In [12]:
# True only if splits 6 and 7 assign every row to the same fold; different seeds are expected to give False.
(all_splits[6]["fold"] == all_splits[7]["fold"]).all()

False

In [13]:
# True only if splits 7 and 8 assign every row to the same fold; different seeds are expected to give False.
(all_splits[7]["fold"] == all_splits[8]["fold"]).all()

False

In [14]:
# True only if splits 8 and 9 assign every row to the same fold; different seeds are expected to give False.
(all_splits[8]["fold"] == all_splits[9]["fold"]).all()

False

***