In [26]:
import os.path as osp
import os
import pickle
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import GroupKFold

from itertools import combinations
from imblearn.over_sampling import RandomOverSampler

In [27]:
# Load the label table: one row per sample, with an `ID` column used as the
# grouping key and a `label_pid` column used as the target.
X = pd.read_csv('development_status_label.csv')
# Grouping key and integer target are kept as single-column frames.
groups = X[['ID']]
y = X[['label_pid']].astype(int)

In [28]:
# Split all rows into 10 segments such that rows sharing an ID stay together.
gkf = GroupKFold(n_splits=10)

# split() yields (rest_indices, held_out_indices) pairs; transpose them into
# one list of 9/10-size index arrays and one list of 1/10-size index arrays.
nine_tenth, one_tenth = (list(part) for part in zip(*gkf.split(X, y, groups)))

In [29]:
# one_tenth holds the 10 held-out index arrays produced by GroupKFold, each
# covering roughly 1/10 of the rows. The split is grouped by ID (pid), so no
# ID is shared between any two of these arrays.
print(one_tenth)

[array([  0,   2,  12,  23,  41,  57,  67,  70,  74,  84,  93, 122, 129,
       132, 140]), array([  1,  11,  24,  40,  43,  56,  66,  73,  83,  94, 112, 128, 138,
       139]), array([ 10,  15,  25,  39,  49,  55,  65,  72,  82,  95, 111, 114, 127,
       137]), array([  9,  16,  26,  38,  48,  54,  64,  81,  86,  96, 110, 120, 126,
       136]), array([  7,  17,  27,  35,  37,  47,  63,  80,  87,  97, 109, 119, 125,
       135]), array([ 18,  28,  32,  34,  36,  46,  62,  78,  88,  98, 106, 108, 118,
       134]), array([  6,  19,  29,  33,  45,  50,  60,  89,  99, 103, 105, 107, 117,
       133]), array([  5,   8,  20,  30,  44,  52,  53,  77,  90, 100, 104, 116, 121,
       131]), array([  4,  14,  21,  31,  42,  59,  69,  76,  79,  91, 101, 115, 123,
       124]), array([  3,  13,  22,  51,  58,  61,  68,  71,  75,  85,  92, 102, 113,
       130])]


In [30]:
# nine_tenth holds the 10 complementary index arrays, each covering roughly
# 9/10 of the rows: nine_tenth[i] is every row that is NOT in one_tenth[i],
# so nine_tenth[i] and one_tenth[i] never share a pid.
# Note that, unlike the one_tenth arrays, the nine_tenth arrays overlap
# heavily with one another.
print(nine_tenth)
# Therefore only one_tenth is used below to assign the test/validation labels;
# the rows left unlabelled are intended to serve as training data.

[array([  1,   3,   4,   5,   6,   7,   8,   9,  10,  11,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  24,  25,  26,  27,  28,  29,
        30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
        58,  59,  60,  61,  62,  63,  64,  65,  66,  68,  69,  71,  72,
        73,  75,  76,  77,  78,  79,  80,  81,  82,  83,  85,  86,  87,
        88,  89,  90,  91,  92,  94,  95,  96,  97,  98,  99, 100, 101,
       102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
       115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128,
       130, 131, 133, 134, 135, 136, 137, 138, 139]), array([  0,   2,   3,   4,   5,   6,   7,   8,   9,  10,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  41,  42,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  57,
        5

In [31]:
# Add a `split` column holding the placeholder "none" on every row; rows are
# relabelled as test/validation further down.
X = X.assign(split="none")

In [32]:
# Reserve the first two held-out segments as the fixed test set, labelling
# both of them in a single assignment.
X.loc[np.concatenate(one_tenth[:2]), 'split'] = "test"

In [33]:
# Distinct IDs of all rows that were marked as test data.
test_ids = X.loc[X['split'] == 'test', 'ID'].unique()

In [34]:
# Segments 0 and 1 were taken as test data above, so the 8 remaining
# segments are kept here.
one_tenth_left = one_tenth[2:]

In [None]:
# Build every train/val/test fold file.
#
# The two test segments are fixed (see `test_ids`). Each unordered pair drawn
# from the remaining segments in `one_tenth_left` serves once as the
# validation set; the rows still carrying the "none" placeholder form the
# training set, which is then randomly oversampled to balance the labels.
path = '../01_skeleton_extraction/my_skeleton_labeled/B_4_1/'
out_dir = '../01_skeleton_extraction/my_skeleton/B/4'
os.makedirs(out_dir, exist_ok=True)


def _as_id_set(ids):
    """Return `ids` as a set of strings for matching against file-name prefixes."""
    # NOTE(review): assumes str(ID) equals the 4-character file-name prefix;
    # confirm against the CSV (e.g. leading zeros would be lost if IDs are ints).
    return {str(one_id) for one_id in ids}


def _dump(obj, file_path):
    """Pickle `obj` to `file_path` using the highest available protocol."""
    with open(file_path, 'wb') as out:
        pickle.dump(obj, out, protocol=pickle.HIGHEST_PROTOCOL)


# Read every skeleton pickle once: the files do not change between folds, so
# reloading them inside the loop would only repeat the same disk reads.
# File names are sorted because os.listdir() returns them in arbitrary order
# and the seeded oversampler's output depends on the order of its input.
# NOTE(review): pickle.load can execute arbitrary code; only point `path` at
# trusted files.
samples = []
for d in sorted(os.listdir(path)):
    if d.endswith('.pkl'):
        with open(osp.join(path, d), 'rb') as f:
            content = pickle.load(f)
        # The first 4 characters of the file name are used as the sample's ID.
        samples.append((d[:4], content))

# The test set is identical for every fold, so it is assembled only once.
test_id_set = _as_id_set(test_ids)
test_final = [content for cur_id, content in samples if cur_id in test_id_set]

n_segments = len(one_tenth_left)
for cnt, comb in enumerate(combinations(range(n_segments), 2), start=1):
    print("a combination of:", comb[0], "and", comb[1])

    # Work on a copy so the labels of one fold never leak into the next.
    fold_dnx = X.copy()

    # Index into one_tenth_left (NOT one_tenth): segments 0 and 1 of one_tenth
    # are the test data and must never be relabelled as validation.
    # X keeps the default RangeIndex from read_csv, so the positional indices
    # returned by GroupKFold are valid .loc labels.
    vals = np.hstack([one_tenth_left[comb[0]], one_tenth_left[comb[1]]])
    fold_dnx.loc[vals, 'split'] = "val"

    val_id_set = _as_id_set(fold_dnx.loc[fold_dnx['split'] == 'val', 'ID'].unique())
    # Training rows are those still carrying the initial "none" placeholder.
    train_id_set = _as_id_set(fold_dnx.loc[fold_dnx['split'] == 'none', 'ID'].unique())

    # Each sample is wrapped in a one-element list so the oversampler receives
    # a 2-D (n_samples, 1) input.
    result_train = []
    result_val = []
    for cur_id, content in samples:
        if cur_id in test_id_set:
            continue  # test membership takes priority, as before
        if cur_id in train_id_set:
            result_train.append([content])
        elif cur_id in val_id_set:
            result_val.append([content])

    if not result_train:
        raise ValueError(
            f"fold {cnt}: no training samples matched; check that the file-name "
            "prefixes in `path` correspond to the values of the ID column"
        )

    train_labelLi = [int(item[0]['label']) for item in result_train]

    # Balance the training labels by duplicating minority-class samples.
    oversampler = RandomOverSampler(random_state=0)
    train_oversampled, _ = oversampler.fit_resample(result_train, train_labelLi)

    print("result_trian:", len(result_train))
    print("result_val:", len(result_val))
    print("result_test:", len(test_final))

    # Unwrap the one-element lists again before saving.
    train_final = [item[0] for item in train_oversampled]
    val_final = [item[0] for item in result_val]

    print("train_final:", len(train_final))
    print("test_final:", len(test_final))
    print("val_final:", len(val_final))

    _dump(train_final, f'{out_dir}/fold_{cnt}_train.pkl')
    _dump(val_final, f'{out_dir}/fold_{cnt}_val.pkl')
    _dump(test_final, f'{out_dir}/fold_{cnt}_test.pkl')
