In [52]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import GroupKFold

In [66]:
# Output root handed to save_split_df(); the per-fold CSVs land in its train/ and val/ subfolders.
SAVE_DIR        = '.cache/'
# Annotation CSV that is loaded into train_df.
TRAIN_CSV       = './train.csv'
# class_id values that split_df_Kfold() drops by default (14 is labelled "No finding" in the train.csv preview).
IGNORE_CLASSES  = [14]

In [42]:
# Load the annotation table from TRAIN_CSV and preview its first rows.
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,2332,2580
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,2954,3159
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,2540,3072


In [41]:
def split_df_Kfold(
    df: pd.DataFrame,
    ignore_classes: list = IGNORE_CLASSES,
    n_splits: int = 5,
) -> pd.DataFrame:
    '''
    Assign a cross-validation fold to every annotation row, grouped by image.

    Rows whose ``class_id`` appears in ``ignore_classes`` are removed first.
    The remaining rows are split with ``GroupKFold`` using ``image_id`` as the
    group key, so every row of a given image ends up in the same fold.

    Args:
        df: Annotation table with at least ``class_id`` and ``image_id``
            columns. The caller's frame is not modified.
        ignore_classes: ``class_id`` values to drop before splitting.
            ``None`` is treated as "drop nothing".
        n_splits: Number of folds to create.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index, the previous index
        preserved in an ``index`` column, and an integer ``fold`` column
        holding the fold in which each row is used for validation.

    Raises:
        ValueError: propagated from ``GroupKFold`` when the split cannot be
            made (e.g. fewer distinct images than ``n_splits``).
    '''
    if ignore_classes is None:
        ignore_classes = []

    # One vectorised mask instead of re-filtering the frame once per class.
    kept = df.loc[~df.class_id.isin(list(ignore_classes))]

    # reset_index() returns a new frame (the input stays untouched) and keeps
    # the old row labels in an "index" column, which is part of the output
    # schema written to disk by downstream cells.
    kept = kept.reset_index()

    gkf = GroupKFold(n_splits=n_splits)
    kept["fold"] = -1
    # GroupKFold yields positional indices; they coincide with the labels
    # because the index was just reset to 0..n-1, so .loc is safe here.
    for fold, (_, val_idx) in enumerate(
        gkf.split(kept, groups=kept.image_id.tolist())
    ):
        kept.loc[val_idx, "fold"] = fold

    return kept


In [60]:
def get_train_val(df: pd.DataFrame, fold: int = 0) -> tuple:
    '''
    Split the image ids of ``df`` into training and validation lists.

    Args:
        df: Table with ``fold`` and ``image_id`` columns.
        fold: Fold to hold out for validation; must be one of the values
            present in ``df.fold``.

    Returns:
        ``(train_files, val_files)`` -- the unique image ids of the rows whose
        fold differs from ``fold``, and the unique image ids of the rows whose
        fold equals ``fold``.

    Raises:
        AssertionError: if ``fold`` does not occur in ``df.fold``.
    '''
    known_folds = list(df["fold"].unique())
    assert fold in known_folds, "fold index not available"

    held_out = df["fold"] == fold
    remaining = df["fold"] != fold

    val_files = list(df.loc[held_out, "image_id"].unique())
    train_files = list(df.loc[remaining, "image_id"].unique())

    return train_files, val_files

def save_split_df(df : pd.DataFrame, save_dir = '.cache') -> None:
    '''
    Write one training CSV and one validation CSV per fold.

    For every distinct value in ``df.fold`` the rows are divided with
    ``get_train_val`` and saved as ``<save_dir>/train/<fold>.csv`` and
    ``<save_dir>/val/<fold>.csv`` (without the DataFrame index).

    Args:
        df: Table with ``fold`` and ``image_id`` columns.
        save_dir: Root output directory; ``train`` and ``val`` subdirectories
            are created inside it if they do not exist yet.

    Raises:
        OSError: if an output directory cannot be created or a file cannot
            be written.
    '''
    train_dir = os.path.join(save_dir, 'train')
    val_dir = os.path.join(save_dir, 'val')
    # exist_ok=True makes re-runs safe and creates each directory
    # independently; the previous bare try/except skipped creating val/
    # whenever train/ already existed and silently hid every other error.
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    for fold in list(df.fold.unique()):
        train, val = get_train_val(df = df, fold = fold)
        train_df = df.loc[df.image_id.isin(train)]
        val_df = df.loc[df.image_id.isin(val)]
        train_df.to_csv(os.path.join(train_dir, f'{fold}.csv'), index=False)
        val_df.to_csv(os.path.join(val_dir, f'{fold}.csv'), index=False)

In [61]:
# Drop the default ignored classes, tag every remaining row with its fold, and preview the result.
splited_df = split_df_Kfold(train_df)
splited_df.head()

Unnamed: 0,index,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,fold
0,2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336,3
1,3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880,2
2,5,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,2540,3072,4
3,6,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,2285,2555,2
4,7,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,2568,3353,4


In [67]:
# Write every fold's train/val CSV under SAVE_DIR, then preview fold 1 of each.
save_split_df(df = splited_df, save_dir = SAVE_DIR)
# os.path.join builds the same paths save_split_df writes to, so the preview
# works whether or not SAVE_DIR ends with a path separator.
pd.read_csv(os.path.join(SAVE_DIR, 'train', '1.csv')).head(), pd.read_csv(os.path.join(SAVE_DIR, 'val', '1.csv')).head()

(   index                          image_id          class_name  class_id  \
 0      2  9a5094b2563a1ef3ff50dc5c7ff71345        Cardiomegaly         3   
 1      3  051132a778e61a86eb147c7c6f564dfe  Aortic enlargement         0   
 2      5  1c32170b4af4ce1a3030eb8167753b06  Pleural thickening        11   
 3      6  0c7a38f293d5f5e4846aa4ca6db4daf1                 ILD         5   
 4      7  47ed17dcb2cbeec15182ed335a8b5a9e         Nodule/Mass         8   
 
   rad_id   x_min   y_min   x_max   y_max  width  height  fold  
 0    R10   691.0  1375.0  1653.0  1831.0   2080    2336     3  
 1    R10  1264.0   743.0  1611.0  1019.0   2304    2880     2  
 2     R9   627.0   357.0   947.0   433.0   2540    3072     4  
 3    R17  1347.0   245.0  2188.0  2169.0   2285    2555     2  
 4     R9   557.0  2352.0   675.0  2484.0   2568    3353     4  ,
    index                          image_id          class_name  class_id  \
 0     10  7c1add6833d5f0102b0d3619a1682a64        Lung Opacity     