# Data Splits

In [11]:
import os
from glob import glob
import pandas as pd
import numpy as np

# Repository root. Set the CV_TOOLKIT_ROOT environment variable to point at
# your own checkout; the fallback is the original hard-coded location.
REPO_ROOT = os.environ.get("CV_TOOLKIT_ROOT", "/home/mdorosan/2023/cv-toolkit")
# Metadata CSV for the dataset.
META_PATH = os.path.join(REPO_ROOT, "metadata/kvasir-capsule.csv")
# Root directory of the image dataset.
DATA_ROOT = os.path.join(REPO_ROOT, "datasets/kvasir-capsule")

In [12]:
# Experiment output directory -- update the path below with the experiment name.
EXP_PATH = os.path.join(
    REPO_ROOT,
    "tutorials/tensorflow_notebooks/classification/test_runs/kvasir-capsule")
os.makedirs(EXP_PATH, exist_ok=True)

import sys

# Put the repository root on the import path. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

## Construct metadata from paths

In [28]:
# fname parser : "<root>/<class>/<case_key>_<frame_key>.jpg"
def path_parser(fpath):
    """Parse metadata from an image file path.

    The path is expected to end in ``<class>/<case_key>_<frame_key>.jpg``.
    Components are split with ``os.path`` so the parser uses the same
    separator rules as the ``os.path.join`` calls that build the paths.

    Parameters
    ----------
    fpath : str
        Path to an image file located inside its class directory.

    Returns
    -------
    dict
        ``img_id``  : ``"<class>_<fname>"``,
        ``fname``   : file name including extension,
        ``case_id`` : part of the file name before the first underscore,
        ``target``  : name of the parent (class) directory.

    Raises
    ------
    ValueError
        If ``fpath`` has no parent directory to take the class from.
    """
    class_dir, fname = os.path.split(fpath)
    class_ = os.path.basename(class_dir)
    if not class_:
        raise ValueError(
            f"cannot infer class from {fpath!r}: expected '<class>/<file>'")

    # Only the text before the first underscore is the case key.
    case_id = fname.split("_", 1)[0]
    # compound class and fname address multilabel collisions
    img_id = "_".join([class_, fname])
    return {
        "img_id" : img_id,
        "fname" : fname,
        "case_id" : case_id,
        "target" : class_,
    }

In [29]:
# Every entry exactly one level below DATA_ROOT, i.e. <DATA_ROOT>/<class>/<file>.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the row order of `metadata` reproducible across runs and machines.
paths = sorted(glob(os.path.join(DATA_ROOT, '*', '*')))

# One metadata record per file, parsed from its path.
rows = [path_parser(path) for path in paths]

metadata = pd.DataFrame(rows)

In [30]:
# Preview the first five rows of the path-derived metadata.
metadata.head(n=5)

Unnamed: 0,img_id,fname,case_id,target
0,Ampulla of vater_eb0203196e284797_1157.jpg,eb0203196e284797_1157.jpg,eb0203196e284797,Ampulla of vater
1,Ampulla of vater_eb0203196e284797_1158.jpg,eb0203196e284797_1158.jpg,eb0203196e284797,Ampulla of vater
2,Ampulla of vater_eb0203196e284797_1160.jpg,eb0203196e284797_1160.jpg,eb0203196e284797,Ampulla of vater
3,Ampulla of vater_eb0203196e284797_1167.jpg,eb0203196e284797_1167.jpg,eb0203196e284797,Ampulla of vater
4,Ampulla of vater_eb0203196e284797_1168.jpg,eb0203196e284797_1168.jpg,eb0203196e284797,Ampulla of vater


## Load metadata if available

In [22]:
TARGET_KEY = "finding_class" # used to stratify and get y
GROUP_KEY = "video_id" # used for grouped splits

# The metadata file is semicolon-separated.
metadata = pd.read_csv(META_PATH, sep=";")
display(metadata.head())

# filtering for sample task: list the TARGET_KEY values to keep,
# or leave the list empty to keep every class.
select_class = []
if select_class:
    metadata = metadata.loc[metadata[TARGET_KEY].isin(select_class)]

# X holds every column except the target; y is the target column.
X, y = metadata.drop(columns=[TARGET_KEY]), metadata[TARGET_KEY]

Unnamed: 0,filename,video_id,frame_number,finding_category,finding_class,x1,y1,x2,y2,x3,y3,x4,y4
0,0728084c8da942d9_22803.jpg,0728084c8da942d9,22803,Luminal,Normal clean mucosa,,,,,,,,
1,0728084c8da942d9_22804.jpg,0728084c8da942d9,22804,Luminal,Normal clean mucosa,,,,,,,,
2,0728084c8da942d9_22805.jpg,0728084c8da942d9,22805,Luminal,Normal clean mucosa,,,,,,,,
3,0728084c8da942d9_22806.jpg,0728084c8da942d9,22806,Luminal,Normal clean mucosa,,,,,,,,
4,0728084c8da942d9_22807.jpg,0728084c8da942d9,22807,Luminal,Normal clean mucosa,,,,,,,,


## Stratified shuffle splits

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit

# Set to True to write each fold to <EXP_PATH>/fold-<i>/train.csv and val.csv.
# The other split cells use the same fold-<i> paths, so enable saving for a
# single splitting strategy only.
SAVE_SPLITS = False

# Two random 80/20 splits stratified on y.
# NOTE: no group information is passed here, so rows that share a group value
# are not kept together on one side of the split.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)

for i, (train_index, val_index) in enumerate(sss.split(X, y)):
    # The splitter yields positional indices, hence iloc.
    train = metadata.iloc[train_index]
    val = metadata.iloc[val_index]

    display(train.shape)
    display(val.shape)

    if SAVE_SPLITS:
        fold_dir = os.path.join(EXP_PATH, f"fold-{i}")
        os.makedirs(fold_dir, exist_ok=True)
        train.to_csv(os.path.join(fold_dir, "train.csv"), index=False)
        val.to_csv(os.path.join(fold_dir, "val.csv"), index=False)

(37798, 13)

(9450, 13)

(37798, 13)

(9450, 13)

## Grouped shuffle splits

In [19]:
from sklearn.model_selection import GroupShuffleSplit

# Set to True to write each fold to <EXP_PATH>/fold-<i>/train.csv and val.csv.
# The other split cells use the same fold-<i> paths, so enable saving for a
# single splitting strategy only.
SAVE_SPLITS = False

X, y = metadata.drop(columns=[TARGET_KEY]), metadata[TARGET_KEY]
groups = metadata[GROUP_KEY]
# Two random splits in which all rows of a group stay on the same side.
# test_size applies to groups, so the row counts are only roughly 80/20.
gss = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=0)

for i, (train_index, val_index) in enumerate(gss.split(X, y, groups)):
    # The splitter yields positional indices, hence iloc.
    train = metadata.iloc[train_index]
    val = metadata.iloc[val_index]

    display(train.shape)
    display(val.shape)

    if SAVE_SPLITS:
        fold_dir = os.path.join(EXP_PATH, f"fold-{i}")
        os.makedirs(fold_dir, exist_ok=True)
        train.to_csv(os.path.join(fold_dir, "train.csv"), index=False)
        val.to_csv(os.path.join(fold_dir, "val.csv"), index=False)

(36019, 13)

(11229, 13)

(39094, 13)

(8154, 13)

## Grouped-stratified splits

In [21]:
from sklearn.model_selection import StratifiedGroupKFold

# Set to True to write each fold to <EXP_PATH>/fold-<i>/train.csv and val.csv.
# The other split cells use the same fold-<i> paths, so enable saving for a
# single splitting strategy only.
SAVE_SPLITS = False

X, y = metadata.drop(columns=[TARGET_KEY]), metadata[TARGET_KEY]
groups = metadata[GROUP_KEY]
# Three folds that keep each group on one side while stratifying on y.
sgkf = StratifiedGroupKFold(n_splits=3)

for i, (train_index, val_index) in enumerate(sgkf.split(X, y, groups)):
    # The splitter yields positional indices, hence iloc.
    train = metadata.iloc[train_index]
    val = metadata.iloc[val_index]

    display(train.shape)
    display(val.shape)

    if SAVE_SPLITS:
        fold_dir = os.path.join(EXP_PATH, f"fold-{i}")
        os.makedirs(fold_dir, exist_ok=True)
        train.to_csv(os.path.join(fold_dir, "train.csv"), index=False)
        val.to_csv(os.path.join(fold_dir, "val.csv"), index=False)

(32146, 13)

(15102, 13)

(30067, 13)

(17181, 13)

(32283, 13)

(14965, 13)