In [1]:
import os
import numpy as np
import yaml
from pathlib import Path

DATA_DIR = 'datasets/pa3_malaria_7_classes_8_folds_temp1'
IMG_FOLD_PATH = 'all_train/images'
IMG_TEST_PATH = 'all_train/images'


Create txt img

In [2]:
def get_files(data_dir, img_path, txt_path='files.txt'):
    files = []
    for file in os.listdir(img_path):
        path = os.path.join(data_dir, img_path, file)
        path = path.replace('\\', '/')
        files.append(path)
    with open(txt_path, 'w') as f:
        for file in files:
            f.write(file + '\n')

get_files(DATA_DIR, IMG_FOLD_PATH, 'all_train.txt')
# get_files(DATA_DIR, IMG_TEST_PATH, 'test.txt')

Create folds

In [3]:
from sklearn.model_selection import KFold

K_FOLDS = 8
FOLDS_FOULDERS = 'folds'
YAML_PATH = 'data.yaml'
ALL_IMG_PATH = 'all_train.txt'

os.makedirs(FOLDS_FOULDERS, exist_ok=True)
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

In [4]:
with open(YAML_PATH, "r", encoding="utf8") as y:
    full_data_yaml = yaml.safe_load(y)
full_data_yaml

{'names': {0: 'TJ',
  1: 'TA',
  2: 'S',
  3: 'G',
  4: 'Healthy',
  5: 'Others',
  6: 'Difficult'},
 'train': '../all_train/images',
 'val': '../all_train/images'}

In [5]:
def read_image_paths(txt_file):
    with open(txt_file, 'r') as f:
        image_paths = [line.strip() for line in f.readlines()]
    return image_paths

def kfold_split(image_paths, kf):
    folds = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(image_paths)):
        train_paths = ['' + image_paths[i] for i in train_idx]
        val_paths = ['' + image_paths[i] for i in val_idx]
        folds.append((train_paths, val_paths))
        print(f"Fold {fold + 1}:")
        print("Train paths:", train_paths)
        print("Val paths:", val_paths)
        print("len train:", len(train_paths))
        print("len val:", len(val_paths))
        print("=" * 50)
    return folds

In [6]:
image_path = read_image_paths(ALL_IMG_PATH)
folds = kfold_split(image_path, kf)

Fold 1:
Train paths: ['datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/050Overlay007.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/050Overlay008.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/051Overlay002.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/051Overlay003.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/051Overlay004.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/055.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/064.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/065.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/067.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/068.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/069.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_train/images/070.jpg', 'datasets/pa3_malaria_7_classes_8_folds_temp1/all_tr

In [7]:
def create_img_folds_yaml(fold_foulder, folds, full_data_yaml):
    for fold, (train_paths, val_paths) in enumerate(folds):
        fold_foulder_path = os.path.join(fold_foulder, f"fold_{fold}")
        os.makedirs(fold_foulder_path, exist_ok=True)

        data_yaml = full_data_yaml.copy()
        with open(os.path.join(fold_foulder_path, f"train_{fold}.txt"), "w") as f:
            for path in train_paths:
                f.write(path + "\n")
        
        with open(os.path.join(fold_foulder_path, f"val_{fold}.txt"), "w") as f:
            for path in val_paths:
                f.write(path + "\n")
        
        data_yaml["train"] = f"train_{fold}.txt"
        data_yaml["val"] = f"val_{fold}.txt"
        # data_yaml["path"] = f"datasets/pa3_malaria_7_classes_5_folds"
        data_yaml.pop("test", None)
        with open(os.path.join(fold_foulder_path, f"data_{fold}.yaml"), "w") as f:
            yaml.dump(data_yaml, f)

create_img_folds_yaml(FOLDS_FOULDERS, folds, full_data_yaml)