## Knee Joint Segmentation Dataset Setup

In [9]:
import numpy as np
import pandas as pd
import regex as re
import os
import shutil

import yaml
from zipfile import ZipFile

In [12]:
# Unzip Dataset
data_folder = "ISVC_Segmentation"
DATA = "/home/datascience/ISVC_2023_Tutorial_XAI_Few_Shot_on_the_Cloud/SampleDataset(s)/"

In [13]:
if not os.path.exists(os.path.join(DATA, data_folder)):
    with ZipFile(f"{DATA}/{data_folder}.zip", 'r') as zipf:
        zipf.extractall(f"{DATA}")

In [14]:
from sklearn.model_selection import train_test_split
def generate_datasets(root_dir, image_dir, annot_dir,  task="localization"):
    img_dir = os.path.join(root_dir, image_dir)
    annot_dir = os.path.join(root_dir, annot_dir)
    
    # PID regex
    pid_reg = re.compile("9[0-9]{6}")
    
    # Image names + annotation names match
    records = [[pid_reg.findall(img)[-1], img, img if task != "localization" else img.replace(".jpg", ".xml")] 
               for img in os.listdir(img_dir) if ".DS_Store" not in img and ".ipynb_checkpoints" not in img]

    data_records = pd.DataFrame(records, columns=["pid", "images", "masks"])

    train, test = train_test_split(data_records.pid.unique(), test_size=0.5, random_state=42)
    valid, test = train_test_split(test, test_size=0.5, random_state=42)

    train = data_records[data_records.pid.isin(train)].reset_index(drop=True)
    valid = data_records[data_records.pid.isin(valid)].reset_index(drop=True)
    test = data_records[data_records.pid.isin(test)].reset_index(drop=True)

    return train, valid, test

In [30]:
train, valid, test = generate_datasets(f"{DATA}/{data_folder}", "Images", "Labels", task="segmentation")

In [31]:
train

Unnamed: 0,pid,images,masks
0,9988891,9988891R.png,9988891R.png
1,9975778,9975778R.png,9975778R.png
2,9999510,9999510L.png,9999510L.png
3,9975778,9975778L.png,9975778L.png
4,9993261,9993261L.png,9993261L.png
5,9978813,9978813L.png,9978813L.png
6,9996345,9996345R.png,9996345R.png
7,9989700,9989700R.png,9989700R.png
8,9989700,9989700L.png,9989700L.png
9,9983798,9983798R.png,9983798R.png


In [17]:
def get_few_shot_sample(dataset, k=1, random_state=42):
    if k > len(dataset):
        return dataset

    return dataset.sample(k, random_state=random_state).reset_index(drop=True)

In [18]:
train_few = get_few_shot_sample(train, k=10)
valid_few = get_few_shot_sample(valid, k=10)

In [40]:
few_shot_dir = "/home/datascience/ISVC_2023_Tutorial_XAI_Few_Shot_on_the_Cloud/SampleDataset(s)/Few_Shot/Segmentation"
shot = 10
if not os.path.exists(few_shot_dir):
    os.makedirs(few_shot_dir)

In [41]:
os.makedirs(f"{few_shot_dir}/{shot}-shot/train/images")
os.makedirs(f"{few_shot_dir}/{shot}-shot/train/labels")

os.makedirs(f"{few_shot_dir}/{shot}-shot/valid/images")
os.makedirs(f"{few_shot_dir}/{shot}-shot/valid/labels")

os.makedirs(f"{few_shot_dir}/{shot}-shot/test/images")
os.makedirs(f"{few_shot_dir}/{shot}-shot/test/labels")


In [42]:
for pid in train_few.images:
    shutil.copy(f"{DATA}/{data_folder}/Images/{pid}", f"{few_shot_dir}/{shot}-shot/train/images/{pid}")
    shutil.copy(f"{DATA}/{data_folder}/Annotations/{pid}", f"{few_shot_dir}/{shot}-shot/train/labels/{pid}")

In [43]:
for pid in valid_few.images:
    shutil.copy(f"{DATA}/{data_folder}/Images/{pid}", f"{few_shot_dir}/{shot}-shot/valid/images/{pid}")
    shutil.copy(f"{DATA}/{data_folder}/Annotations/{pid}", f"{few_shot_dir}/{shot}-shot/valid/labels/{pid}")

In [44]:
for pid in test.images:
    shutil.copy(f"{DATA}/{data_folder}/Images/{pid}", f"{few_shot_dir}/{shot}-shot/test/images/{pid}")
    shutil.copy(f"{DATA}/{data_folder}/Annotations/{pid}", f"{few_shot_dir}/{shot}-shot/test/labels/{pid}")