In [1]:
import os
from os.path import abspath
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# ROOT contais the project directory 
os.chdir(os.path.dirname(os.getcwd()))
print(os.getcwd())

/home/ubuntu/diabetes_detection


# Split data between Diabetic and Non-diabetic

In [3]:
# Create a list with all the fundus images
fun_all = os.listdir("/mnt/ukbb/raw/")

In [4]:
# Create a set with all the ids of patients with type 2 diabetes (t2d)
with open(abspath("data/E11-id.txt"), "r") as FILE:
    ids_t2d = set([line.strip() for line in FILE])

In [5]:
# Get fundus images with t2d
fun_t2d = [f for f in fun_all if f[:-4].split("_")[0] in ids_t2d]

In [6]:
# Get ids of the t2d patients with fundus images
ids_t2d_wfun = set([f[:-4].split("_")[0] for f in fun_t2d])

In [7]:
print(f"all images:      {len(fun_all)}  {100*len(fun_all)/len(fun_all):.2f} %")
print(f"t2d patients:     {len(ids_t2d)}  {100*len(ids_t2d)/len(ids_t2d):.2f} %")
print(f"t2d images:        {len(fun_t2d)}    {100*len(fun_t2d)/len(fun_all):.2f} %")
print(f"t2d patients w/im: {len(ids_t2d_wfun)}   {100*len(ids_t2d_wfun)/len(ids_t2d):.2f} %")

all images:      174986  100.00 %
t2d patients:     24143  100.00 %
t2d images:        7411    4.24 %
t2d patients w/im: 3655   15.14 %


In [8]:
fun_nt2d = set(fun_all) -set(fun_t2d)
fun_nt2d = list(fun_nt2d)

print(len(set(fun_all)))
print(len(set(fun_t2d)))
print(len(set(fun_nt2d)))

174986
7411
167575


# Split data between Train/Test

In [9]:
# Splitting of t2d fundus images to a 80:20
fun_t2d_train, fun_t2d_test = train_test_split(fun_t2d, train_size=0.8, random_state=42)
print(f"td2 train: {len(fun_t2d_train)}")
print(f"td2 test:  {len(fun_t2d_test)}")

td2 train: 5928
td2 test:  1483


In [10]:
# Spliting of non-t2d fundus images on a ration 5:1  non-t2d:t2d
fun_nt2d_train, fun_nt2d_test = train_test_split(fun_nt2d, train_size=len(fun_t2d_train)*4, test_size=len(fun_t2d_test), random_state=42)
print(f"ntd2 train: {len(fun_nt2d_train)}")
print(f"ntd2 test:   {len(fun_nt2d_test)}")

ntd2 train: 23712
ntd2 test:   1483


In [11]:
# Further split the test set into validation/test
fun_t2d_validation, fun_t2d_test = train_test_split(fun_t2d_test, test_size=0.5, random_state=42)
fun_nt2d_validation, fun_nt2d_test = train_test_split(fun_nt2d_test, test_size=0.5, random_state=42)
print(f"ntd2 train: {len(fun_t2d_validation)}")
print(f"ntd2 test:   {len(fun_nt2d_validation)}")

ntd2 train: 741
ntd2 test:   741


# Save Data

In [12]:
# Save to pickle lists of id's for train, finetune and test
# T2D
with open("data/pickles/train-t2d.pkl", "wb") as f:
    pickle.dump(fun_t2d_train, f)

with open("data/pickles/validation-t2d.pkl", "wb") as f:
    pickle.dump(fun_t2d_validation, f)

with open("data/pickles/test-t2d.pkl", "wb") as f:
    pickle.dump(fun_t2d_test, f)

# Non T2D
with open("data/pickles/train-nont2d.pkl", "wb") as f:
    pickle.dump(fun_nt2d_train, f)

with open("data/pickles/validation-nont2d.pkl", "wb") as f:
    pickle.dump(fun_nt2d_validation, f)

with open("data/pickles/test-nont2d.pkl", "wb") as f:
    pickle.dump(fun_nt2d_test, f)

In [13]:
# Save also the list of all t2d and non t2d
with open("data/pickles/all-t2d.pkl", "wb") as f:
    pickle.dump(fun_t2d, f)

with open("data/pickles/all-nont2d.pkl", "wb") as f:
    pickle.dump(fun_nt2d, f)