In [1]:
import os
from os.path import abspath
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Set working directory
os.chdir(os.path.dirname(os.getcwd()))
print(os.getcwd())

/home/ubuntu/master_thesis


In [3]:
# Load libraries
ukbb = pd.read_csv("./data/ukbb_metadata_qc.csv")
ukbb.head()

Unnamed: 0,file,sex,patient_id,eye_side,visit,replica,cleaned
0,1000180_21015_0_0.png,female,1000180,left,0,0,True
1,1000180_21016_0_0.png,female,1000180,right,0,0,True
2,1000303_21015_1_0.png,female,1000303,left,1,0,True
3,1000303_21016_1_0.png,female,1000303,right,1,0,False
4,1000390_21015_1_0.png,male,1000390,left,1,0,True


In [4]:
# Keep only images with reaasonable quality
clean = ukbb[ukbb["cleaned"]]
clean.head()

Unnamed: 0,file,sex,patient_id,eye_side,visit,replica,cleaned
0,1000180_21015_0_0.png,female,1000180,left,0,0,True
1,1000180_21016_0_0.png,female,1000180,right,0,0,True
2,1000303_21015_1_0.png,female,1000303,left,1,0,True
4,1000390_21015_1_0.png,male,1000390,left,1,0,True
5,1000390_21016_1_0.png,male,1000390,right,1,0,True


In [5]:
# Get female/male distribution
print(sum(clean.sex.value_counts()))
print(clean.sex.value_counts())
print(clean.sex.value_counts()/clean.shape[0])

154931
female    84021
male      70910
Name: sex, dtype: int64
female    0.542312
male      0.457688
Name: sex, dtype: float64


In [6]:
# Get 60,000 females and 60,000 males (120, 000) Fundus for training.
# This will be roughly the 77.45% of the cleaned data

females = clean[clean.sex == "female"]
males   = clean[clean.sex == "male"]

print(females.shape)
print(males.shape)

train_females, valtest_females = train_test_split(females, train_size=60000, random_state=42)
train_males, valtest_males = train_test_split(males, train_size=60000, random_state=42)

print(train_females.shape)
print(train_males.shape)

# Concatenate females and males
train = pd.concat([train_females, train_males])
train = train.sort_values(by="file")

print(train.shape)
print(train.shape[0]/clean.shape[0])

(84021, 7)
(70910, 7)
(60000, 7)
(60000, 7)
(120000, 7)
0.7745383428752154


In [7]:
# The rest of the data will be splitted 50/50 for validation and testing.
# This will account for 22.54% of the cleaned data.
print(valtest_females.shape)
print(valtest_males.shape)

(24021, 7)
(10910, 7)


In [8]:
# We are going to ignore some female images to keep the female to male ratio 50/50
print(valtest_females.shape[0]-valtest_females.sample(n=valtest_males.shape[0], random_state=42).shape[0])
print((valtest_females.shape[0]-valtest_females.sample(n=valtest_males.shape[0], random_state=42).shape[0])/clean.shape[0])

13111
0.0846247684453079


In [9]:
# Select equal ammounts of females and males for the validation and test 
# This way the ratio will be 50/50 female to male
valtest = pd.concat([valtest_females.sample(n=valtest_males.shape[0], random_state=42), valtest_males])
valtest = valtest.sort_values(by="file")
print(valtest.shape)
print(valtest.shape[0]/clean.shape[0])

(21820, 7)
0.14083688867947666


In [10]:
# Female/Male ratio on validation-test 
print(valtest.sex.value_counts())
print(valtest.sex.value_counts()/valtest.shape[0])

female    10910
male      10910
Name: sex, dtype: int64
female    0.5
male      0.5
Name: sex, dtype: float64


In [11]:
# Split validation-test into validation and test
vald, test = train_test_split(valtest, train_size=.50, random_state=42)
print(vald.shape)
print(vald.shape[0]/clean.shape[0])
print(test.shape)
print(test.shape[0]/clean.shape[0])

(10910, 7)
0.07041844433973833
(10910, 7)
0.07041844433973833


In [12]:
# The female/male ratio on the validation and test set 
print(vald.sex.value_counts())
print(vald.sex.value_counts()/vald.shape[0])

print(test.sex.value_counts())
print(test.sex.value_counts()/test.shape[0])

female    5495
male      5415
Name: sex, dtype: int64
female    0.503666
male      0.496334
Name: sex, dtype: float64
male      5495
female    5415
Name: sex, dtype: int64
male      0.503666
female    0.496334
Name: sex, dtype: float64


In [13]:
# Save train-validation-test subsets as csv 
train.to_csv("./data/ukbb_metadata_qc_train.csv")
vald.to_csv("./data/ukbb_metadata_qc_validation.csv")
test.to_csv("./data/ukbb_metadata_qc_test.csv")