In [1]:
# Imports
import os
import pickle
import pandas as pd
from tqdm import tqdm

In [2]:
# Set working directory to the parent of the directory the kernel started in.
# Later cells import from `src` and read "./data/..." relative to that parent.
# The start directory is remembered the first time this cell runs, so
# re-running the cell does not climb one more level up on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))
print(os.getcwd())

/home/ubuntu/master_thesis


In [3]:
# Load local libraries
from src.get_info_from_name import get_info_from_name

# DATA by Image

In [4]:
# List every entry of the raw image directory and report how many there are.
print("Total number of images")
all_im = os.listdir("/mnt/ukbb/raw")
image_count = len(all_im)
print(image_count)

Total number of images
174986


In [5]:
# Load the object stored in all-females.pkl and report its length, both as an
# absolute number and relative to the number of entries in the raw image directory.
# Presumably this is the collection of image names labelled female — verify
# against the code that wrote the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/home/ubuntu/master_thesis/data/pickles/all-females.pkl", "rb") as pickle_file:
    females = pickle.load(pickle_file)
print("Total number of images for females")
female_image_count = len(females)
print(female_image_count)
print(female_image_count/len(all_im))

Total number of images for females
93753
0.5357742905146697


In [6]:
# Load the object stored in all-males.pkl and report its length, both as an
# absolute number and relative to the number of entries in the raw image directory.
# Presumably this is the collection of image names labelled male — verify
# against the code that wrote the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/home/ubuntu/master_thesis/data/pickles/all-males.pkl", "rb") as pickle_file:
    males = pickle.load(pickle_file)
print("Total number of images for males")
male_image_count = len(males)
print(male_image_count)
print(male_image_count/len(all_im))

Total number of images for males
81233
0.46422570948533026


In [7]:
# Sanity check: the female and male collections together account for every image
len(females) + len(males) == len(all_im)

True

# DATA by Patient

In [8]:
# Load the UK Biobank table (columns `file` and `sex`) and preview its first rows
ukbb_frame = pd.read_csv("./data/fundus_sex.csv")
ukbb_frame.head(n=5)

Unnamed: 0,file,sex
0,2159201_21015_0_0.png,male
1,1076073_21016_0_0.png,male
2,5110590_21015_0_0.png,male
3,1662118_21015_0_0.png,male
4,5300490_21016_0_0.png,male


In [9]:
# Derive per-image metadata from each file name with the project helper.
# The helper is passed to `apply` directly; wrapping it in a lambda that only
# forwards its argument adds an extra call per row without changing the result.
# Judging by the preview, the helper yields patient_id, eye_side, visit and replica.
ukbb_info = ukbb_frame["file"].apply(get_info_from_name)
ukbb_info.head()

Unnamed: 0,patient_id,eye_side,visit,replica
0,2159201,left,0,0
1,1076073,right,0,0
2,5110590,left,0,0
3,1662118,left,0,0
4,5300490,right,0,0


In [10]:
# Put the original table and the parsed file-name metadata side by side
# (column-wise concatenation, aligned on the shared row index).
ukbb_all = pd.concat((ukbb_frame, ukbb_info), axis="columns")
ukbb_all.head()

Unnamed: 0,file,sex,patient_id,eye_side,visit,replica
0,2159201_21015_0_0.png,male,2159201,left,0,0
1,1076073_21016_0_0.png,male,1076073,right,0,0
2,5110590_21015_0_0.png,male,5110590,left,0,0
3,1662118_21015_0_0.png,male,1662118,left,0,0
4,5300490_21016_0_0.png,male,5300490,right,0,0


In [11]:
# Order the combined table by file name and persist it (without the row index)
ukbb_all_sorted = ukbb_all.sort_values("file")
ukbb_all_sorted.to_csv("./data/ukbb_metadata.csv", index=False)

In [20]:
# Let's check how the data is split between the left and right eye:
# absolute counts first, then each side's share of all rows.
eye_side_counts = ukbb_all_sorted.eye_side.value_counts()
print(eye_side_counts)
# Divide by the frame's own row count rather than by the directory listing
# (`all_im`), so the shares stay correct even if the two ever differ in size.
print(eye_side_counts/len(ukbb_all_sorted))

right    88156
left     86830
Name: eye_side, dtype: int64
right    0.503789
left     0.496211
Name: eye_side, dtype: float64


In [24]:
# Check how the data is split between females and males:
# absolute counts first, then each group's share of all rows.
sex_counts = ukbb_all_sorted.sex.value_counts()
print(sex_counts)
# Divide by the frame's own row count rather than by the directory listing
# (`all_im`), so the shares stay correct even if the two ever differ in size.
print(sex_counts/len(ukbb_all_sorted))

female    93753
male      81233
Name: sex, dtype: int64
female    0.535774
male      0.464226
Name: sex, dtype: float64


In [38]:
# Total number of images, i.e. the number of rows in the `file` column
len(ukbb_all_sorted["file"])

174986

In [39]:
# Number of patients, i.e. the number of distinct values in `patient_id`
len(ukbb_all_sorted["patient_id"].unique())

85623

In [48]:
# Number of distinct female patients, then their share of all distinct patients
female_patient_ids = ukbb_all.loc[ukbb_all["sex"] == "female", "patient_id"].unique()
print(female_patient_ids.size)
print(female_patient_ids.size/ukbb_all_sorted.patient_id.unique().size)

45948
0.5366315125608774


In [49]:
# Number of distinct male patients, then their share of all distinct patients
male_patient_ids = ukbb_all.loc[ukbb_all["sex"] == "male", "patient_id"].unique()
print(male_patient_ids.size)
print(male_patient_ids.size/ukbb_all_sorted.patient_id.unique().size)

39675
0.4633684874391227
