In [2]:
import os
import pandas as pd
from pathlib import Path
import pickle
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
# Set working directory to the parent of the current folder (the project root).
# The move is guarded so the cell is safe to re-run: the unguarded
# `os.chdir(os.path.dirname(os.getcwd()))` climbed one directory higher on
# every execution. The guard keys on the `src` directory, which the local
# imports in the next cell need to resolve.
# NOTE(review): assumes `src/` lives in the project root and not next to this
# notebook — confirm.
if not os.path.isdir("src"):
    os.chdir(os.path.dirname(os.getcwd()))
os.getcwd()

'/home/ubuntu/diabetes_detection'

In [4]:
# Import local libraries
from src.logger import set_logger as sl
from src.link_files import link_files

In [8]:
# Load t2d prs table for top 1% and for all participants.
# The separator is a regex, so it is written as a raw string: "\s" inside a
# normal string literal is an invalid escape sequence and triggers a warning
# on recent Python versions.
prs_top1 = pd.read_table("data/T2D_PMID_30297969.all_score_top_one", sep=r"\s+")
prs_all = pd.read_table("data/T2D_PMID_30297969.all_score", sep=r"\s+")

Unnamed: 0,FID,IID,Pt_1
0,2869246,2869246,0.030623
1,4217449,4217449,0.033024
2,2141632,2141632,0.034242
3,2056745,2056745,0.03212
4,1101678,1101678,0.031941


In [9]:
# Display the top-1% PRS table (the rendered output elides the middle rows).
prs_top1

Unnamed: 0,FID,IID,Pt_1,T2D
0,5150517,5150517,7.404651,0
1,5884606,5884606,6.066198,0
2,1033089,1033089,5.800283,0
3,2865096,2865096,5.707216,0
4,1496831,1496831,5.703456,1
...,...,...,...,...
4869,5020936,5020936,2.390291,0
4870,2680115,2680115,2.390255,0
4871,3402320,3402320,2.390149,1
4872,3425981,3425981,2.390128,0


In [5]:
# Load the pickled collection of fundus image filenames for T2D patients.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open("data/pickles/all-t2d.pkl", "rb") as f:
    all_t2d = pickle.load(f)

# Every filename is split on "_" into four fields, one column per field;
# the id column is kept as a string.
fun_t2d = pd.DataFrame(
    [filename.split("_") for filename in all_t2d],
    columns=["id", "eye", "take", "replica"],
).astype({"id": str})

print(f"Fundus images for patients with T2D: {len(fun_t2d)}")

Fundus images for patients with T2D: 7411


In [6]:
# IIDs (as strings) of the top-1% PRS patients that carry a T2D diagnosis
prs_top1_t2d_id = prs_top1.loc[prs_top1["T2D"] == 1, "IID"].astype(str).to_numpy()
print(f"Patient with PRS on the 1st percentile and confirmed T2D: {len(prs_top1_t2d_id)}")

Patient with PRS on the 1st percentile and confirmed T2D: 682


In [7]:
# Keep only the top-1% rows with a T2D diagnosis, then pull out their IIDs
# as a frame (reset_index keeps the old row labels as an "index" column).
prs_t2d = prs_top1.loc[prs_top1["T2D"] == 1]
prs_t2d_iid = prs_t2d["IID"].reset_index()

In [8]:
# Split the T2D fundus images by whether the patient id appears among the
# top-1% PRS patients with T2D; the membership mask is computed once.
in_top1 = fun_t2d["id"].isin(prs_t2d_iid["IID"].astype(str).to_numpy())
fun_highrisk_t2d = fun_t2d.loc[in_top1]
fun_lowrisk_t2d = fun_t2d.loc[~in_top1]
for subset in (fun_highrisk_t2d, fun_lowrisk_t2d):
    print(len(subset))

212
7199


In [9]:
# Select randomly from the fundus images with moderate/low risk.
# train_test_split is used purely as a seeded sampler: 450 rows are kept and
# the remainder is discarded. The first element is taken by index instead of
# unpacking into `_`, because binding `_` in a notebook shadows IPython's
# "last output" variable for the rest of the session.
selected = train_test_split(fun_lowrisk_t2d, train_size=450, random_state=42)[0]
print(len(selected))

450


In [10]:
# Stack the high-risk rows on top of the randomly selected moderate/low-risk
# rows; original row labels are kept.
tograde = pd.concat(objs=[fun_highrisk_t2d, selected], axis=0)
tograde

Unnamed: 0,id,eye,take,replica
11,3749963,21016,0,0.png
13,5023033,21015,0,0.png
16,4446007,21015,0,0.png
37,1810308,21016,0,0.png
55,2814310,21015,0,0.png
...,...,...,...,...
3877,5483296,21015,0,0.png
5346,1197618,21016,0,0.png
5382,1340232,21015,0,0.png
5551,1648394,21016,1,0.png


In [11]:
# Join the four filename fields of every row back into one
# "<id>_<eye>_<take>_<replica>" string.
# The Series is rebuilt from the two source frames instead of from `tograde`
# itself: the self-referential `tograde = tograde.apply(..., axis=1)` replaced
# the DataFrame with a Series, so executing the cell a second time failed
# (a Series has no row-wise `axis=1` apply). This form is safe to re-run.
tograde = pd.concat([fun_highrisk_t2d, selected]).apply(
    lambda row: "_".join(row.values), axis=1
)
tograde

11      3749963_21016_0_0.png
13      5023033_21015_0_0.png
16      4446007_21015_0_0.png
37      1810308_21016_0_0.png
55      2814310_21015_0_0.png
                ...          
3877    5483296_21015_0_0.png
5346    1197618_21016_0_0.png
5382    1340232_21015_0_0.png
5551    1648394_21016_1_0.png
891     1533538_21016_0_0.png
Length: 662, dtype: object

In [12]:
# Persist the filenames selected for grading.
# NOTE(review): no index/header arguments are passed, so pandas' defaults apply
# (row labels and a header line are written too) — confirm that is intended.
tograde.to_csv("data/t2d_2grade.txt")

In [13]:
# Link fundus files to /data folder
# NOTE(review): link_files comes from src.link_files and its body is not
# visible here; presumably it links each listed filename from the first
# directory into the second — confirm against the helper.
link_files("/mnt/ukbb/raw/", "data/2grade/", tograde.values)

100%|██████████| 662/662 [00:00<00:00, 66097.63it/s]


# After Cleaning

In [70]:
# The images were manually cleaned down to a total of 500; load that curated
# filename list (headerless file, filenames end up in column 0).
tograde_clean = pd.read_csv("data/2grade_clean.txt", header=None)

# Break each filename into its "_"-separated fields, with the id kept as a string
fun_2grade_clean = pd.DataFrame(
    [filename.split("_") for filename in tograde_clean[0].values],
    columns=["id", "eye", "take", "replica"],
).astype({"id": str})

In [72]:
# Link fundus files to /data folder
# A FileExistsError is treated as "links were already created" and reported
# instead of failing the notebook.
# NOTE(review): the recorded run stops at 0/500 before printing the message, so
# the first existing link appears to abort the whole call; if the target folder
# is only partially populated, the remaining files would not be linked — confirm
# how link_files behaves.
try:
    link_files("/mnt/ukbb/raw/", "data/2grade_clean/t2d/", tograde_clean[0].values)
except FileExistsError:
    print("No problem, already there")

  0%|          | 0/500 [00:00<?, ?it/s]

No problem, already there





In [73]:
# Unique patient ids of the high-risk fundus images, for fast membership tests
fun_highrisk_t2d_set = set(fun_highrisk_t2d["id"].tolist())

In [74]:
# Label for high-risk of T2D on clean data: 1 when the image's patient id is
# in the high-risk id set, 0 otherwise.
# A vectorised membership test replaces the row-wise apply with a Python
# lambda; it yields the same 0/1 integer Series on the same index and, unlike
# apply(axis=1), still returns a Series when the frame is empty.
prs_highrisk = fun_2grade_clean["id"].isin(fun_highrisk_t2d_set).astype(int)
tograde_clean["highrisk_prs"] = prs_highrisk
tograde_clean

Unnamed: 0,0,highrisk_prs
0,1013324_21015_0_0.png,0
1,1016734_21015_0_0.png,0
2,1025300_21016_0_0.png,0
3,1029035_21015_0_0.png,0
4,1031736_21015_1_0.png,1
...,...,...
495,5945604_21015_1_0.png,0
496,5945720_21016_0_0.png,0
497,5951754_21016_0_0.png,0
498,5979014_21015_0_0.png,0


In [77]:
# Save file with highrisk labels (filename column 0 plus the highrisk_prs flag).
# NOTE(review): no index/header arguments are passed, so pandas' defaults apply
# (row labels and a header line are written too) — confirm that is intended.
tograde_clean.to_csv("data/prs_labels.txt")

# Test images with CNN