# Preprocessing

In [1]:
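# Requirements (third-party packages imported or needed by this notebook):
#   opendatasets, pandas, openpyxl (Excel reader used by pandas), numpy, scikit-learn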

import opendatasets as od
import pandas as pd
import zipfile
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Folder that receives the downloaded Kaggle dataset.
DOWNLOAD_DIR = 'Datasets'
# Folder containing the annotation sheet; the trailing slash is kept because
# file names are appended to it by plain string concatenation.
DATASET_LOC = f"{DOWNLOAD_DIR}/ocular-disease-recognition-odir5k/ODIR-5K/ODIR-5k/"

## Downloading the Dataset

In [2]:
# Fetch the dataset into DOWNLOAD_DIR; with force=False an existing
# download is reused instead of being fetched again.
odir_url = 'https://www.kaggle.com/datasets/andrewmvd/ocular-disease-recognition-odir5k'
od.download_kaggle_dataset(odir_url, data_dir=DOWNLOAD_DIR, force=False)

Skipping, found downloaded files in "Datasets\ocular-disease-recognition-odir5k" (use force=True to force download)


## Importing the dataset

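The ODIR-5K dataset contains fundus photographs from roughly 5,000 patients. Each row of `data.xlsx` describes one patient: age, sex, the left- and right-eye image file names, per-eye diagnostic keywords, and eight binary label columns (`N`, `D`, `G`, `C`, `A`, `H`, `M`, `O`).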

In [3]:
# Load the annotation sheet, using its 'ID' column as the row index.
annotations_path = DATASET_LOC + 'data.xlsx'
df = pd.read_excel(annotations_path, index_col='ID')
df.head()

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1


In [4]:
# Number of rows flagged with each of the eight label columns.
label_columns = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']
df[label_columns].sum()

N    1140
D    1128
G     215
C     212
A     164
H     103
M     174
O     979
dtype: int64

In [5]:
# Rows whose 'D' flag is set.
df.loc[df['D'] == 1]

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
6,60,Male,6_left.jpg,6_right.jpg,macular epiretinal membrane,moderate non proliferative retinopathy，epireti...,0,1,0,0,0,0,0,1
7,60,Female,7_left.jpg,7_right.jpg,drusen,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4683,58,Male,4683_left.jpg,4683_right.jpg,normal fundus,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0
4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0
4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0


In [6]:
#extracting left and right indices for diabetic retinopathy
left_right_ids = df[df.D == 1][['Left-Diagnostic Keywords', 'Right-Diagnostic Keywords']]
left_right_ids['Left-Diagnostic Keywords'] = left_right_ids['Left-Diagnostic Keywords'].str.extract(r".*(retinopathy).*", re.IGNORECASE)
left_right_ids['Right-Diagnostic Keywords'] = left_right_ids['Right-Diagnostic Keywords'].str.extract(r".*(retinopathy).*", re.IGNORECASE)

left = left_right_ids['Left-Diagnostic Keywords']
right = left_right_ids['Right-Diagnostic Keywords']

left_idx = left[~left.isna()].index
right_idx = right[~right.isna()].index

In [7]:
# Look up the full rows for the IDs selected per eye.
left_diabetes, right_diabetes = (df.loc[ids] for ids in (left_idx, right_idx))

In [8]:
# Sanity check: every selected row carries the D flag.
left_diabetes['D'].eq(1).all(), right_diabetes['D'].eq(1).all()

(True, True)

In [9]:
# Build one row per selected eye with a common schema:
#   ID, Patient Age, Patient Sex, Image, Diagnostic Keywords, Eye
#
# Each frame is rebuilt from df and the per-eye index rather than being
# reassigned from itself and renamed in place, so the cell can be re-run
# without a KeyError on the already-renamed columns, and no column is
# assigned on a slice (avoids SettingWithCopyWarning).
left_diabetes = (
    df.loc[left_idx, ['Patient Age', 'Patient Sex', 'Left-Fundus', 'Left-Diagnostic Keywords']]
    .rename(columns={"Left-Fundus": "Image", "Left-Diagnostic Keywords": "Diagnostic Keywords"})
    .assign(Eye='left')
    .reset_index(drop=False)  # keep the index as a regular 'ID' column
)

right_diabetes = (
    df.loc[right_idx, ['Patient Age', 'Patient Sex', 'Right-Fundus', 'Right-Diagnostic Keywords']]
    .rename(columns={"Right-Fundus": "Image", "Right-Diagnostic Keywords": "Diagnostic Keywords"})
    .assign(Eye='right')
    .reset_index(drop=False)  # keep the index as a regular 'ID' column
)


In [10]:
# Stack left-eye rows on top of right-eye rows with a fresh 0..n-1 index,
# then persist the table next to the source data.
diabetes = pd.concat([left_diabetes, right_diabetes], ignore_index=True)
diabetic_csv_path = DATASET_LOC + "diabetic_images.csv"
diabetes.to_csv(diabetic_csv_path, index=False)

diabetes

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye
0,2,42,Male,2_left.jpg,laser spot，moderate non proliferative retinopathy,left
1,5,50,Female,5_left.jpg,moderate non proliferative retinopathy,left
2,11,60,Female,11_left.jpg,moderate non proliferative retinopathy，hyperte...,left
3,19,45,Male,19_left.jpg,mild nonproliferative retinopathy,left
4,22,55,Female,22_left.jpg,moderate non proliferative retinopathy，laser spot,left
...,...,...,...,...,...,...
1815,4682,45,Male,4682_right.jpg,moderate non proliferative retinopathy,right
1816,4683,58,Male,4683_right.jpg,mild nonproliferative retinopathy,right
1817,4686,63,Male,4686_right.jpg,proliferative diabetic retinopathy,right
1818,4688,42,Male,4688_right.jpg,moderate non proliferative retinopathy,right


## Normal eyes



In [14]:
# Preview the rows flagged normal (N == 1). The filter is written out here
# because `normal` is only assigned in a later cell; a bare `normal` at this
# point raises NameError on a fresh kernel.
df[df['N'] == 1]

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
8,59,Male,8_left.jpg,8_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
84,51,Female,84_left.jpg,84_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
191,51,Female,191_left.jpg,191_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
394,63,Male,394_left.jpg,394_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3485,70,Male,3485_left.jpg,3485_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
4149,55,Male,4149_left.jpg,4149_right.jpg,low image quality,low image quality,1,0,0,0,0,0,0,0
4290,51,Male,4290_left.jpg,4290_right.jpg,low image quality,normal fundus,1,0,0,0,0,0,0,0
4571,51,Male,4571_left.jpg,4571_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0


In [21]:
# Rows flagged normal (N == 1); both eyes of each such row are exported.
normal = df.loc[df['N'] == 1]

# Left eyes: move ID into a column, unify the column names, tag the eye.
left_normal = (
    normal[['Patient Age', 'Patient Sex', 'Left-Fundus', 'Left-Diagnostic Keywords']]
    .reset_index()
    .rename(columns={"Left-Fundus": "Image",
                     "Left-Diagnostic Keywords": "Diagnostic Keywords"})
    .assign(Eye='left')
)

# Right eyes: same shape as the left-eye frame.
right_normal = (
    normal[['Patient Age', 'Patient Sex', 'Right-Fundus', 'Right-Diagnostic Keywords']]
    .reset_index()
    .rename(columns={"Right-Fundus": "Image",
                     "Right-Diagnostic Keywords": "Diagnostic Keywords"})
    .assign(Eye='right')
)

# From here on `normal` is the per-eye table (left rows first, then right).
normal = pd.concat([left_normal, right_normal], ignore_index=True)
normal

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye
0,1,57,Male,1_left.jpg,normal fundus,left
1,8,59,Male,8_left.jpg,normal fundus,left
2,84,51,Female,84_left.jpg,normal fundus,left
3,191,51,Female,191_left.jpg,normal fundus,left
4,394,63,Male,394_left.jpg,normal fundus,left
...,...,...,...,...,...,...
2275,3485,70,Male,3485_right.jpg,normal fundus,right
2276,4149,55,Male,4149_right.jpg,low image quality,right
2277,4290,51,Male,4290_right.jpg,normal fundus,right
2278,4571,51,Male,4571_right.jpg,normal fundus,right


In [22]:
# Frequency of each keyword string among the rows exported as normal.
normal.loc[:, 'Diagnostic Keywords'].value_counts()

normal fundus                    2056
lens dust，normal fundus           174
normal fundus，lens dust            43
low image quality                   3
normal fundus，normal fundus         2
lens dust，lens dust，lens dust       1
lens dust，lens dust                 1
Name: Diagnostic Keywords, dtype: int64

In [24]:
# Persist the per-eye normal table next to the source data, without the row index.
normal_csv_path = DATASET_LOC + "normal_images.csv"
normal.to_csv(normal_csv_path, index=False)