# Preprocessing

In [94]:
#requirements:
# opendatasets, pandas, openpyxl

import opendatasets as od
import pandas as pd
import zipfile
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
DOWNLOAD_DIR = 'Datasets'
DATASET_LOC = DOWNLOAD_DIR+"/ocular-disease-recognition-odir5k/ODIR-5K/ODIR-5k/"

CSV_LOC = DOWNLOAD_DIR + '/ocular-disease-recognition-odir5k/'

TRAIN_RATIO = 0.7
TEST_RATIO = 0.15
VAL_RATIO = (1 - TRAIN_RATIO - TEST_RATIO)/TRAIN_RATIO

RANDOM_SEED = 123456

## Downloading the Dataset

In [95]:
od.download_kaggle_dataset('https://www.kaggle.com/datasets/andrewmvd/ocular-disease-recognition-odir5k', data_dir = DOWNLOAD_DIR, force = False)

Skipping, found downloaded files in "Datasets\ocular-disease-recognition-odir5k" (use force=True to force download)


## Importing the dataset

The dataset has 5000 fundus images 

In [96]:
df = pd.read_excel(DATASET_LOC + 'data.xlsx', index_col = 'ID')
df.head()

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1


In [97]:
df[['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']].sum()

N    1140
D    1128
G     215
C     212
A     164
H     103
M     174
O     979
dtype: int64

## Extract per eye

In [98]:
# diabetes: (diabetic|non( |)proliferative retinopathy)
# normal : N == 1
# glaucoma : glaucoma in keywords
# cataract : cataract in keywords
# armd : age-related macular degeneration
# hypertension: hypertensive retinopathy
# m: myopia
# 
df_temp = df.reset_index(drop = False)
df_left = df_temp[['ID', 'Patient Age', 'Patient Sex', 'Left-Fundus', 'Left-Diagnostic Keywords']]
df_right = df_temp[['ID', 'Patient Age', 'Patient Sex', 'Right-Fundus', 'Right-Diagnostic Keywords']]

In [99]:
def process_keyword(description:str):
    '''Credit to Jordi Corbilla
    
    Link:
    https://github.com/JordiCorbilla/ocular-disease-intelligent-recognition-deep-learning/blob/master/odir_rule_engine.py
    
    Function to process each eye's labels separately.'''
    list_of_keywords = list(map(str.strip, description.lower().split(', ')))
    labs = dict.fromkeys(["N", "D", "G", "C", "A", "H", "M", "O", "NOT DECISIVE"], 0)
    # to discard image if it's not decisive
    for kw in list_of_keywords:
        if('normal fundus' in kw):
            labs['N'] = 1
        elif 'diabetic retinopathy' in kw or 'proliferative retinopathy' in kw:
            labs['D'] = 1
        elif 'glaucoma' in kw:
            labs['G'] = 1
        elif 'cataract' in kw:
            labs['C'] = 1
        elif 'macular degeneration' in kw:
            labs['A'] = 1
        elif 'hypertensive retinopathy' in kw:
            labs['H'] = 1
        elif "myopia" in kw:
            labs['M'] = 1
        else:
            if kw not in ['anterior segment image', 'no fundus image']:
                labs['O'] = 1
            if kw in ["lens dust", "optic disk photographically invisible", "low image quality", "image offset"]:
                labs['NOT DECISIVE'] = 1
    return labs

In [100]:
df_left = df_left.rename(columns = {"Left-Diagnostic Keywords": "Keywords", "Left-Fundus": "Image"})
df_left['eye'] = 'left'
df_right = df_right.rename(columns = {"Right-Diagnostic Keywords": "Keywords", "Right-Fundus": "Image"})
df_right['eye'] = 'right'
df_single_eye = pd.concat([df_left, df_right])
df_single_eye = df_single_eye.reset_index(drop = True)

pd.DataFrame(df_single_eye['Keywords'].apply(process_keyword).tolist())

df_single_eye['Keywords'] = df_single_eye['Keywords'].apply(process_keyword)

df_single_eye = pd.concat([df_single_eye.drop(columns='Keywords'), pd.DataFrame(df_single_eye['Keywords'].tolist())], axis = 1)
df_single_eye

Unnamed: 0,ID,Patient Age,Patient Sex,Image,eye,N,D,G,C,A,H,M,O,NOT DECISIVE
0,0,69,Female,0_left.jpg,left,0,0,0,1,0,0,0,0,0
1,1,57,Male,1_left.jpg,left,1,0,0,0,0,0,0,0,0
2,2,42,Male,2_left.jpg,left,0,1,0,0,0,0,0,0,0
3,3,66,Male,3_left.jpg,left,1,0,0,0,0,0,0,0,0
4,4,53,Male,4_left.jpg,left,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,4686,63,Male,4686_right.jpg,right,0,1,0,0,0,0,0,0,0
6996,4688,42,Male,4688_right.jpg,right,0,1,0,0,0,0,0,0,0
6997,4689,54,Male,4689_right.jpg,right,1,0,0,0,0,0,0,0,0
6998,4690,57,Male,4690_right.jpg,right,0,1,0,0,0,0,0,0,0


In [101]:
df_single_eye['NOT DECISIVE'].value_counts()

0    6975
1      25
Name: NOT DECISIVE, dtype: int64

In [105]:
# as per the code given by Jordi Corbilla, we blacklist these

# His explanation is as below
'''The background of the following images is quite different from the rest ones. They are fundus images
uploaded from the hospital: We are sure that these images are preprocessed. You can decide by yourself
whether or not to train these images in the model'''

blacklist = ['2174_right.jpg', '2175_left.jpg', '2176_left.jpg', '2177_left.jpg', '2177_right.jpg',
                     '2178_right.jpg', '2179_left.jpg', '2179_right.jpg', '2180_left.jpg', '2180_right.jpg',
                     '2181_left.jpg', '2181_right.jpg', '2182_left.jpg', '2182_right.jpg', '2957_left.jpg',
                     '2957_right.jpg']


df_single_eye.to_csv(CSV_LOC + "dataset_single_eye.csv", index = False)
df_blacklisted = df_single_eye[df_single_eye['Image'].isin(blacklist)]
df_blacklisted.to_csv(CSV_LOC + 'dataset_single_eye_blacklisted.csv', index = False)

In [5]:
df[df.D == 1]

Unnamed: 0_level_0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
6,60,Male,6_left.jpg,6_right.jpg,macular epiretinal membrane,moderate non proliferative retinopathy，epireti...,0,1,0,0,0,0,0,1
7,60,Female,7_left.jpg,7_right.jpg,drusen,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4683,58,Male,4683_left.jpg,4683_right.jpg,normal fundus,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0
4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0
4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0


In [6]:
#extracting left and right indices for diabetic retinopathy
left_right_ids = df[df.D == 1][['Left-Diagnostic Keywords', 'Right-Diagnostic Keywords']]
left_right_ids['Left-Diagnostic Keywords'] = left_right_ids['Left-Diagnostic Keywords'].str.extract(r".*(retinopathy).*", re.IGNORECASE)
left_right_ids['Right-Diagnostic Keywords'] = left_right_ids['Right-Diagnostic Keywords'].str.extract(r".*(retinopathy).*", re.IGNORECASE)

left = left_right_ids['Left-Diagnostic Keywords']
right = left_right_ids['Right-Diagnostic Keywords']

left_idx = left[~left.isna()].index
right_idx = right[~right.isna()].index

In [7]:
left_diabetes = df.loc[left_idx]
right_diabetes = df.loc[right_idx]

In [8]:
(left_diabetes['D'] == 1).all(), (right_diabetes['D'] == 1).all()

(True, True)

In [9]:
left_diabetes = left_diabetes[['Patient Age', 'Patient Sex', 'Left-Fundus', 'Left-Diagnostic Keywords']]
left_diabetes['Eye'] = 'left'
left_diabetes.rename(columns = {"Left-Fundus": "Image", "Left-Diagnostic Keywords": "Diagnostic Keywords"}, inplace = True)
left_diabetes.reset_index(inplace = True, drop = False)

right_diabetes = right_diabetes[['Patient Age', 'Patient Sex', 'Right-Fundus', 'Right-Diagnostic Keywords']]
right_diabetes['Eye'] = 'right'
right_diabetes.rename(columns = {"Right-Fundus": "Image", "Right-Diagnostic Keywords": "Diagnostic Keywords"}, inplace = True)
right_diabetes.reset_index(inplace = True, drop = False)


In [10]:
diabetes = pd.concat([left_diabetes, right_diabetes]).reset_index(drop = True)
diabetes['split'] = None

diabetes

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,2,42,Male,2_left.jpg,laser spot，moderate non proliferative retinopathy,left,
1,5,50,Female,5_left.jpg,moderate non proliferative retinopathy,left,
2,11,60,Female,11_left.jpg,moderate non proliferative retinopathy，hyperte...,left,
3,19,45,Male,19_left.jpg,mild nonproliferative retinopathy,left,
4,22,55,Female,22_left.jpg,moderate non proliferative retinopathy，laser spot,left,
...,...,...,...,...,...,...,...
1815,4682,45,Male,4682_right.jpg,moderate non proliferative retinopathy,right,
1816,4683,58,Male,4683_right.jpg,mild nonproliferative retinopathy,right,
1817,4686,63,Male,4686_right.jpg,proliferative diabetic retinopathy,right,
1818,4688,42,Male,4688_right.jpg,moderate non proliferative retinopathy,right,


In [11]:
## Splitting into test, train, val
diabetes_train, diabetes_test = train_test_split(diabetes, random_state = RANDOM_SEED, test_size = TEST_RATIO)
diabetes_train, diabetes_val = train_test_split(diabetes_train, random_state= RANDOM_SEED, test_size = VAL_RATIO)

train_ind = diabetes_train.index
test_ind = diabetes_test.index
val_ind = diabetes_val.index

In [12]:
diabetes.loc[train_ind, 'split'] = 'train'
diabetes.loc[test_ind, 'split'] = 'test'
diabetes.loc[val_ind, 'split'] = 'val'
diabetes.to_csv(CSV_LOC + "diabetes_leakage.csv", index = False)

diabetes

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,2,42,Male,2_left.jpg,laser spot，moderate non proliferative retinopathy,left,train
1,5,50,Female,5_left.jpg,moderate non proliferative retinopathy,left,val
2,11,60,Female,11_left.jpg,moderate non proliferative retinopathy，hyperte...,left,test
3,19,45,Male,19_left.jpg,mild nonproliferative retinopathy,left,train
4,22,55,Female,22_left.jpg,moderate non proliferative retinopathy，laser spot,left,val
...,...,...,...,...,...,...,...
1815,4682,45,Male,4682_right.jpg,moderate non proliferative retinopathy,right,train
1816,4683,58,Male,4683_right.jpg,mild nonproliferative retinopathy,right,test
1817,4686,63,Male,4686_right.jpg,proliferative diabetic retinopathy,right,train
1818,4688,42,Male,4688_right.jpg,moderate non proliferative retinopathy,right,train


### Dataset without leakage (no patients common in training and validation sets)

In [13]:
diabetes_patient = pd.merge(left_diabetes,right_diabetes, on = ['ID', 'Patient Age', 'Patient Sex'], how = 'outer', suffixes = ['_left', '_right'])
diabetes_patient

Unnamed: 0,ID,Patient Age,Patient Sex,Image_left,Diagnostic Keywords_left,Eye_left,Image_right,Diagnostic Keywords_right,Eye_right
0,2,42,Male,2_left.jpg,laser spot，moderate non proliferative retinopathy,left,2_right.jpg,moderate non proliferative retinopathy,right
1,5,50,Female,5_left.jpg,moderate non proliferative retinopathy,left,5_right.jpg,moderate non proliferative retinopathy,right
2,11,60,Female,11_left.jpg,moderate non proliferative retinopathy，hyperte...,left,11_right.jpg,moderate non proliferative retinopathy，hyperte...,right
3,19,45,Male,19_left.jpg,mild nonproliferative retinopathy,left,19_right.jpg,mild nonproliferative retinopathy,right
4,22,55,Female,22_left.jpg,moderate non proliferative retinopathy，laser spot,left,22_right.jpg,laser spot，moderate non proliferative retinopathy,right
...,...,...,...,...,...,...,...,...,...
1123,4625,70,Female,,,,4625_right.jpg,moderate non proliferative retinopathy,right
1124,4639,60,Female,,,,4639_right.jpg,mild nonproliferative retinopathy,right
1125,4641,56,Male,,,,4641_right.jpg,mild nonproliferative retinopathy,right
1126,4672,65,Male,,,,4672_right.jpg,mild nonproliferative retinopathy,right


In [14]:
diabetes_patient['present_vals'] = 2 - diabetes_patient.isnull().sum(axis = 1) / 3

In [15]:
both_eyes_diabetic = diabetes_patient[diabetes_patient.present_vals == 2]
one_eye_diabetic = diabetes_patient[diabetes_patient.present_vals == 1]

In [16]:
## Splitting into test, train, val
diabetes_train_both, diabetes_test_both = train_test_split(both_eyes_diabetic, random_state = RANDOM_SEED, test_size = TEST_RATIO)
diabetes_train_both, diabetes_val_both = train_test_split(diabetes_train_both, random_state= RANDOM_SEED, test_size = VAL_RATIO)

## Splitting into test, train, val
diabetes_train_one, diabetes_test_one = train_test_split(one_eye_diabetic, random_state = RANDOM_SEED, test_size = TEST_RATIO)
diabetes_train_one, diabetes_val_one = train_test_split(diabetes_train_one, random_state= RANDOM_SEED, test_size = VAL_RATIO)

In [17]:
print(len(diabetes_train_both), len(diabetes_test_both), len(diabetes_val_both))
print(len(diabetes_train_one), len(diabetes_test_one), len(diabetes_val_one))

461 104 127
290 66 80


In [18]:
# get IDs and then split in diabetes df
train_ids = np.concatenate([diabetes_train_both.ID.unique(), diabetes_train_one.ID.unique()])
test_ids  = np.concatenate([diabetes_test_both.ID.unique(), diabetes_test_one.ID.unique()])
val_ids   = np.concatenate([diabetes_val_both.ID.unique(), diabetes_val_one.ID.unique()])


In [19]:
# adding labels to the diabetes dataframe

train_ind = diabetes[diabetes['ID'].isin(train_ids)].index
test_ind = diabetes[diabetes['ID'].isin(test_ids)].index
val_ind  = diabetes[diabetes['ID'].isin(val_ids)].index

train_ind.__len__(), len(test_ind), len(val_ind)

(1212, 274, 334)

In [20]:
diabetes.loc[train_ind, 'split'] = 'train'
diabetes.loc[test_ind, 'split'] = 'test'
diabetes.loc[val_ind, 'split'] = 'val'

diabetes.to_csv(CSV_LOC + "diabetes_no_leakage.csv", index = False)

diabetes

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,2,42,Male,2_left.jpg,laser spot，moderate non proliferative retinopathy,left,train
1,5,50,Female,5_left.jpg,moderate non proliferative retinopathy,left,train
2,11,60,Female,11_left.jpg,moderate non proliferative retinopathy，hyperte...,left,train
3,19,45,Male,19_left.jpg,mild nonproliferative retinopathy,left,train
4,22,55,Female,22_left.jpg,moderate non proliferative retinopathy，laser spot,left,val
...,...,...,...,...,...,...,...
1815,4682,45,Male,4682_right.jpg,moderate non proliferative retinopathy,right,train
1816,4683,58,Male,4683_right.jpg,mild nonproliferative retinopathy,right,train
1817,4686,63,Male,4686_right.jpg,proliferative diabetic retinopathy,right,train
1818,4688,42,Male,4688_right.jpg,moderate non proliferative retinopathy,right,test


## Normal eyes



In [21]:
normal = df[df['N'] == 1]
left_normal = normal[['Patient Age','Patient Sex', 
                      'Left-Fundus', 
                      'Left-Diagnostic Keywords']].reset_index().rename(columns = {"Left-Fundus":"Image",
                                                                        "Left-Diagnostic Keywords" : "Diagnostic Keywords"})
left_normal['Eye'] = 'left'

right_normal = normal[['Patient Age','Patient Sex', 
                      'Right-Fundus', 
                      'Right-Diagnostic Keywords']].reset_index().rename(columns = {"Right-Fundus":"Image",
                                                                        "Right-Diagnostic Keywords" : "Diagnostic Keywords"})
right_normal['Eye'] = 'right'

normal = pd.concat([left_normal, right_normal]).reset_index(drop = True)
normal_patient = pd.merge(left_normal,right_normal, on = ['ID', 'Patient Age', 'Patient Sex'], how = 'outer', suffixes = ['_left', '_right'])
normal['split'] = None
normal

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,1,57,Male,1_left.jpg,normal fundus,left,
1,8,59,Male,8_left.jpg,normal fundus,left,
2,84,51,Female,84_left.jpg,normal fundus,left,
3,191,51,Female,191_left.jpg,normal fundus,left,
4,394,63,Male,394_left.jpg,normal fundus,left,
...,...,...,...,...,...,...,...
2275,3485,70,Male,3485_right.jpg,normal fundus,right,
2276,4149,55,Male,4149_right.jpg,low image quality,right,
2277,4290,51,Male,4290_right.jpg,normal fundus,right,
2278,4571,51,Male,4571_right.jpg,normal fundus,right,


In [22]:
# test train val split
normal_train, normal_test = train_test_split(normal, random_state = RANDOM_SEED, test_size = TEST_RATIO)
normal_train, normal_val = train_test_split(normal_train, random_state= RANDOM_SEED, test_size = VAL_RATIO)

train_ind = normal_train.index
test_ind = normal_test.index
val_ind = normal_val.index

normal.loc[train_ind, 'split'] = 'train'
normal.loc[test_ind, 'split'] = 'test'
normal.loc[val_ind, 'split'] = 'val'

normal

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,1,57,Male,1_left.jpg,normal fundus,left,val
1,8,59,Male,8_left.jpg,normal fundus,left,train
2,84,51,Female,84_left.jpg,normal fundus,left,train
3,191,51,Female,191_left.jpg,normal fundus,left,test
4,394,63,Male,394_left.jpg,normal fundus,left,val
...,...,...,...,...,...,...,...
2275,3485,70,Male,3485_right.jpg,normal fundus,right,train
2276,4149,55,Male,4149_right.jpg,low image quality,right,test
2277,4290,51,Male,4290_right.jpg,normal fundus,right,test
2278,4571,51,Male,4571_right.jpg,normal fundus,right,val


In [23]:
normal.to_csv(CSV_LOC + "normal_images_leakage.csv", index = False)

### Without leakage

In [24]:
normal_patient.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Image_left,Diagnostic Keywords_left,Eye_left,Image_right,Diagnostic Keywords_right,Eye_right
0,1,57,Male,1_left.jpg,normal fundus,left,1_right.jpg,normal fundus,right
1,8,59,Male,8_left.jpg,normal fundus,left,8_right.jpg,normal fundus,right
2,84,51,Female,84_left.jpg,normal fundus,left,84_right.jpg,normal fundus,right
3,191,51,Female,191_left.jpg,normal fundus,left,191_right.jpg,normal fundus,right
4,394,63,Male,394_left.jpg,normal fundus,left,394_right.jpg,normal fundus,right


In [25]:
normal_train, normal_test = train_test_split(normal_patient, random_state = RANDOM_SEED, test_size = TEST_RATIO)
normal_train, normal_val = train_test_split(normal_train, random_state= RANDOM_SEED, test_size = VAL_RATIO)

train_ind = normal_train.ID.unique()
test_ind = normal_test.ID.unique()
val_ind = normal_val.ID.unique()

train_ind = normal[normal['ID'].isin(train_ind)].index
test_ind = normal[normal['ID'].isin(test_ind)].index
val_ind = normal[normal['ID'].isin(val_ind)].index

normal.loc[train_ind, 'split'] = 'train'
normal.loc[test_ind, 'split'] = 'test'
normal.loc[val_ind, 'split'] = 'val'

normal.to_csv(CSV_LOC + "normal_images_no_leakage.csv", index = False)

normal

Unnamed: 0,ID,Patient Age,Patient Sex,Image,Diagnostic Keywords,Eye,split
0,1,57,Male,1_left.jpg,normal fundus,left,train
1,8,59,Male,8_left.jpg,normal fundus,left,train
2,84,51,Female,84_left.jpg,normal fundus,left,train
3,191,51,Female,191_left.jpg,normal fundus,left,val
4,394,63,Male,394_left.jpg,normal fundus,left,train
...,...,...,...,...,...,...,...
2275,3485,70,Male,3485_right.jpg,normal fundus,right,train
2276,4149,55,Male,4149_right.jpg,low image quality,right,train
2277,4290,51,Male,4290_right.jpg,normal fundus,right,val
2278,4571,51,Male,4571_right.jpg,normal fundus,right,train


In [26]:
normal['Diagnostic Keywords'].value_counts()

normal fundus                    2056
lens dust，normal fundus           174
normal fundus，lens dust            43
low image quality                   3
normal fundus，normal fundus         2
lens dust，lens dust，lens dust       1
lens dust，lens dust                 1
Name: Diagnostic Keywords, dtype: int64

In [27]:
len(normal), len(diabetes)

(2280, 1820)