In [1]:
%run preprocess_functions.ipynb

In [2]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Selecting classes

In [3]:
data = pd.read_csv('combined_dataset.csv')
data

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,macular_edema,scar,nevus,amd,...,hypertensive_retinopathy,drusens,hemorrhage,retinal_detachment,myopic_fundus,other,data_source,normal_eye,glaucoma,cataract
0,img00001.jpg,1,48.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,BRSET,0,0,0
1,img00002.jpg,1,48.0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,BRSET,0,0,0
2,img00003.jpg,2,18.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,BRSET,0,0,0
3,img00004.jpg,2,18.0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,BRSET,0,0,0
4,img00005.jpg,3,22.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,BRSET,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17783,4689_right.jpg,13214,54.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,ODIR,1,0,0
17784,4690_left.jpg,13215,57.0,1,2,1,0,0,0,0,...,0,0,0,0,0,0,ODIR,0,0,0
17785,4690_right.jpg,13215,57.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,ODIR,0,0,0
17786,4784_left.jpg,13309,58.0,1,2,0,0,0,0,1,...,1,0,0,0,0,0,ODIR,0,0,0


In [4]:
data.columns

Index(['image_id', 'patient_id', 'patient_age', 'patient_sex', 'exam_eye',
       'diabetic_retinopathy', 'macular_edema', 'scar', 'nevus', 'amd',
       'vascular_occlusion', 'hypertensive_retinopathy', 'drusens',
       'hemorrhage', 'retinal_detachment', 'myopic_fundus', 'other',
       'data_source', 'normal_eye', 'glaucoma', 'cataract'],
      dtype='object')

In [5]:
display(data[['diabetic_retinopathy', 'macular_edema', 'scar', 'nevus', 'amd',
       'vascular_occlusion', 'hypertensive_retinopathy', 'drusens',
       'hemorrhage', 'retinal_detachment', 'myopic_fundus', 'other', 
       'normal_eye', 'glaucoma', 'cataract']].value_counts())

diabetic_retinopathy  macular_edema  scar  nevus  amd  vascular_occlusion  hypertensive_retinopathy  drusens  hemorrhage  retinal_detachment  myopic_fundus  other  normal_eye  glaucoma  cataract
0                     0              0     0      0    0                   0                         0        0           0                   0              0      0           0         0           6996
                                                                                                                                                                    1           0         0           3094
1                     0              0     0      0    0                   0                         0        0           0                   0              0      0           0         0           2050
0                     0              0     0      0    0                   0                         1        0           0                   0              0      0           0         0         

In [6]:
dataset_nn = data[['image_id', 'patient_id', 'patient_age', 'patient_sex', 'exam_eye', 'data_source',
                     'diabetic_retinopathy', 'amd', 'hypertensive_retinopathy', 'normal_eye', 'glaucoma', 'cataract']]
classes = ['diabetic_retinopathy', 'amd', 'hypertensive_retinopathy', 'normal_eye', 'glaucoma', 'cataract']

In [7]:
display(dataset_nn[classes].value_counts())

diabetic_retinopathy  amd  hypertensive_retinopathy  normal_eye  glaucoma  cataract
0                     0    0                         0           0         0           10815
                                                     1           0         0            3094
1                     0    0                         0           0         0            2378
0                     1    0                         0           0         0             468
                      0    0                         0           0         1             292
                           1                         0           0         0             279
                           0                         0           1         0             268
1                     0    1                         0           0         0              87
                           0                         0           1         0              34
                      1    0                         0           0         0   

In [8]:
#delete rows where all classes are zero
dataset_nn = dataset_nn[dataset_nn.iloc[:, 6:12].eq(1).any(axis=1)]

In [9]:
display(dataset_nn[classes].value_counts())

diabetic_retinopathy  amd  hypertensive_retinopathy  normal_eye  glaucoma  cataract
0                     0    0                         1           0         0           3094
1                     0    0                         0           0         0           2378
0                     1    0                         0           0         0            468
                      0    0                         0           0         1            292
                           1                         0           0         0            279
                           0                         0           1         0            268
1                     0    1                         0           0         0             87
                           0                         0           1         0             34
                      1    0                         0           0         0             23
                      0    0                         0           0         1            

# deleting duplicates

In [10]:
dropped_duplicates_dataset = dataset_nn.drop_duplicates(subset=['exam_eye', 'patient_id'])

In [11]:
display(dropped_duplicates_dataset.shape, dataset_nn.shape)

(6969, 12)

(6973, 12)

In [12]:
patient_duplicate_ids = dataset_nn[~(dataset_nn.index.isin(dropped_duplicates_dataset.index.to_list()))]['patient_id'].to_list()

duplicates

In [13]:
dataset_nn[dataset_nn['patient_id'].isin(patient_duplicate_ids)]

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,data_source,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,glaucoma,cataract
1547,img01866.jpg,982,70.0,1,1,BRSET,0,0,1,0,0,0
1548,img01867.jpg,982,70.0,1,1,BRSET,0,0,1,0,0,0
2708,img03675.jpg,1952,64.0,1,2,BRSET,0,0,1,0,0,0
2709,img03676.jpg,1952,64.0,1,2,BRSET,0,0,1,0,0,0
3406,img04819.jpg,2547,41.0,2,2,BRSET,1,0,0,0,0,0
3407,img04820.jpg,2547,41.0,2,2,BRSET,1,0,0,0,0,0
9278,img14137.jpg,7429,72.0,2,1,BRSET,1,0,0,0,0,0
9279,img14138.jpg,7429,72.0,2,1,BRSET,1,0,0,0,0,0


dropped duplicates

In [14]:
dataset_nn[~(dataset_nn.index.isin(dropped_duplicates_dataset.index.to_list()))]

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,data_source,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,glaucoma,cataract
1548,img01867.jpg,982,70.0,1,1,BRSET,0,0,1,0,0,0
2709,img03676.jpg,1952,64.0,1,2,BRSET,0,0,1,0,0,0
3407,img04820.jpg,2547,41.0,2,2,BRSET,1,0,0,0,0,0
9279,img14138.jpg,7429,72.0,2,1,BRSET,1,0,0,0,0,0


In [15]:
dataset_nn = dropped_duplicates_dataset

# dropping rows with low value count and multi-labeled

In [16]:
sum_ones = dataset_nn[classes].sum(axis=1)
mask = sum_ones <= 1
filtered_data = dataset_nn[mask]
filtered_data[classes].value_counts()
dataset_nn = filtered_data

# case of row structure: patient info, image, exam eye info, diagnosis

In [17]:
CASE_1_OUTPUT_FILENAME = "final_one_eye_dataset.csv"
dataset_nn.to_csv(CASE_1_OUTPUT_FILENAME, index=False)

# case of row structure: left image, right image, patient info, left eye diagnosis, right eye diagnosis

In [18]:
dataset_nn["exam_eye"].value_counts()

exam_eye
2    3402
1    3373
Name: count, dtype: int64

creating two datasets for each eye and joining them on patient id, so each row consists of patient_id, their age, sex and image ids for both eyes

In [19]:
left_eye = dataset_nn[dataset_nn["exam_eye"] == 1]
right_eye = dataset_nn[dataset_nn["exam_eye"] == 2]
dataset_nn_v_2 = pd.merge(left_eye, right_eye, on='patient_id', how='inner', suffixes=['_left', '_right'])
dataset_nn_v_2.drop(columns=['patient_age_right', 'patient_sex_right', 'data_source_left', 'exam_eye_left', 'exam_eye_right'], inplace=True)
dataset_nn_v_2.sample(10)

Unnamed: 0,image_id_left,patient_id,patient_age_left,patient_sex_left,diabetic_retinopathy_left,amd_left,hypertensive_retinopathy_left,normal_eye_left,glaucoma_left,cataract_left,image_id_right,data_source_right,diabetic_retinopathy_right,amd_right,hypertensive_retinopathy_right,normal_eye_right,glaucoma_right,cataract_right
2345,3994_right.jpg,12519,70.0,2,1,0,0,0,0,0,3994_left.jpg,ODIR,1,0,0,0,0,0
1370,2514_right.jpg,11039,62.0,1,0,0,0,1,0,0,2514_left.jpg,ODIR,0,0,0,1,0,0
939,1413_right.jpg,9938,68.0,2,0,0,0,0,1,0,1413_left.jpg,ODIR,1,0,0,0,0,0
493,161_right.jpg,8686,49.0,1,1,0,0,0,0,0,161_left.jpg,ODIR,1,0,0,0,0,0
974,1478_right.jpg,10003,35.0,1,0,0,0,1,0,0,1478_left.jpg,ODIR,0,0,0,0,1,0
626,533_right.jpg,9058,69.0,1,1,0,0,0,0,0,533_left.jpg,ODIR,1,0,0,0,0,0
397,img14613.jpg,7669,50.0,2,1,0,0,0,0,0,img14614.jpg,BRSET,1,0,0,0,0,0
882,1259_right.jpg,9784,56.0,1,0,0,0,0,1,0,1259_left.jpg,ODIR,0,0,0,0,1,0
1446,2590_right.jpg,11115,50.0,2,0,0,0,1,0,0,2590_left.jpg,ODIR,0,0,0,1,0,0
649,578_right.jpg,9103,42.0,1,1,0,0,0,0,0,578_left.jpg,ODIR,1,0,0,0,0,0


patient ids that have been omitted

In [20]:
only_in_df1 = dataset_nn[~dataset_nn['patient_id'].isin(dataset_nn_v_2['patient_id'])]
only_in_df1

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,data_source,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,glaucoma,cataract
69,img00070.jpg,36,39.0,1,1,BRSET,1,0,0,0,0,0
71,img00072.jpg,37,20.0,1,1,BRSET,1,0,0,0,0,0
81,img00082.jpg,42,23.0,2,1,BRSET,1,0,0,0,0,0
149,img00150.jpg,76,19.0,1,1,BRSET,1,0,0,0,0,0
167,img00168.jpg,85,24.0,2,1,BRSET,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
17550,4551_left.jpg,13076,53.0,1,2,ODIR,1,0,0,0,0,0
17605,4580_right.jpg,13105,68.0,1,1,ODIR,1,0,0,0,0,0
17620,4588_left.jpg,13113,64.0,2,2,ODIR,1,0,0,0,0,0
17646,4601_left.jpg,13126,36.0,1,2,ODIR,1,0,0,0,0,0


In [21]:
dataset_nn_v_2.rename(columns={"patient_age_left":"patient_age", "patient_sex_left": "patient_sex", "data_source_right": "data_source"}, errors="raise", inplace=True)
dataset_nn_v_2 = dataset_nn_v_2[['patient_id', 'patient_age', 'patient_sex', 'data_source', 
                             'image_id_left',
                             'diabetic_retinopathy_left', 'amd_left',
                             'hypertensive_retinopathy_left', 'normal_eye_left', 'glaucoma_left', 'cataract_left', 
                             'image_id_right',
                             'diabetic_retinopathy_right', 'amd_right',
                             'hypertensive_retinopathy_right', 'normal_eye_right', 'glaucoma_right',
                             'cataract_right']]

In [22]:
dataset_nn_v_2.head()

Unnamed: 0,patient_id,patient_age,patient_sex,data_source,image_id_left,diabetic_retinopathy_left,amd_left,hypertensive_retinopathy_left,normal_eye_left,glaucoma_left,cataract_left,image_id_right,diabetic_retinopathy_right,amd_right,hypertensive_retinopathy_right,normal_eye_right,glaucoma_right,cataract_right
0,11,21.0,2,BRSET,img00021.jpg,1,0,0,0,0,0,img00022.jpg,1,0,0,0,0,0
1,13,24.0,2,BRSET,img00025.jpg,1,0,0,0,0,0,img00026.jpg,1,0,0,0,0,0
2,25,28.0,2,BRSET,img00049.jpg,1,0,0,0,0,0,img00050.jpg,1,0,0,0,0,0
3,31,27.0,1,BRSET,img00060.jpg,1,0,0,0,0,0,img00061.jpg,1,0,0,0,0,0
4,38,37.0,1,BRSET,img00074.jpg,1,0,0,0,0,0,img00075.jpg,1,0,0,0,0,0


writing to csv

In [23]:
CASE_2_OUPUT_FILENAME = 'final_two_eye_dataset.csv'
dataset_nn_v_2.to_csv(CASE_2_OUPUT_FILENAME, index=False) 

# Adding images from the dataset to directory

In [38]:
def directory_creation(directory_name, dataset, key='one_image_in_row'): #key can be "two_images_in_row"
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)
    if key == 'one_image_in_row':
        for _, row in dataset.iterrows():
            image_id = row['image_id']
            data_source = row['data_source']
            image_path = find_img_path(image_id, data_source)
            img = cv2.imread(image_path)
            cv2.imwrite(os.path.join(directory_name, image_id), img)
    else:
        for _, row in dataset.iterrows():
            left_image_id = row['image_id_left']
            right_image_id = row['image_id_right']
            data_source = row['data_source']
            left_image_path = find_img_path(left_image_id, data_source)
            right_image_path = find_img_path(right_image_id, data_source)
            left_image = cv2.imread(left_image_path)
            right_image = cv2.imread(right_image_path)
            cv2.imwrite(os.path.join(directory_name, left_image_id), left_image)
            cv2.imwrite(os.path.join(directory_name, right_image_id), right_image)
    

In [40]:
data_1 = pd.read_csv("final_one_eye_dataset.csv")
directory_name = "one_eye_images"
directory_creation(directory_name, data_1)

# Adding preprocessed images to the directory

In [41]:
dataset = pd.read_csv("final_one_eye_dataset.csv")
initial_directory_name = "one_eye_images"
destination_directory_name = "preprocessed_images_v4"

if not os.path.exists(destination_directory_name):
    os.makedirs(destination_directory_name)
for _, row in dataset.iterrows():
    image = cv2.imread(os.path.join(initial_directory_name, row['image_id']))
    processed_image = preprocess(image)
    cv2.imwrite(os.path.join(destination_directory_name, row["image_id"]), processed_image)


: 