In [1]:
%run preprocess_functions.ipynb

In [2]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Selecting classes

In [109]:
def select_classes(dataset, classes, multi_label=False):
    """
    _summary_

    Args:
        classes (_type_): classes for selecting
        dataset (_type_): input dataset
        multi_label (bool, optional): include multi label or not. Defaults to False.
    """
    all_diseases = dataset.columns.to_list()
    patient_info = ['patient_id', 'image_id', 'patient_age', 'patient_sex', 'exam_eye', 'data_source']
    for column in patient_info:
        all_diseases.remove(column)
    
    all_zero_indecies = dataset[dataset[all_diseases].eq(0).all(axis=1)].index.to_list()
    
    classes_to_drop = [disease for disease in all_diseases if disease not in classes] #selecting and deleting rows where other diseases are present
    inecies_to_drop = dataset[dataset[classes_to_drop].eq(1).any(axis=1)].index.to_list()
    inecies_to_drop.extend(all_zero_indecies) #adding rows where all diseases are 0
    result_dataset = dataset.drop(index=inecies_to_drop)
    result_dataset = result_dataset.drop(columns = classes_to_drop)
    # result_dataset = result_dataset[result_dataset[classes].eq(1).any(axis=1)] # for dropping lines with all 0
    if not multi_label:
        result_dataset = result_dataset[result_dataset[classes].sum(axis=1) == 1]
    return result_dataset

In [102]:
data = pd.read_csv('combined_dataset_ver1.csv')
data

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,macular_edema,scar,nevus,amd,...,drusens,hemorrhage,retinal_detachment,myopic_fundus,increased_cup_disc,other,normal_eye,data_source,glaucoma,cataract
0,img00001.jpg,1,48.0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,BRSET,0,0
1,img00002.jpg,1,48.0,1,2,0,0,0,0,0,...,0,0,0,0,1,0,0,BRSET,0,0
2,img00003.jpg,2,18.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,1,BRSET,0,0
3,img00004.jpg,2,18.0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,1,BRSET,0,0
4,img00005.jpg,3,22.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,BRSET,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16820,4689_right.jpg,13214,54.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,ODIR,0,0
16821,4690_left.jpg,13215,57.0,1,2,1,0,0,0,0,...,0,0,0,0,0,0,0,ODIR,0,0
16822,4690_right.jpg,13215,57.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,ODIR,0,0
16823,4784_left.jpg,13309,58.0,1,2,0,0,0,0,1,...,0,0,0,0,0,0,0,ODIR,0,0


In [115]:
classes=['diabetic_retinopathy', 'amd', 'hypertensive_retinopathy', 'normal_eye', 'glaucoma', 'cataract']
selected_data = select_classes(data, classes)
selected_data[classes].value_counts()

diabetic_retinopathy  amd  hypertensive_retinopathy  normal_eye  glaucoma  cataract
0                     0    0                         1           0         0           7807
1                     0    0                         0           0         0           1990
0                     1    0                         0           0         0            307
                      0    0                         0           0         1            292
                                                                 1         0            260
                           1                         0           0         0            169
Name: count, dtype: int64

# deleting duplicates

In [117]:
dropped_duplicates_dataset = selected_data.drop_duplicates(subset=['exam_eye', 'patient_id'])

In [118]:
display(dropped_duplicates_dataset.shape, selected_data.shape)

(10797, 12)

(10825, 12)

In [119]:
patient_duplicate_ids = selected_data[~(selected_data.index.isin(dropped_duplicates_dataset.index.to_list()))]['patient_id'].to_list()

duplicates

In [120]:
selected_data[selected_data['patient_id'].isin(patient_duplicate_ids)]

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,data_source,glaucoma,cataract
22,img00023.jpg,12,16.0,2,1,0,0,0,1,BRSET,0,0
23,img00024.jpg,12,16.0,2,1,0,0,0,1,BRSET,0,0
553,img00564.jpg,292,65.0,2,1,0,0,0,1,BRSET,0,0
554,img00565.jpg,292,65.0,2,1,0,0,0,1,BRSET,0,0
695,img00724.jpg,375,19.0,1,2,0,0,0,1,BRSET,0,0
696,img00725.jpg,375,19.0,1,2,0,0,0,1,BRSET,0,0
748,img00780.jpg,404,53.0,2,1,0,0,0,1,BRSET,0,0
749,img00781.jpg,404,53.0,2,1,0,0,0,1,BRSET,0,0
817,img00858.jpg,445,33.0,2,1,0,0,0,1,BRSET,0,0
818,img00859.jpg,445,33.0,2,1,0,0,0,1,BRSET,0,0


dropped duplicates

In [121]:
selected_data[~(selected_data.index.isin(dropped_duplicates_dataset.index.to_list()))]

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,data_source,glaucoma,cataract
23,img00024.jpg,12,16.0,2,1,0,0,0,1,BRSET,0,0
554,img00565.jpg,292,65.0,2,1,0,0,0,1,BRSET,0,0
696,img00725.jpg,375,19.0,1,2,0,0,0,1,BRSET,0,0
749,img00781.jpg,404,53.0,2,1,0,0,0,1,BRSET,0,0
818,img00859.jpg,445,33.0,2,1,0,0,0,1,BRSET,0,0
826,img00867.jpg,449,64.0,1,1,0,0,0,1,BRSET,0,0
875,img00917.jpg,474,81.0,2,1,0,0,0,1,BRSET,0,0
1390,img01849.jpg,972,63.0,2,1,0,0,0,1,BRSET,0,0
1399,img01867.jpg,982,70.0,1,1,0,0,1,0,BRSET,0,0
1431,img01916.jpg,1010,40.0,2,1,0,0,0,1,BRSET,0,0


In [124]:
dropped_duplicates_dataset

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,data_source,glaucoma,cataract
2,img00003.jpg,2,18.0,2,1,0,0,0,1,BRSET,0,0
3,img00004.jpg,2,18.0,2,2,0,0,0,1,BRSET,0,0
4,img00005.jpg,3,22.0,1,1,0,0,0,1,BRSET,0,0
5,img00006.jpg,3,22.0,1,2,0,0,0,1,BRSET,0,0
6,img00007.jpg,4,22.0,1,1,0,0,0,1,BRSET,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16818,4688_right.jpg,13213,42.0,1,1,1,0,0,0,ODIR,0,0
16819,4689_left.jpg,13214,54.0,1,2,1,0,0,0,ODIR,0,0
16820,4689_right.jpg,13214,54.0,1,1,0,0,0,1,ODIR,0,0
16821,4690_left.jpg,13215,57.0,1,2,1,0,0,0,ODIR,0,0


# case of row structure: patient info, image, exam eye info, diagnosis

In [133]:
CASE_1_OUTPUT_FILENAME = "one_eye_dataset_ver1.csv"
dropped_duplicates_dataset.to_csv(CASE_1_OUTPUT_FILENAME, index=False)

# case of row structure: left image, right image, patient info, left eye diagnosis, right eye diagnosis

In [125]:
dropped_duplicates_dataset["exam_eye"].value_counts()

exam_eye
2    5438
1    5359
Name: count, dtype: int64

creating two datasets for each eye and joining them on patient id, so each row consists of patient_id, their age, sex and image ids for both eyes

In [126]:
left_eye = dropped_duplicates_dataset[dropped_duplicates_dataset["exam_eye"] == 1]
right_eye = dropped_duplicates_dataset[dropped_duplicates_dataset["exam_eye"] == 2]
dataset_nn_v_2 = pd.merge(left_eye, right_eye, on='patient_id', how='inner', suffixes=['_left', '_right'])
dataset_nn_v_2.drop(columns=['patient_age_right', 'patient_sex_right', 'data_source_left', 'exam_eye_left', 'exam_eye_right'], inplace=True)
dataset_nn_v_2.sample(10)

Unnamed: 0,image_id_left,patient_id,patient_age_left,patient_sex_left,diabetic_retinopathy_left,amd_left,hypertensive_retinopathy_left,normal_eye_left,glaucoma_left,cataract_left,image_id_right,diabetic_retinopathy_right,amd_right,hypertensive_retinopathy_right,normal_eye_right,data_source_right,glaucoma_right,cataract_right
1680,img15098.jpg,7917,58.0,2,0,0,0,1,0,0,img15099.jpg,0,0,0,1,BRSET,0,0
511,img04506.jpg,2389,53.0,2,0,0,0,1,0,0,img04507.jpg,0,0,0,1,BRSET,0,0
4002,4270_right.jpg,12795,53.0,2,1,0,0,0,0,0,4270_left.jpg,1,0,0,0,ODIR,0,0
1634,img14753.jpg,7740,65.0,2,0,0,0,1,0,0,img14754.jpg,0,0,0,1,BRSET,0,0
3670,3381_right.jpg,11906,65.0,2,0,0,0,1,0,0,3381_left.jpg,0,0,0,1,ODIR,0,0
3503,3213_right.jpg,11738,65.0,1,0,0,0,1,0,0,3213_left.jpg,0,0,0,1,ODIR,0,0
2663,2372_right.jpg,10897,80.0,2,0,0,0,1,0,0,2372_left.jpg,0,0,0,1,ODIR,0,0
1550,img14084.jpg,7402,63.0,2,0,0,0,1,0,0,img14085.jpg,0,0,0,1,BRSET,0,0
79,img00201.jpg,102,16.0,1,0,0,0,1,0,0,img00202.jpg,0,0,0,1,BRSET,0,0
2630,2339_right.jpg,10864,35.0,1,0,0,0,1,0,0,2339_left.jpg,0,0,0,1,ODIR,0,0


patient ids that have been omitted

In [127]:
only_in_df1 = dropped_duplicates_dataset[~dropped_duplicates_dataset['patient_id'].isin(dataset_nn_v_2['patient_id'])]
only_in_df1

Unnamed: 0,image_id,patient_id,patient_age,patient_sex,exam_eye,diabetic_retinopathy,amd,hypertensive_retinopathy,normal_eye,data_source,glaucoma,cataract
18,img00019.jpg,10,37.0,1,1,0,0,0,1,BRSET,0,0
22,img00023.jpg,12,16.0,2,1,0,0,0,1,BRSET,0,0
25,img00026.jpg,13,24.0,2,2,1,0,0,0,BRSET,0,0
28,img00029.jpg,15,26.0,2,1,0,0,0,1,BRSET,0,0
32,img00033.jpg,17,24.0,1,1,0,0,0,1,BRSET,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
16587,4551_left.jpg,13076,53.0,1,2,1,0,0,0,ODIR,0,0
16642,4580_right.jpg,13105,68.0,1,1,1,0,0,0,ODIR,0,0
16657,4588_left.jpg,13113,64.0,2,2,1,0,0,0,ODIR,0,0
16683,4601_left.jpg,13126,36.0,1,2,1,0,0,0,ODIR,0,0


In [128]:
dataset_nn_v_2.rename(columns={"patient_age_left":"patient_age", "patient_sex_left": "patient_sex", "data_source_right": "data_source"}, errors="raise", inplace=True)
dataset_nn_v_2 = dataset_nn_v_2[['patient_id', 'patient_age', 'patient_sex', 'data_source', 
                             'image_id_left',
                             'diabetic_retinopathy_left', 'amd_left',
                             'hypertensive_retinopathy_left', 'normal_eye_left', 'glaucoma_left', 'cataract_left', 
                             'image_id_right',
                             'diabetic_retinopathy_right', 'amd_right',
                             'hypertensive_retinopathy_right', 'normal_eye_right', 'glaucoma_right',
                             'cataract_right']]

In [129]:
dataset_nn_v_2.head()

Unnamed: 0,patient_id,patient_age,patient_sex,data_source,image_id_left,diabetic_retinopathy_left,amd_left,hypertensive_retinopathy_left,normal_eye_left,glaucoma_left,cataract_left,image_id_right,diabetic_retinopathy_right,amd_right,hypertensive_retinopathy_right,normal_eye_right,glaucoma_right,cataract_right
0,2,18.0,2,BRSET,img00003.jpg,0,0,0,1,0,0,img00004.jpg,0,0,0,1,0,0
1,3,22.0,1,BRSET,img00005.jpg,0,0,0,1,0,0,img00006.jpg,0,0,0,1,0,0
2,4,22.0,1,BRSET,img00007.jpg,0,0,0,1,0,0,img00008.jpg,0,0,0,1,0,0
3,5,23.0,1,BRSET,img00009.jpg,0,0,0,1,0,0,img00010.jpg,0,0,0,1,0,0
4,6,14.0,1,BRSET,img00011.jpg,0,0,0,1,0,0,img00012.jpg,0,0,0,1,0,0


writing to csv

In [132]:
CASE_2_OUPUT_FILENAME = 'two_eye_dataset_ver2.csv'
dataset_nn_v_2.to_csv(CASE_2_OUPUT_FILENAME, index=False) 

# Adding images from the dataset to directory

In [135]:
def directory_creation(directory_name, dataset, key='one_image_in_row'): #key can be "two_images_in_row"
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)
    if key == 'one_image_in_row':
        for _, row in dataset.iterrows():
            image_id = row['image_id']
            data_source = row['data_source']
            image_path = find_img_path(image_id, data_source)
            img = cv2.imread(image_path)
            cv2.imwrite(os.path.join(directory_name, image_id), img)
    else:
        for _, row in dataset.iterrows():
            left_image_id = row['image_id_left']
            right_image_id = row['image_id_right']
            data_source = row['data_source']
            left_image_path = find_img_path(left_image_id, data_source)
            right_image_path = find_img_path(right_image_id, data_source)
            left_image = cv2.imread(left_image_path)
            right_image = cv2.imread(right_image_path)
            cv2.imwrite(os.path.join(directory_name, left_image_id), left_image)
            cv2.imwrite(os.path.join(directory_name, right_image_id), right_image)
    

In [148]:
data_for_directory = pd.read_csv("one_eye_dataset_ver1.csv")
odir_data = data_for_directory[data_for_directory['data_source'] == 'ODIR']
brset_data = data_for_directory[data_for_directory['data_source'] == 'BRSET']
odir_data.to_csv("odir_data_only_one_eye.csv", index=False)
brset_data.to_csv("brset_data_only_one_eye.csv", index=False)

In [150]:
odir_directory_name = "odir_images_one_eye"
directory_creation(odir_directory_name, odir_data)

In [None]:
brset_directory_name = "brset_images_one_eye"
directory_creation(brset_directory_name, brset_data)

In [None]:
data_1 = pd.read_csv("combined_data_one_eye.csv")
directory_name = "one_eye_images_ver1"
directory_creation(directory_name, data_1)

# Adding preprocessed images to the directory

In [None]:
dataset = pd.read_csv("final_one_eye_dataset.csv")
initial_directory_name = "one_eye_images"
destination_directory_name = "preprocessed_images_v4"

if not os.path.exists(destination_directory_name):
    os.makedirs(destination_directory_name)
for _, row in dataset.iterrows():
    image = cv2.imread(os.path.join(initial_directory_name, row['image_id']))
    processed_image = preprocess(image)
    cv2.imwrite(os.path.join(destination_directory_name, row["image_id"]), processed_image)
