# Data Preprocessing

Dataset link: https://github.com/nkicsl/OIA-ODIR?tab=readme-ov-file

Original Data Information:

A structured ophthalmic database of 5,000 patients with age, color fundus photographs of the left and right eyes, and doctors' diagnostic keywords (in short, ODIR). This dataset is a "real-life" set of patient information collected by Shanggong Medical Technology Co., Ltd. from different hospitals/medical centers in China. In these institutions, fundus images are captured by various cameras on the market, such as Canon, Zeiss and Kowa, resulting in varied image resolutions. Patient identifying information will be removed. Annotations are labeled by trained human readers with quality control management. They classify each patient into eight labels, including normal (N), diabetes (D), glaucoma (G), cataract (C), AMD (A), hypertension (H), myopia (M) and other diseases/abnormalities (O), based on both eye images and, additionally, patient age.

## Image-Only Input Data Preprocessing

In [1]:
import numpy as np
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from skimage import io
import torch
from skimage import color
from torchvision import transforms
import torchvision.models as models
import ast
import time
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import cohen_kappa_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Read the annotation sheet of every split. The off-site test set is bound to
# the validation name and the on-site test set to the test name.
train_info, valid_info, test_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'./OIA-ODIR/Training Set/Annotation/training annotation (English).csv',
        r'./OIA-ODIR/Off-site Test Set/Annotation/off-site test annotation (English).csv',
        r'./OIA-ODIR/On-site Test Set/Annotation/on-site test annotation (English).csv',
    )
)

# Report the number of rows in each split.
for caption, split_frame in (
    ('Training Set Size: ', train_info),
    ('Validation Set Size: ', valid_info),
    ('Test Set Size: ', test_info),
):
    print(caption, len(split_frame))

Training Set Size:  3500
Validation Set Size:  500
Test Set Size:  1000


In [3]:
# One indicator column per diagnosis class; their row-wise values form the
# binary label vector.
class_columns = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']


def _label_and_strip_keywords(frame):
    """Add a ``Label`` column to ``frame`` and return a copy without keywords.

    ``Label`` holds, for every row, the list of that row's values in
    ``class_columns`` (in that order). The free-text diagnostic keyword
    columns of both eyes are removed from the returned frame.
    """
    label_vectors = frame[class_columns].to_numpy().tolist()
    # Wrap in a Series on the frame's index so each cell receives one whole list.
    frame['Label'] = pd.Series(label_vectors, index=frame.index)
    return frame.drop(columns=['Left-Diagnostic Keywords', 'Right-Diagnostic Keywords'])


train_info = _label_and_strip_keywords(train_info)
valid_info = _label_and_strip_keywords(valid_info)
test_info = _label_and_strip_keywords(test_info)

In [4]:
# Class EDA: number of positive annotations per class in every split.
class_counts_df = pd.DataFrame(columns=class_columns, index=['Train', 'Valid', 'Test'])

# Fill the table one split (row) at a time.
split_frames = {'Train': train_info, 'Valid': valid_info, 'Test': test_info}
for split_name, split_frame in split_frames.items():
    for class_name in class_columns:
        class_counts_df.loc[split_name, class_name] = np.sum(split_frame[class_name])

# Show the resulting table.
print(class_counts_df)

          N     D    G    C    A    H    M    O
Train  1140  1128  215  212  164  103  174  979
Valid   162   163   32   31   25   16   23  136
Test    324   327   58   65   49   30   46  275


In [5]:
def _count_three_plus(labels):
    """Count the label vectors in ``labels`` that have three or more positives."""
    return sum(np.sum(label_vector) >= 3 for label_vector in labels)


# Count the heavily multi-labelled rows of each split, then report them.
train_three_class_count = _count_three_plus(train_info['Label'])
print("Train - Instances with 3 classes:", train_three_class_count)

valid_three_class_count = _count_three_plus(valid_info['Label'])
print("Valid - Instances with 3 classes:", valid_three_class_count)

test_three_class_count = _count_three_plus(test_info['Label'])
print("Test - Instances with 3 classes:", test_three_class_count)

Train - Instances with 3 classes: 29
Valid - Instances with 3 classes: 5
Test - Instances with 3 classes: 8


In [6]:
# Drop instances with 3 classes or more
def _three_plus_index_labels(frame, min_labels=3):
    """Return the index labels of rows whose ``Label`` has >= ``min_labels`` positives.

    ``DataFrame.drop`` removes rows by index *label*, not by position, so the
    labels are taken from the frame's own index. Positions from ``enumerate``
    coincide with the labels only while the index is a default ``RangeIndex``.
    """
    positives_per_row = frame['Label'].apply(np.sum)
    return frame.index[positives_per_row >= min_labels].tolist()


train_indices_to_drop = _three_plus_index_labels(train_info)
train_info_filtered = train_info.drop(index=train_indices_to_drop).reset_index(drop=True)

valid_indices_to_drop = _three_plus_index_labels(valid_info)
valid_info_filtered = valid_info.drop(index=valid_indices_to_drop).reset_index(drop=True)

test_indices_to_drop = _three_plus_index_labels(test_info)
test_info_filtered = test_info.drop(index=test_indices_to_drop).reset_index(drop=True)

# Save new dfs (index=False: the freshly reset index is not written out)
train_info_filtered.to_csv(r'./OIA-ODIR/Training Set/Annotation/training_annotation_filtered.csv', index=False)
valid_info_filtered.to_csv(r'./OIA-ODIR/Off-site Test Set/Annotation/validation_annotation_filtered.csv', index=False)
test_info_filtered.to_csv(r'./OIA-ODIR/On-site Test Set/Annotation/testing_annotation_filtered.csv', index=False)

In [7]:
# Class EDA after the rows with three or more classes were removed.
class_counts_df = pd.DataFrame(columns=class_columns, index=['Train', 'Valid', 'Test'])

# Pair every row name of the table with its filtered split and fill it in.
filtered_splits = (train_info_filtered, valid_info_filtered, test_info_filtered)
for row_name, filtered_frame in zip(class_counts_df.index, filtered_splits):
    for class_name in class_columns:
        class_counts_df.loc[row_name, class_name] = np.sum(filtered_frame[class_name])

# Show the resulting table.
print(class_counts_df)

          N     D    G    C    A   H    M    O
Train  1140  1105  200  203  161  99  168  952
Valid   162   160   30   30   22  15   23  131
Test    324   320   56   63   45  27   46  269


In [8]:
# Locations of the filtered annotation CSVs (train / validation / test).
train_info_path, valid_info_path, test_info_path = (
    r'./OIA-ODIR/Training Set/Annotation/training_annotation_filtered.csv',
    r'./OIA-ODIR/Off-site Test Set/Annotation/validation_annotation_filtered.csv',
    r'./OIA-ODIR/On-site Test Set/Annotation/testing_annotation_filtered.csv',
)

In [9]:
# Create random subset for hyperparameter tuning
train_df = pd.read_csv(train_info_path)
valid_df = pd.read_csv(valid_info_path)
subset_ratio = 0.05


def _sample_per_label(frame, frac, seed=42):
    """Draw ``frac`` of the rows from every distinct ``Label`` value of ``frame``.

    Each group is sampled independently with the same ``seed`` and the pieces
    are concatenated in group order, so the selection is reproducible. The
    groups are iterated explicitly instead of using ``groupby(...).apply(...)``,
    because ``apply`` operating on the grouping column is deprecated and a
    later pandas would leave ``Label`` out of the result. All columns,
    ``Label`` included, are kept here.

    Returns an empty frame with the same columns when ``frame`` has no groups.
    """
    sampled_groups = [
        group.sample(frac=frac, random_state=seed)
        for _, group in frame.groupby('Label')
    ]
    if not sampled_groups:
        # pd.concat refuses an empty list; hand back an empty slice instead.
        return frame.iloc[0:0]
    return pd.concat(sampled_groups)


train_subset = _sample_per_label(train_df, subset_ratio)
valid_subset = _sample_per_label(valid_df, subset_ratio)

print('Length of Train subset: ', len(train_subset))
print('Length of Validation subset: ', len(valid_subset))

train_subset.to_csv(r"./OIA-ODIR/Training Set/Annotation/train_subset.csv", index=False)
valid_subset.to_csv(r"./OIA-ODIR/Off-site Test Set/Annotation/valid_subset.csv", index=False)

Length of Train subset:  173
Length of Validation subset:  23


  train_subset = train_df.groupby('Label', group_keys=False).apply(lambda x: x.sample(frac=subset_ratio, random_state=42))
  valid_subset = valid_df.groupby('Label', group_keys=False).apply(lambda x: x.sample(frac=subset_ratio, random_state=42))


## Image + Tabular Data Preprocessing

In [10]:
# Filtered annotation CSVs used as input for the image + tabular pipeline.
train, valid, test = (
    r'./OIA-ODIR/Training Set/Annotation/training_annotation_filtered.csv',
    r'./OIA-ODIR/Off-site Test Set/Annotation/validation_annotation_filtered.csv',
    r'./OIA-ODIR/On-site Test Set/Annotation/testing_annotation_filtered.csv',
)

In [11]:
# Reload the filtered annotations of all three splits.
train_df, valid_df, test_df = (pd.read_csv(csv_path) for csv_path in (train, valid, test))

# Print the 'Patient Sex' distribution of each split, with a blank line
# between consecutive sections.
sex_report = [('Train:', train_df), ('Validation:', valid_df), ('Test:', test_df)]
for position, (heading, split_frame) in enumerate(sex_report):
    if position > 0:
        print()
    print(heading)
    print(split_frame['Patient Sex'].value_counts())

Train:
Patient Sex
Male      1869
Female    1602
Name: count, dtype: int64

Validation:
Patient Sex
Male      266
Female    229
Name: count, dtype: int64

Test:
Patient Sex
Male      532
Female    460
Name: count, dtype: int64


In [12]:
# 0 = Female, 1 = Male
# The encoding is spelled out instead of being derived with get_dummies, so it
# cannot change with the categories that happen to occur in a given split.
# The numeric keys let already-encoded values pass through unchanged, which
# keeps a re-run of this cell harmless.
sex_codes = {'Female': 0, 'Male': 1, 0: 0, 1: 1}


def _encode_patient_sex(frame):
    """Return the ``Patient Sex`` column of ``frame`` as integer codes.

    Raises:
        ValueError: if the column holds a missing value or anything other
            than 'Female' / 'Male' (or an already-encoded 0 / 1). Such rows
            would otherwise be silently encoded as 0.
    """
    codes = frame['Patient Sex'].map(sex_codes)
    unmapped = codes.isna()
    if unmapped.any():
        unexpected = frame.loc[unmapped, 'Patient Sex'].unique().tolist()
        raise ValueError(f"Unexpected 'Patient Sex' values: {unexpected}")
    return codes.astype(int)


train_df['Patient Sex'] = _encode_patient_sex(train_df)
valid_df['Patient Sex'] = _encode_patient_sex(valid_df)
test_df['Patient Sex'] = _encode_patient_sex(test_df)

train_df

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,N,D,G,C,A,H,M,O,Label
0,0,69,0,0_left.jpg,0_right.jpg,0,0,0,1,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0]"
1,1,57,1,1_left.jpg,1_right.jpg,1,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0]"
2,2,42,1,2_left.jpg,2_right.jpg,0,1,0,0,0,0,0,1,"[0, 1, 0, 0, 0, 0, 0, 1]"
3,3,66,1,3_left.jpg,3_right.jpg,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 1]"
4,4,53,1,4_left.jpg,4_right.jpg,0,1,0,0,0,0,0,1,"[0, 1, 0, 0, 0, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,4686,63,1,4686_left.jpg,4686_right.jpg,0,1,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0]"
3467,4688,42,1,4688_left.jpg,4688_right.jpg,0,1,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0]"
3468,4689,54,1,4689_left.jpg,4689_right.jpg,0,1,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0]"
3469,4690,57,1,4690_left.jpg,4690_right.jpg,0,1,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0]"


In [13]:
# Write the sex-encoded annotation tables next to the filtered ones
# (the "_OHE" suffix stands for one hot encoded); the index is not written.
for encoded_frame, destination in (
    (train_df, r'./OIA-ODIR/Training Set/Annotation/training_annotation_filtered_OHE.csv'),
    (valid_df, r'./OIA-ODIR/Off-site Test Set/Annotation/validation_annotation_filtered_OHE.csv'),
    (test_df, r'./OIA-ODIR/On-site Test Set/Annotation/testing_annotation_filtered_OHE.csv'),
):
    encoded_frame.to_csv(destination, index=False)