## Imports

In [15]:
import numpy as np                  # numpy, model for data manipulation  
import pandas as pd                 # pandas, model for data manipulation
import os, random                   # os, model for operating system
import matplotlib.pyplot as plt     # matplotlib, model for data visualization
import seaborn as sns               # seaborn, model for data visualization
import tensorflow as tf             # tensorflow, model knowed for deep learning
import keras                        # keras, model knowed for deep learning

## Read Csv

The data is loaded in a external disk and the labels are stored in the csv file.
The main point of this notebook is to create a directory where the images are grouped by class

In [45]:
#File Path
file_path = r"C:\Users\Maria\OneDrive\Ambiente de Trabalho\labels.csv"

#Data frame to store the data from the file
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,image_id,patient_id,camera,patient_age,comorbidities,diabetes_time_y,insuline,patient_sex,exam_eye,diabetes,...,amd,vascular_occlusion,hypertensive_retinopathy,drusens,hemorrhage,retinal_detachment,myopic_fundus,increased_cup_disc,other,quality
0,img00001,1,Canon CR,48.0,diabetes1,12,yes,1,1,yes,...,0,0,0,0,0,0,0,1,0,Adequate
1,img00002,1,Canon CR,48.0,diabetes1,12,yes,1,2,yes,...,0,0,0,0,0,0,0,1,0,Adequate
2,img00003,2,Canon CR,18.0,diabetes1,7,yes,2,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate
3,img00004,2,Canon CR,18.0,diabetes1,7,yes,2,2,yes,...,0,0,0,0,0,0,0,0,0,Adequate
4,img00005,3,Canon CR,22.0,diabetes1,11,yes,1,1,yes,...,0,0,0,0,0,0,0,0,0,Adequate


In [59]:
print("Images in data frame (csv) = " + str(df.shape[0]))
print ("Images in data folder = 16112")

Images in data frame (csv) = 16266
Images in data folder = 16112


## Dataframe augmentation

In [39]:
#Get needed columns 
#image_id, increased_cup_disc, quality
df = df[['image_id', 'increased_cup_disc', 'quality']]

#Get the count of eache value of quality
print(df['quality'].value_counts())

#14279 Adequate Quality
#1987 Inadequate Quality - not used for model training
df=df[df.quality != "Inadequate"]
print("\nAfter removing inadequate quality")
print(df['quality'].value_counts())


Adequate    14279
Name: quality, dtype: int64

After removing inadequate quality
Adequate    14279
Name: quality, dtype: int64


## Create directiory of images

In [40]:
#Get the count of eache value of increased_cup_disc
print(df['increased_cup_disc'].value_counts())

# 0 - Normal
# 1 - Glaucoma
# Creeate data directory
data_dir = r"D:\Datase_retina_brasil_Jose\Data"

if (os.path.exists(data_dir)):
    print("Directory already exists")
else:
    os.mkdir(data_dir)
    print("Directory created")

#Create a directory for each class
classes = ['Normal', 'Glaucoma']
for c in classes:
    path = os.path.join(data_dir, c)
    if (os.path.exists(path)):
        print("Directory already exists")
    else:
        os.mkdir(path)
        print("Directory created")

0    11439
1     2840
Name: increased_cup_disc, dtype: int64
Directory already exists
Directory already exists
Directory already exists


## Save images in folders

In [42]:
#Directory of images
import shutil


images_dir = r"D:\Datase_retina_brasil_Jose\Dataset\fundus_photos"

#Get the images from the directory
images = os.listdir(images_dir)

#For each image un the directory get the name and the class from the data frame
#Copy the image to the respective directory
#0 - Normal - \Normal
#1 - Glaucoma - \Glaucoma

for image in images:
    #Get the name of the image
    image_name = image.split('.')[0]
    #Get the class of the image
    image_class = df.loc[df['image_id'] == image_name, 'increased_cup_disc']
    
    #If image exist in data frame
    #Get the path of the directory to copy the image
    if (image_name in df['image_id'].values):
        if (image_class.values[0] == 0):
            image_dir = os.path.join(data_dir, "Normal")
        else:
            image_dir = os.path.join(data_dir, "Glaucoma")
        
        #Get the path of the image
        image_path = os.path.join(images_dir, image)
        
        #Verify if the image is already in the directory
        if (os.path.exists(os.path.join(image_dir, image))):
            print("Image already exists")
        else:
            #Copy the image to the directory
            shutil.copy(image_path, image_dir)

Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already exists
Image already

## Verify images in new folder

In [60]:
#Count images in each directory
print("Normal images: ", len(os.listdir(os.path.join(data_dir, "Normal"))))
print("Glaucoma images: ", len(os.listdir(os.path.join(data_dir, "Glaucoma"))))
print("Total images: ", len(os.listdir(os.path.join(data_dir, "Normal"))) + len(os.listdir(os.path.join(data_dir, "Glaucoma"))))

Normal images:  11312
Glaucoma images:  2819
Total images:  14131


## Verify Shape of images

In [76]:
#Get shape of the images 

#Get the path of the images
normal_images = os.listdir(os.path.join(data_dir, "Normal"))
glaucoma_images = os.listdir(os.path.join(data_dir, "Glaucoma"))

image_sizes = dict()

for image in normal_images:
    normal_image_path = os.path.join(data_dir, "Normal", image)
    #Get the shape of the image
    normal_image = plt.imread(normal_image_path)
    if len(image_sizes.keys())==0 or normal_image.shape not in image_sizes.keys():
        image_sizes.update(
            {
                normal_image.shape : 1
            }
        )
    else:
        for key in image_sizes.keys():
            if key == normal_image.shape:
                image_sizes[key] += 1
                break

for images in glaucoma_images:
    glaucoma_image_path = os.path.join(data_dir, "Glaucoma", image)
    #Get the shape of the image
    glaucoma_image = plt.imread(glaucoma_image_path)
    if len(image_sizes.keys())==0 or glaucoma_image.shape not in image_sizes.keys():
        image_sizes.update(
            {
                glaucoma_image.shape : 1
            }
        )
    else:
        for key in image_sizes.keys():
            if key == glaucoma_image.shape:
                image_sizes[key] += 1
                break


In [81]:
#Print the shape of the images and the count
for key in image_sizes.keys():
    print("Shape: " + str(key) + " - Count: " + str(image_sizes[key]))

#Write the results to a file
file1 = open(str(os.path.join("D:/Datase_retina_brasil_Jose/Data/Normal/", "InfoShape.txt")), "w")
for key in image_sizes.keys():
    file1.write("Shape: " + str(key) + " - Count: " + str(image_sizes[key]) + "\n")
file1.close()

Shape: (874, 951, 3) - Count: 316
Shape: (2304, 2984, 3) - Count: 3221
Shape: (1880, 2390, 3) - Count: 2511
Shape: (2056, 2672, 3) - Count: 3960
Shape: (1880, 2420, 3) - Count: 1304
