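Dataset: [APTOS 2019 Blindness Detection](https://www.kaggle.com/c/aptos2019-blindness-detection/overview) competition on Kaggle — retinal images labelled with a diabetic retinopathy severity grade from 0 (No DR) to 4 (Proliferative).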

In [1]:
import sys
sys.path.append("../src")  
from imports import *
import importdata

# Import Data
# importdata.main() prepares the raw data directory (presumably downloading the
# competition files only when they are missing — confirm in src/importdata.py).
importdata.main()

# Label table for the training images.
train_csv_path = '../data/raw/aptos2019-blindness-detection/train.csv'
df_train = pd.read_csv(train_csv_path)

Data not found. Creating directory at ../data/raw...
Directory ready. Downloading data...


KeyboardInterrupt: 

In [None]:
train_image_dir = '../data/raw/aptos2019-blindness-detection/train_images/'

# Display names for the diagnosis grades stored in df_train['diagnosis'].
class_labels = {
    0: 'No DR',
    1: 'Mild',
    2: 'Moderate',
    3: 'Severe',
    4: 'Proliferative'
}

# One panel per grade, laid out left to right in a single row.
plt.figure(figsize=(20, 10))

for panel, (grade, grade_name) in enumerate(class_labels.items(), start=1):
    plt.subplot(1, len(class_labels), panel)
    plt.axis('off')

    # Use the first image listed for the current grade.
    grade_ids = df_train.loc[df_train['diagnosis'] == grade, 'id_code']
    if grade_ids.empty:
        # Keep the (blank) panel so the grid stays aligned, but say why it is empty
        # instead of failing with an out-of-bounds IndexError.
        plt.title(f"Class: {grade} ({grade_name}) - no samples")
        continue

    sample_id = grade_ids.iloc[0]
    image_path = os.path.join(train_image_dir, f"{sample_id}.png")

    # cv2.imread returns None (it does not raise) when the file is missing or
    # unreadable; fail here with the offending path rather than inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; matplotlib expects RGB
    img = cv2.resize(img, (224, 224))  # Resize for consistent display

    # Display the image
    plt.imshow(img)
    plt.title(f"Class: {grade} ({grade_name})")

plt.show()

In [None]:
# Number of training images per diagnosis grade, ordered by grade.
class_counts = df_train['diagnosis'].value_counts().sort_index()

# Plot the distribution
plt.figure(figsize=(10, 6))

# hue mirrors x so that each bar gets its own palette colour; the redundant
# legend is switched off.
ax = sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    hue=class_counts.index,
    palette="viridis",
    legend=False,
)

ax.set(
    title='Distribution of Diabetic Retinopathy Classes',
    xlabel='Diagnosis (Severity)',
    ylabel='Number of Images',
)

# Replace the numeric grade ticks with their names.
severity_names = ['No DR', 'Mild', 'Moderate', 'Severe', 'Proliferative']
plt.xticks(ticks=list(range(len(severity_names))), labels=severity_names)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# --- 1. Splitting the Data ---
# Hold out 20% of the images for validation. The split is stratified on the
# diagnosis so both subsets keep the (imbalanced) class proportions, and the
# fixed random_state makes it repeatable.
X, y = df_train['id_code'], df_train['diagnosis']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Re-pair ids and labels into one frame per split for the dataset class.
train_df = pd.DataFrame({'id_code': X_train, 'diagnosis': y_train})
val_df = pd.DataFrame({'id_code': X_val, 'diagnosis': y_val})

# Report sizes first, then the per-class proportions of each split.
split_frames = (("Training", train_df), ("Validation", val_df))
for split_name, split_df in split_frames:
    print(f"{split_name} set size: {len(split_df)}")
for split_name, split_df in split_frames:
    print(f"\n{split_name} set distribution:\n", split_df['diagnosis'].value_counts(normalize=True))


# --- 2. Defining Transformations ---
# Every image is resized to IMG_SIZE x IMG_SIZE, converted to a tensor and
# normalized. Only the training pipeline adds random augmentation (horizontal
# flip and a rotation of up to 10 degrees); validation stays deterministic.
IMG_SIZE = 224

# Per-channel statistics used by ImageNet-pretrained models.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

val_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    normalize,
])

train_transforms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    normalize,
])


# --- 3. Creating a Custom PyTorch Dataset ---
class DRDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a label table.

    Each row of ``df`` names one PNG file (``<id_code>.png`` inside
    ``image_dir``) together with its ``diagnosis`` label.

    Args:
        df: DataFrame with an ``id_code`` column (file name without the
            ``.png`` extension) and a ``diagnosis`` column (the label). Rows
            are addressed by position, so the index need not be contiguous.
        image_dir: Directory that contains the ``<id_code>.png`` files.
        transform: Optional callable applied to the RGB PIL image before it
            is returned.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A tuple ``(image, label)``: the RGB image after ``transform`` (the
            PIL image itself when no transform is set) and the row's
            ``diagnosis`` value.
        """
        # Fetch the row once instead of once per column.
        row = self.df.iloc[idx]
        img_name = os.path.join(self.image_dir, row['id_code'] + '.png')

        # PIL Image is the standard for torchvision transforms. The context
        # manager closes the underlying file deterministically; convert()
        # returns a separate image object, so it stays valid after the close.
        with Image.open(img_name) as img_file:
            image = img_file.convert('RGB')
        label = row['diagnosis']

        if self.transform:
            image = self.transform(image)

        return image, label

# Instantiate the datasets
# Both splits read from the same image directory; only the label frame and the
# transform pipeline differ.
train_dataset = DRDataset(df=train_df, image_dir=train_image_dir, transform=train_transforms)
val_dataset = DRDataset(df=val_df, image_dir=train_image_dir, transform=val_transforms)


# --- 4. Setting up DataLoaders ---
# The loaders handle batching; only the training loader shuffles.
BATCH_SIZE = 32
NUM_WORKERS = 4  # worker processes per loader
# NOTE(review): worker processes need to be able to load DRDataset; if batches
# hang or fail to start on this platform, try NUM_WORKERS = 0 — confirm.

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Smoke test: pull a single training batch and report what came back.
images, labels = next(iter(train_loader))
batch_report = (
    f"\nShape of a batch of images: {images.shape}",  # (Batch Size, Channels, Height, Width)
    f"Shape of a batch of labels: {labels.shape}",
    f"Data type of images: {images.dtype}",
    f"Labels in the first batch: {labels}",
)
for report_line in batch_report:
    print(report_line)