In [15]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image, UnidentifiedImageError
import glob

In [16]:
class CroppedZooplanktonDataset(Dataset):
    def __init__(self, csv, root_dir, transform=None):
        self.csv_path = os.path.join(root_dir, csv)
        self.annotations = pd.read_csv(self.csv_path)
        self.root_dir = root_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.annotations.index)

    
    def __getitem__(self, index):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        try:
            image = Image.open(img_path).convert('RGB')
        except (UnidentifiedImageError, OSError) as e:
            # Remove the image file
            os.remove(img_path)
            
            # Remove the corresponding row in the CSV
            self.annotations.drop(index, inplace=True)
            self.annotations.reset_index(drop=True, inplace=True)
            self.annotations.to_csv(self.csv_path, index=False)
            
            # Retry the next item
            return self.__getitem__(index + 1)
        
        cropped_image = image
        y_label = self.annotations.iloc[index, 1]
        y_label = 0 if y_label == 0 else 1
            
        if self.transform:
            cropped_image = self.transform(cropped_image)
        
        return (cropped_image, y_label)

In [17]:
input_folder = '/home/ec2-user/SageMaker/dataset/All_Images/2022-09-19'
output_folder = '/home/ec2-user/SageMaker/dataset/uncropped'

In [18]:
os.makedirs(output_folder, exist_ok=True)

In [19]:
#apply transformation only if you are extracting uncropped images.
data_transform = transforms.Compose([
    transforms.Resize((256, 256))
])

In [20]:
dataset = CroppedZooplanktonDataset(csv='/home/ec2-user/SageMaker/annotations/19/annotations_19_final.csv', root_dir=input_folder, transform=
                                   data_transform)

In [21]:
for i in range(len(dataset)):
    cropped_image, _ = dataset[i]  # cropped image
    image_name = dataset.annotations.iloc[i, 0]  # image name
    output_path = os.path.join(output_folder, image_name)  # output path
    cropped_image.save(output_path)

IndexError: index 9614 is out of bounds for axis 0 with size 9614