In [12]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image, UnidentifiedImageError
import glob

In [13]:
class CroppedZooplanktonDataset(Dataset):
    """Dataset that yields one cropped region of interest per annotation row.

    The annotation CSV is read by column *position*:

    * column 1      -- image file name, relative to ``root_dir``
    * columns 3 - 6 -- crop box: top-left x, top-left y, bottom-right x,
      bottom-right y
    * column 9      -- class name

    Args:
        csv: Path of the annotation CSV. It is joined onto ``root_dir`` with
            ``os.path.join``, so an absolute path is used unchanged.
        root_dir: Directory that contains the images.
        transform: Optional callable applied to the cropped image.
    """

    # Positional layout of the annotation CSV (see class docstring).
    _IMAGE_NAME_COL = 1
    _BOX_COLS = (3, 4, 5, 6)  # TL_x, TL_y, BR_x, BR_y
    _LABEL_COL = 9

    def __init__(self, csv, root_dir, transform=None):
        self.annotations = pd.read_csv(os.path.join(root_dir, csv))
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotation rows (one sample per row)."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(cropped_image, y_label)`` for annotation row ``index``.

        ``y_label`` is 0 when the class name is ``"marine-snow"`` and 1 for
        every other class name.
        """
        img_path = os.path.join(
            self.root_dir, self.annotations.iloc[index, self._IMAGE_NAME_COL]
        )

        # (TL_x, TL_y, BR_x, BR_y) of the area of interest, in that order.
        box = tuple(self.annotations.iloc[index, col] for col in self._BOX_COLS)

        # Open the file in a context manager so the handle is released
        # deterministically; convert() returns an independent in-memory copy,
        # so the crop stays valid after the source file is closed.
        with Image.open(img_path) as source:
            cropped_image = source.convert('RGB').crop(box)

        y_label = self.annotations.iloc[index, self._LABEL_COL]
        y_label = 0 if y_label == "marine-snow" else 1

        if self.transform:
            cropped_image = self.transform(cropped_image)

        return (cropped_image, y_label)

In [14]:
# Destination directory for the cropped images.
output_folder = '/home/ec2-user/SageMaker/dataset/cropped'
# Source directory: holds the full-size images and their annotation CSV.
input_folder = '/home/ec2-user/SageMaker/dataset//All_Images/2022-09-19'

In [15]:
# Create the output directory (and any missing parents); a no-op if it exists.
os.makedirs(name=output_folder, exist_ok=True)

In [16]:
# Apply this transformation only if you are extracting uncropped images.
# It resizes every image to 224 x 224.
data_transform = transforms.Compose([transforms.Resize((224, 224))])

In [17]:
# Build the dataset of crops for the 2022-09-19 annotation file.
dataset = CroppedZooplanktonDataset(
    csv='/home/ec2-user/SageMaker/dataset/All_Images/2022-09-19/2022-09-19_4.csv',
    root_dir=input_folder,
    transform=data_transform,
)

In [18]:
# Save every crop to output_folder. A running counter is appended to each
# file name (to all of them, not only to repeated names), so several crops
# taken from the same source image never overwrite one another.
counter = 1
for i in range(len(dataset)):
    crop, _ = dataset[i]  # the label is not needed here
    stem, suffix = os.path.splitext(dataset.annotations.iloc[i, 1])  # column 1: image name
    destination = os.path.join(output_folder, f"{stem}_{counter}{suffix}")
    counter += 1
    crop.save(destination)


Remember: the images were saved without a transform, so the transformation has to be applied in the DataLoader.

# The cells below combine all CSV files into one

In [12]:
os.getcwd()  # explicit equivalent of IPython's automagic `pwd`; also works outside single-line cells

'/home/ec2-user/SageMaker'

In [13]:
# Directory whose CSV files are combined below (note the trailing slash).
path = '/home/ec2-user/SageMaker/dataset/2022-09-08/'

In [14]:
# Collect the per-run CSV files. The combined output is written into this
# same directory as "combined.csv", so it is excluded here; otherwise a
# re-run would read it back in and duplicate every row.
all_files = [
    csv_file
    for csv_file in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(csv_file) != "combined.csv"
]

In [15]:
# Display the CSV paths that were found.
all_files

['/home/ec2-user/SageMaker/dataset/2022-09-08/2022_09_08_1.csv',
 '/home/ec2-user/SageMaker/dataset/2022-09-08/2022-09-08_2.csv']

In [16]:
# Start from an empty list of per-file DataFrames.
dataframes = list()

In [17]:
# pd.read_csv already consumes each file's header line as the column names,
# so every row of every DataFrame is data and must be kept. Slicing later
# files with df[1:] would silently drop their first record.
# Rebuilding the list here (instead of appending to it) also keeps the cell
# idempotent when it is re-run.
dataframes = [pd.read_csv(file) for file in all_files]

In [18]:
# Stack the per-file frames row-wise and renumber the index from 0.
combined_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [19]:
# Destination of the combined CSV (same directory as the input CSV files).
output_path = '/home/ec2-user/SageMaker/dataset/2022-09-08/combined.csv'

In [20]:
# Write the combined table without the DataFrame index column.
combined_df.to_csv(path_or_buf=output_path, index=False)