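This notebook augments the training split of a local image dataset, combines the augmented images with the originals, and uploads the resulting train/validation/test splits to the Hugging Face Hub.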

In [1]:
import ast
import os

import numpy as np
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value
import albumentations as A
import cv2

  check_for_updates()


In [2]:
# Root folder of the local dataset. The trailing slash matters: later cells build
# paths by plain string concatenation (e.g. data_path + "train/metadata.csv").
data_path = "dataset_segmented/"

In [29]:
# Read the metadata table of each split (train / valid / test) in one pass.
train_df, valid_df, test_df = (
    pd.read_csv(data_path + split_dir + "/metadata.csv")
    for split_dir in ("train", "valid", "test")
)

In [30]:
# Preview the raw training metadata: one row per image with its filename and
# its label vector stored as a string.
train_df

Unnamed: 0,filename,labels
0,train_image_00000.png,"[0.0, 0.0, 1.0]"
1,train_image_00001.png,"[1.0, 1.0, 1.0]"
2,train_image_00002.png,"[0.0, 1.0, 0.0]"
3,train_image_00003.png,"[0.0, 1.0, 0.0]"
4,train_image_00004.png,"[0.0, 1.0, 1.0]"
...,...,...
614,train_image_00614.png,"[1.0, 1.0, 1.0]"
615,train_image_00615.png,"[1.0, 1.0, 0.0]"
616,train_image_00616.png,"[1.0, 0.0, 1.0]"
617,train_image_00617.png,"[1.0, 1.0, 1.0]"


In [31]:
# Build the full path of every image and drop rows whose file is missing
def add_image_path(df, split):
    """Return a new frame with an ``image`` path column next to ``labels``.

    The path is ``data_path + split + "/" + filename``. Rows whose file does not
    exist on disk are dropped. The input frame is left untouched, so re-running
    the cell (or displaying the raw metadata later) still shows the original
    columns.

    Args:
        df: Metadata frame with ``filename`` and ``labels`` columns.
        split: Name of the split sub-folder under ``data_path`` (e.g. ``"train"``).

    Returns:
        An independent DataFrame with columns ``['image', 'labels']`` that keeps
        the original index labels of the surviving rows.
    """
    # assign() works on a copy, so the caller's frame does not gain an 'image' column.
    with_paths = df.assign(image=data_path + split + "/" + df['filename'])

    # astype(bool) keeps the mask a valid boolean indexer even for an empty frame.
    file_exists = with_paths['image'].map(os.path.exists).astype(bool)

    # Explicit copy so later column assignments do not raise SettingWithCopyWarning.
    return with_paths.loc[file_exists, ['image', 'labels']].copy()

# Resolve image paths for each split, pairing every metadata frame with its folder name.
train_dataset_df, valid_dataset_df, test_dataset_df = (
    add_image_path(frame, split_name)
    for frame, split_name in (
        (train_df, "train"),
        (valid_df, "valid"),
        (test_df, "test"),
    )
)

In [32]:
# Preview the training frame after path resolution: 'image' now holds the path
# relative to the working directory.
train_dataset_df

Unnamed: 0,image,labels
0,dataset_segmented/train/train_image_00000.png,"[0.0, 0.0, 1.0]"
1,dataset_segmented/train/train_image_00001.png,"[1.0, 1.0, 1.0]"
2,dataset_segmented/train/train_image_00002.png,"[0.0, 1.0, 0.0]"
3,dataset_segmented/train/train_image_00003.png,"[0.0, 1.0, 0.0]"
4,dataset_segmented/train/train_image_00004.png,"[0.0, 1.0, 1.0]"
...,...,...
614,dataset_segmented/train/train_image_00614.png,"[1.0, 1.0, 1.0]"
615,dataset_segmented/train/train_image_00615.png,"[1.0, 1.0, 0.0]"
616,dataset_segmented/train/train_image_00616.png,"[1.0, 0.0, 1.0]"
617,dataset_segmented/train/train_image_00617.png,"[1.0, 1.0, 1.0]"


In [7]:
# Augmentation pipeline applied to every generated copy.
# Each transform is given p=0.5, so a copy receives a random subset of them.
# NOTE(review): no random seed is set anywhere in this notebook, so the generated
# images will differ between runs.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=15, p=0.5),  # presumably +/-15 degrees — confirm against the albumentations docs
    A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.5),
    A.HueSaturationValue(hue_shift_limit=15, sat_shift_limit=25, val_shift_limit=15, p=0.5),
    A.RandomGamma(gamma_limit=(90, 110), p=0.5),
    # NOTE(review): the scale (and availability) of var_limit differs between
    # albumentations releases; confirm that (0.0, 0.01) produces visible noise
    # with the installed version.
    A.GaussNoise(var_limit=(0.0, 0.01), p=0.5)
])

In [8]:
def augment_images(df, transform, num_augmentations=8, target_size=5222):
    """Write augmented copies of the images in ``df`` and return their metadata.

    The number of copies is chosen so that originals plus copies add up to
    ``target_size``: every image gets ``needed // len(df)`` copies and the first
    ``needed % len(df)`` images (by position in ``df``) get one extra. Copies are
    saved as ``aug_<i>_<original filename>`` inside ``<data_path>/augmented``.

    Args:
        df: Frame with an ``image`` column (file paths) and a ``labels`` column.
        transform: Callable invoked as ``transform(image=<RGB array>)`` that
            returns a mapping with an ``'image'`` entry.
        num_augmentations: Unused; kept only so existing calls keep working.
        target_size: Desired total number of rows (originals + augmented copies).

    Returns:
        DataFrame with columns ``['image', 'labels']``, one row per written copy.
        It is empty when ``df`` is empty or already has at least ``target_size`` rows.

    Raises:
        FileNotFoundError: If a source image cannot be read.
        OSError: If an augmented image cannot be written.
    """
    augmented_images = []
    augmented_labels = []

    aug_dir = os.path.join(data_path, 'augmented')
    os.makedirs(aug_dir, exist_ok=True)

    # Work out how many copies are needed in total; never a negative amount.
    current_size = len(df)
    needed_augmentations = max(target_size - current_size, 0)

    if current_size > 0 and needed_augmentations > 0:
        augmentations_per_image, remainder = divmod(needed_augmentations, current_size)
    else:
        # Nothing to do (also avoids dividing by zero on an empty frame).
        augmentations_per_image, remainder = 0, 0

    # Use the row *position*, not the index label: after filtering, index labels
    # can have gaps, which would hand out fewer "extra" copies than required.
    for position, (image_path, labels) in enumerate(zip(df['image'], df['labels'])):
        num_aug = augmentations_per_image + (1 if position < remainder else 0)
        if num_aug <= 0:
            continue

        # Decode the source once per image instead of once per copy.
        source_bgr = cv2.imread(image_path)
        if source_bgr is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        source_rgb = cv2.cvtColor(source_bgr, cv2.COLOR_BGR2RGB)
        base_filename = os.path.basename(image_path)

        for i in range(num_aug):
            # Hand the pipeline a fresh copy so no transform can alter the source.
            augmented = transform(image=source_rgb.copy())

            aug_filename = f"aug_{i}_{base_filename}"
            aug_filepath = aug_dir + "/" + aug_filename

            # cv2.imwrite signals failure through its return value, not an exception.
            written = cv2.imwrite(
                aug_filepath,
                cv2.cvtColor(augmented['image'], cv2.COLOR_RGB2BGR),
            )
            if not written:
                raise OSError(f"Could not write augmented image: {aug_filepath}")

            augmented_images.append(aug_filepath)
            augmented_labels.append(labels)

    return pd.DataFrame({
        'image': augmented_images,
        'labels': augmented_labels
    })

In [9]:
# Generate augmented copies for the training split only; the validation and
# test frames are not passed through augmentation.
augmented_df = augment_images(train_dataset_df, transform)

Unnamed: 0,image,labels
0,dataset_segmented/augmented\aug_0_train_image_...,"[0.0, 0.0, 1.0]"
1,dataset_segmented/augmented\aug_1_train_image_...,"[0.0, 0.0, 1.0]"
2,dataset_segmented/augmented\aug_2_train_image_...,"[0.0, 0.0, 1.0]"
3,dataset_segmented/augmented\aug_3_train_image_...,"[0.0, 0.0, 1.0]"
4,dataset_segmented/augmented\aug_4_train_image_...,"[0.0, 0.0, 1.0]"
...,...,...
4598,dataset_segmented/augmented\aug_2_train_image_...,"[0.0, 1.0, 1.0]"
4599,dataset_segmented/augmented\aug_3_train_image_...,"[0.0, 1.0, 1.0]"
4600,dataset_segmented/augmented\aug_4_train_image_...,"[0.0, 1.0, 1.0]"
4601,dataset_segmented/augmented\aug_5_train_image_...,"[0.0, 1.0, 1.0]"


In [24]:
# Preview the augmented rows: each one points at a generated file and carries
# the label string of the image it was derived from.
augmented_df

Unnamed: 0,image,labels
0,dataset_segmented/augmented\aug_0_train_image_...,"[0.0, 0.0, 1.0]"
1,dataset_segmented/augmented\aug_1_train_image_...,"[0.0, 0.0, 1.0]"
2,dataset_segmented/augmented\aug_2_train_image_...,"[0.0, 0.0, 1.0]"
3,dataset_segmented/augmented\aug_3_train_image_...,"[0.0, 0.0, 1.0]"
4,dataset_segmented/augmented\aug_4_train_image_...,"[0.0, 0.0, 1.0]"
...,...,...
4598,dataset_segmented/augmented\aug_2_train_image_...,"[0.0, 1.0, 1.0]"
4599,dataset_segmented/augmented\aug_3_train_image_...,"[0.0, 1.0, 1.0]"
4600,dataset_segmented/augmented\aug_4_train_image_...,"[0.0, 1.0, 1.0]"
4601,dataset_segmented/augmented\aug_5_train_image_...,"[0.0, 1.0, 1.0]"


In [25]:
def convert_labels(df):
    """Return a copy of ``df`` whose ``labels`` column holds float32 arrays.

    Label strings such as ``"[0.0, 1.0, 1.0]"`` are parsed with
    ``ast.literal_eval``, which only accepts Python literals, so text coming
    from a CSV file can never execute code. Values that are already sequences
    or arrays are converted directly, which makes the function safe to call
    again on its own output.

    Args:
        df: Frame with a ``labels`` column of label strings, lists or arrays.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: If a label string is not a valid Python literal.
    """
    def _to_float_array(value):
        if isinstance(value, str):
            value = ast.literal_eval(value)
        return np.array(value, dtype=np.float32)

    converted = df.copy()
    converted['labels'] = converted['labels'].apply(_to_float_array)
    return converted

In [34]:
# Stack the original training rows and the augmented rows into one frame;
# ignore_index=True renumbers the rows instead of keeping the two original indexes.
combined_dataset_df = pd.concat([train_dataset_df, augmented_df], ignore_index=True)

In [35]:
# Parse the label strings of every split into float32 arrays.
combined_dataset_df, valid_dataset_df, test_dataset_df = (
    convert_labels(frame)
    for frame in (combined_dataset_df, valid_dataset_df, test_dataset_df)
)

In [None]:
# Names of the target classes (presumably in the same order as the entries of
# each label vector — confirm against the dataset's metadata).
class_names = ['Crack', 'Red-Dots', 'Toothmark']
num_classes = len(class_names)

# Dataset schema: a decoded image plus a fixed-length vector of float32 labels.
features = Features({
    'image': Image(),
    'labels': Sequence(feature=Value('float32'), length=num_classes),
})

In [37]:
# Convert each pandas frame into a Dataset with the shared schema; the pandas
# index is not carried over as a column.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame, features=features, preserve_index=False)
    for frame in (combined_dataset_df, valid_dataset_df, test_dataset_df)
)

In [44]:
# Bundle the three splits into a single DatasetDict keyed by split name.
# Note that the local "valid" folder is exposed under the key 'validation'.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [39]:
# Image processor of the google/vit-base-patch16-384 checkpoint; the preprocessing
# function below uses it to turn each image into a pixel_values tensor.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-384", use_fast=True)

In [43]:
def preprocess_function(examples):
    """Turn one dataset example into model-ready tensors.

    The example's image is passed through the module-level ``image_processor``
    and the first entry of the resulting ``pixel_values`` is kept; the example's
    labels are converted to a float tensor.

    Args:
        examples: Mapping with an ``'image'`` entry and a ``'labels'`` entry.

    Returns:
        Dict with ``'pixel_values'`` and ``'labels'`` tensors.
    """
    encoded = image_processor(examples['image'], return_tensors='pt')
    label_tensor = torch.tensor(examples['labels'], dtype=torch.float)

    # The processor output carries a leading dimension; keep only its first entry.
    return {
        'pixel_values': encoded['pixel_values'][0],
        'labels': label_tensor,
    }

In [None]:
# Run preprocess_function over every split. map() is called without batched=True,
# so the function receives a single example per call.
# NOTE(review): this rebinds dataset_dict to its processed version, so re-running
# the cell processes already-processed data again; restart from the DatasetDict
# cell if it has to be repeated.
dataset_dict = dataset_dict.map(preprocess_function)

In [None]:
# Replace the placeholder below with your own Hub repository id before running.
# NOTE(review): push_to_hub expects a repository id such as "username/dataset-name"
# rather than a full URL — confirm against the datasets documentation.
dataset_dict.push_to_hub("YOUR URL HERE")