In [1]:
import numpy as np
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value, concatenate_datasets
import albumentations as A
from albumentations.pytorch import ToTensorV2
import PIL
import cv2

import os

  check_for_updates()


In [2]:
# Root directory of the segmented dataset. The trailing slash is required:
# later cells build file paths from this value by plain string concatenation.
data_path = "dataset_segmented/"

In [3]:
# Load the per-split metadata tables (one metadata.csv per split folder).
train_df, valid_df, test_df = (
    pd.read_csv(f"{data_path}{split_name}/metadata.csv")
    for split_name in ("train", "valid", "test")
)

In [4]:
# Inspect the raw training metadata (filename plus a stringified label vector).
train_df

Unnamed: 0,filename,labels
0,train_image_00000.png,"[0.0, 0.0, 1.0]"
1,train_image_00001.png,"[1.0, 1.0, 1.0]"
2,train_image_00002.png,"[0.0, 1.0, 0.0]"
3,train_image_00003.png,"[0.0, 1.0, 0.0]"
4,train_image_00004.png,"[0.0, 1.0, 1.0]"
...,...,...
614,train_image_00614.png,"[1.0, 1.0, 1.0]"
615,train_image_00615.png,"[1.0, 1.0, 0.0]"
616,train_image_00616.png,"[1.0, 0.0, 1.0]"
617,train_image_00617.png,"[1.0, 1.0, 1.0]"


In [5]:
# Build the on-disk path for every row and drop rows whose file is missing
def add_image_path(df, split, base_path=None):
    """Return a new frame with an ``image`` path column, keeping only existing files.

    The path of each row is ``base_path + split + "/" + filename``. The input
    frame is left untouched (no ``image`` column is added to it), and the
    result is an independent copy, so later column assignments on it do not
    act on a view of ``df``.

    Args:
        df: DataFrame with at least ``filename`` and ``labels`` columns.
        split: Name of the split sub-directory, e.g. ``"train"``.
        base_path: Directory prefix, including its trailing separator.
            Defaults to the notebook-level ``data_path``.

    Returns:
        DataFrame with exactly the columns ``image`` and ``labels``, holding
        the rows of ``df`` (original index preserved) whose image file exists.
    """
    if base_path is None:
        base_path = data_path

    image_paths = base_path + split + "/" + df['filename']
    # astype(bool) keeps the mask boolean even when the frame is empty, where
    # map() would otherwise yield an object-dtype Series that is not a valid
    # row mask.
    exists_mask = image_paths.map(os.path.exists).astype(bool)

    return df.assign(image=image_paths).loc[exists_mask, ['image', 'labels']].copy()

# Resolve image paths for each split; the split name doubles as the sub-folder.
train_dataset_df, valid_dataset_df, test_dataset_df = (
    add_image_path(metadata_frame, split_name)
    for metadata_frame, split_name in (
        (train_df, "train"),
        (valid_df, "valid"),
        (test_df, "test"),
    )
)

In [6]:
# Inspect the training frame after path resolution (image path + labels).
train_dataset_df

Unnamed: 0,image,labels
0,dataset_segmented/train/train_image_00000.png,"[0.0, 0.0, 1.0]"
1,dataset_segmented/train/train_image_00001.png,"[1.0, 1.0, 1.0]"
2,dataset_segmented/train/train_image_00002.png,"[0.0, 1.0, 0.0]"
3,dataset_segmented/train/train_image_00003.png,"[0.0, 1.0, 0.0]"
4,dataset_segmented/train/train_image_00004.png,"[0.0, 1.0, 1.0]"
...,...,...
614,dataset_segmented/train/train_image_00614.png,"[1.0, 1.0, 1.0]"
615,dataset_segmented/train/train_image_00615.png,"[1.0, 1.0, 0.0]"
616,dataset_segmented/train/train_image_00616.png,"[1.0, 0.0, 1.0]"
617,dataset_segmented/train/train_image_00617.png,"[1.0, 1.0, 1.0]"


In [7]:
# Augmentation pipeline used by augment_images; every step is configured with p=0.5.
# NOTE(review): no random seed is set in the visible cells, so the augmented
# files presumably differ between runs — confirm whether that is acceptable.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=15, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.5),
    A.HueSaturationValue(hue_shift_limit=15, sat_shift_limit=25, val_shift_limit=15, p=0.5),
    A.RandomGamma(gamma_limit=(90, 110), p=0.5),
    # NOTE(review): var_limit=(0.0, 0.01) is tiny if the installed albumentations
    # version interprets it on the 0-255 pixel scale, and newer releases appear
    # to replace var_limit with std_range — verify against the installed version.
    A.GaussNoise(var_limit=(0.0, 0.01), p=0.5)
])

In [14]:
def augment_images(df, transform, num_augmentations=8, max_size=5222):
    """Write augmented copies of every image to disk and return the combined frame.

    The rows of ``df`` come first, followed by ``num_augmentations`` rounds. In
    each round every image listed in ``df`` is read, passed through
    ``transform`` and saved as ``aug_<round>_<original basename>`` inside
    ``<data_path>/augmented`` (created if missing).

    Args:
        df: DataFrame with an ``image`` column of file paths. All other columns
            are copied unchanged onto the augmented rows.
        transform: Callable invoked as ``transform(image=rgb_array)`` that
            returns a mapping holding the result under the key ``'image'``.
        num_augmentations: Number of augmented copies to create per image.
        max_size: Upper bound on the number of returned rows; the combined
            frame is cut to its first ``max_size`` rows. ``None`` disables the
            cap. Note that all augmented files are still written to disk even
            when their rows are trimmed from the result.

    Returns:
        DataFrame (fresh 0..n-1 index) with the original rows followed by the
        augmented rows, whose ``image`` column points at the written files.

    Raises:
        FileNotFoundError: If a source image cannot be read by OpenCV.
        OSError: If an augmented image cannot be written.
    """
    # Create augmentation directory if it doesn't exist
    aug_dir = os.path.join(data_path, 'augmented')
    os.makedirs(aug_dir, exist_ok=True)

    # Keep original dataset as the first chunk
    frames = [df]
    source_paths = df['image'].tolist()

    for round_idx in range(num_augmentations):
        written_paths = []

        for source_path in source_paths:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            bgr_image = cv2.imread(source_path)
            if bgr_image is None:
                raise FileNotFoundError(f"Could not read image: {source_path}")

            # OpenCV loads BGR; the transform is fed RGB.
            augmented = transform(image=cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))

            aug_filename = f"aug_{round_idx}_{os.path.basename(source_path)}"
            aug_filepath = aug_dir + '/' + aug_filename

            # cv2.imwrite reports failure through its boolean return value.
            saved = cv2.imwrite(aug_filepath, cv2.cvtColor(augmented['image'], cv2.COLOR_RGB2BGR))
            if not saved:
                raise OSError(f"Could not write augmented image: {aug_filepath}")
            written_paths.append(aug_filepath)

        # Same rows as df, with the image column pointing at this round's files.
        frames.append(df.assign(image=written_paths))

    # Combine all datasets
    combined_dataset_df = pd.concat(frames, ignore_index=True)

    # Trim to desired size if necessary
    if max_size is not None and len(combined_dataset_df) > max_size:
        combined_dataset_df = combined_dataset_df.iloc[:max_size]

    return combined_dataset_df

In [15]:
# Augment the training split only, using the function's default of 8 copies per image.
combined_dataset_df = augment_images(train_dataset_df, transform)

In [20]:
def convert_labels(df):
    """Return a copy of ``df`` whose ``labels`` column holds float32 arrays.

    Each label is expected to be either the string form of a list of numbers
    (as read from the metadata CSV, e.g. ``"[0.0, 1.0, 1.0]"``) or an
    already-converted sequence/array. Strings are parsed with
    ``ast.literal_eval`` rather than ``eval`` so that file contents can never
    execute code. Accepting already-converted values makes the function safe to
    re-run on its own output.

    Args:
        df: DataFrame with a ``labels`` column.

    Returns:
        A new DataFrame (the input is not modified) in which every ``labels``
        entry is a ``numpy.ndarray`` of dtype ``float32``.

    Raises:
        ValueError, SyntaxError: If a string label is not a valid Python literal.
    """
    # Local import: the notebook's import cell does not provide `ast`.
    import ast

    def _to_float_array(value):
        # Parse the CSV string form; pass through lists/arrays unchanged.
        if isinstance(value, str):
            value = ast.literal_eval(value)
        return np.array(value, dtype=np.float32)

    return df.assign(labels=df['labels'].apply(_to_float_array))

In [21]:
# Parse the label column of every split into float32 arrays.
combined_dataset_df, valid_dataset_df, test_dataset_df = map(
    convert_labels,
    (combined_dataset_df, valid_dataset_df, test_dataset_df),
)

In [22]:
# Label order must match the order of the values in each label vector.
class_names = ['Crack', 'Red-Dots', 'Toothmark']
num_classes = len(class_names)

# Dataset schema: an image column plus a fixed-length float32 label vector.
features = Features({
    'image': Image(),
    'labels': Sequence(feature=Value('float32'), length=num_classes),
})

In [23]:
# Convert each pandas frame into a Dataset with the shared schema; the pandas
# index is dropped rather than stored as a column.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_frame, features=features, preserve_index=False)
    for split_frame in (combined_dataset_df, valid_dataset_df, test_dataset_df)
)

In [24]:
# Bundle the three splits under the names used when the dataset is published.
# The augmented training data is stored as 'train'; validation and test are
# not augmented.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [26]:
# Load the image processor that ships with the google/vit-base-patch16-384 checkpoint.
# The logged message below shows that the slow processor class is used unless
# `use_fast=True` is passed.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-384")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [27]:
def preprocess_function(examples):
    """Turn raw images and labels into model-ready tensors.

    Runs the notebook-level ``image_processor`` on ``examples['image']`` and
    converts ``examples['labels']`` to a float tensor.

    NOTE(review): this is applied via ``dataset_dict.map(preprocess_function)``
    without ``batched=True``, so ``examples`` is presumably a single row and
    ``pixel_values`` likely keeps a leading batch axis of size 1 — confirm that
    downstream consumers expect that shape.

    Args:
        examples: Mapping with the keys ``'image'`` and ``'labels'``.

    Returns:
        dict with ``'pixel_values'`` (processor output) and ``'labels'``
        (``torch.float`` tensor).
    """
    encoded = image_processor(examples['image'], return_tensors='pt')
    label_tensor = torch.tensor(examples['labels'], dtype=torch.float)
    return {'pixel_values': encoded['pixel_values'], 'labels': label_tensor}

In [28]:
# Apply preprocess_function to every split. This rebinds dataset_dict, so
# re-running the cell would process the already-processed splits again.
dataset_dict = dataset_dict.map(preprocess_function)

Map:   0%|          | 0/5222 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

In [29]:
# Publish all splits to the Hub as a private dataset repository.
# NOTE(review): presumably requires an authenticated Hugging Face session — confirm.
dataset_dict.push_to_hub("e1010101/tongue-images-384-segmented-augmented", private=True)

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/e1010101/tongue-images-384-segmented-augmented/commit/faef24304c30dfdc9875b16c5dcdaaa98a7eb18c', commit_message='Upload dataset', commit_description='', oid='faef24304c30dfdc9875b16c5dcdaaa98a7eb18c', pr_url=None, pr_revision=None, pr_num=None)