In [1]:
import numpy as np
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value, concatenate_datasets
import albumentations as A
from albumentations.pytorch import ToTensorV2
import PIL
import cv2

import os

  check_for_updates()


In [2]:
# Authenticate this session with the Hugging Face Hub; the call renders an
# interactive login widget in the notebook.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Root directory of the dataset. Later cells build paths by plain string
# concatenation (data_path + "train/_classes.csv", data_path + split + "/" + filename),
# so the value must keep its trailing "/".
data_path = "dataset_segmented/"

In [4]:
# Read the label table of every split (train / valid / test) in one pass.
train_df, valid_df, test_df = (
    pd.read_csv(data_path + classes_csv)
    for classes_csv in ("train/_classes.csv", "valid/_classes.csv", "test/_classes.csv")
)

In [5]:
# Inspect the training label table.
# NOTE(review): the saved output below already shows a `labels` column and no
# per-class columns, so it appears to have been captured after the in-place
# conversion cell ran -- confirm with a fresh Restart & Run All.
train_df

Unnamed: 0,filename,labels
0,image_00000.png,"[0.0, 0.0, 1.0]"
1,image_00001.png,"[1.0, 1.0, 1.0]"
2,image_00002.png,"[0.0, 1.0, 0.0]"
3,image_00003.png,"[0.0, 1.0, 0.0]"
4,image_00004.png,"[0.0, 1.0, 1.0]"
...,...,...
614,image_00614.png,"[1.0, 1.0, 1.0]"
615,image_00615.png,"[1.0, 1.0, 0.0]"
616,image_00616.png,"[1.0, 0.0, 1.0]"
617,image_00617.png,"[1.0, 1.0, 1.0]"


In [5]:
# Names of the per-class indicator columns in the label tables. The conversion
# cell reads these columns row by row, packs them into a single `labels` list
# (in this order) and then drops them.
label_columns = ['Crack', 'Red-Dots', 'Toothmark']

In [6]:
# Convert binary-encoded features to array of labels
def binary_to_labels(row, label_cols):
    """Gather the values of ``label_cols`` from ``row`` as a list of floats.

    Args:
        row: Any object indexable by column name (e.g. a DataFrame row or a dict).
        label_cols: Iterable of column names; the output follows this order.

    Returns:
        A list with ``float(row[name])`` for every name in ``label_cols``.
    """
    labels = []
    for column_name in label_cols:
        labels.append(float(row[column_name]))
    return labels

# Collapse the one-column-per-class indicators into a single `labels` list column.
# The frames are modified in place and the indicator columns are dropped, so a
# second run of this cell used to fail with KeyError. Skip frames that were
# already converted to keep the cell safe to re-run.
for df in [train_df, valid_df, test_df]:
    already_converted = 'labels' in df.columns and not any(
        col in df.columns for col in label_columns
    )
    if already_converted:
        continue
    df['labels'] = df.apply(lambda row: binary_to_labels(row, label_columns), axis=1)
    df.drop(columns=label_columns, inplace=True)

In [7]:
# Append image path to filename
def add_image_path(df, split):
    """Attach the on-disk image path to each row and keep only rows whose file exists.

    Note that the ``image`` column is written onto the passed-in ``df`` itself.

    Args:
        df: Label table with ``filename`` and ``labels`` columns.
        split: Sub-folder of ``data_path`` holding the images (e.g. ``"train"``).

    Returns:
        A frame with the ``image`` and ``labels`` columns, restricted to rows
        for which ``os.path.exists`` is true for the image path.
    """
    split_dir = data_path + split + "/"
    df['image'] = split_dir + df['filename']
    file_found = df['image'].apply(os.path.exists)
    present_rows = df[file_found]
    return present_rows[['image', 'labels']]

# Build the (image path, labels) table for each split.
train_dataset_df, valid_dataset_df, test_dataset_df = (
    add_image_path(split_df, split_name)
    for split_df, split_name in (
        (train_df, "train"),
        (valid_df, "valid"),
        (test_df, "test"),
    )
)

In [8]:
# Inspect the (image path, labels) table built for the training split.
# NOTE(review): the saved output below lists paths under "dataset_original/",
# while data_path is set to "dataset_segmented/" -- the output looks stale;
# re-run from a fresh kernel to confirm.
train_dataset_df

Unnamed: 0,image,labels
0,dataset_original/train/CC_211_jpg.rf.31153880e...,"[0.0, 0.0, 1.0]"
1,dataset_original/train/CC_1873_jpg.rf.30bd76e7...,"[1.0, 1.0, 1.0]"
2,dataset_original/train/CC_1584_jpg.rf.3074bed7...,"[0.0, 1.0, 0.0]"
3,dataset_original/train/CC_1130_jpg.rf.321c01c0...,"[0.0, 1.0, 0.0]"
4,dataset_original/train/CC_526_jpg.rf.33619b9cd...,"[0.0, 1.0, 1.0]"
...,...,...
741,dataset_original/train/CC_162_jpg.rf.fc096c7c2...,"[1.0, 0.0, 1.0]"
742,dataset_original/train/CC_1059_jpg.rf.fd96a222...,"[1.0, 1.0, 1.0]"
743,dataset_original/train/CC_1035_jpg.rf.fb05385f...,"[0.0, 1.0, 1.0]"
744,dataset_original/train/CC_1355_jpg.rf.ffb1d2f1...,"[0.0, 1.0, 1.0]"


In [9]:
class_names = ['Crack', 'Red-Dots', 'Toothmark']
# One label slot per class; derived so the count cannot drift from the names.
num_classes = len(class_names)

# Dataset schema: a decoded image plus a fixed-length float32 label vector.
label_vector = Sequence(feature=Value('float32'), length=num_classes)
features = Features({'image': Image(), 'labels': label_vector})

In [10]:
# Turn each split's table into a Dataset with the shared schema; the pandas
# index is not kept as a column.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_df, features=features, preserve_index=False)
    for split_df in (train_dataset_df, valid_dataset_df, test_dataset_df)
)

In [11]:
def rgb_to_lab(example):
    """Convert ``example['image']`` from 8-bit RGB to float32 CIELAB in standard ranges.

    Args:
        example: Mapping with an ``'image'`` entry holding either a PIL image
            or an array-like of 8-bit pixels shaped (H, W), (H, W, 1),
            (H, W, 3) or (H, W, 4).

    Returns:
        The same ``example`` object, with ``'image'`` replaced by a float32
        array of shape (H, W, 3): L in [0, 100], a and b in [-128, 127].

    Raises:
        ValueError: If the pixel data is not 8-bit or has an unsupported shape.

    NOTE(review): the result is a float array with negative values. When this
    function is applied with ``Dataset.map`` to a column typed ``Image()``,
    confirm that the column can actually store it.
    """
    image = example['image']

    # Let PIL normalise the mode when it can: this drops an alpha channel and
    # expands grayscale / palette images to three channels.
    if hasattr(image, 'convert'):
        image = image.convert('RGB')
    image_np = np.asarray(image)

    # The rescaling below undoes OpenCV's 8-bit LAB encoding, so it is only
    # valid for uint8 input; fail loudly instead of returning wrong values.
    if image_np.dtype != np.uint8:
        raise ValueError(
            f"rgb_to_lab expects 8-bit image data, got dtype {image_np.dtype}"
        )

    # Check the rank before looking at the channel axis: for a 2-D grayscale
    # array, shape[-1] is the image width, not a channel count.
    if image_np.ndim == 2:
        image_np = np.stack((image_np,) * 3, axis=-1)
    elif image_np.ndim == 3 and image_np.shape[-1] == 1:
        image_np = np.repeat(image_np, 3, axis=-1)
    elif image_np.ndim == 3 and image_np.shape[-1] == 4:
        # Remove alpha channel if present
        image_np = image_np[..., :3]

    if image_np.ndim != 3 or image_np.shape[-1] != 3:
        raise ValueError(
            f"rgb_to_lab cannot interpret an image of shape {image_np.shape}"
        )

    # Channel slicing returns a strided view; make it contiguous before
    # handing it to OpenCV.
    image_np = np.ascontiguousarray(image_np)

    # Direct RGB -> LAB; the intermediate BGR round trip is unnecessary.
    image_lab = cv2.cvtColor(image_np, cv2.COLOR_RGB2LAB).astype(np.float32)

    # Adjust the LAB channels to standard ranges
    # L channel [0, 100] (8-bit OpenCV stores L * 255 / 100)
    image_lab[..., 0] = image_lab[..., 0] * (100 / 255)
    # a and b channels [-128, 127] (8-bit OpenCV stores a + 128, b + 128)
    image_lab[..., 1:] = image_lab[..., 1:] - 128

    example['image'] = image_lab
    return example

In [None]:
# Produce a LAB-converted counterpart of every split.
train_dataset_lab, valid_dataset_lab, test_dataset_lab = (
    split_dataset.map(rgb_to_lab)
    for split_dataset in (train_dataset, valid_dataset, test_dataset)
)

Map:   0%|          | 0/746 [00:00<?, ? examples/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000023E566ED810>>
Traceback (most recent call last):
  File "c:\Users\ezrat\anaconda3\envs\Project\Lib\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ezrat\anaconda3\envs\Project\Lib\threading.py", line 1501, in enumerate
    def enumerate():
    
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000023E566ED810>>
Traceback (most recent call last):
  File "c:\Users\ezrat\anaconda3\envs\Project\Lib\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                              

In [None]:
# Stack each original split on top of its LAB-converted counterpart.
combined_train_dataset, combined_valid_dataset, combined_test_dataset = (
    concatenate_datasets([rgb_split, lab_split])
    for rgb_split, lab_split in (
        (train_dataset, train_dataset_lab),
        (valid_dataset, valid_dataset_lab),
        (test_dataset, test_dataset_lab),
    )
)