In [1]:
import os, sys
from pathlib import Path
sys.path.append(str(Path().resolve().parents[2]))
import numpy as np
import torch
from PIL import Image as PILImage
from datasets import Dataset, load_from_disk
from tqdm.auto import tqdm
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms
from src.chatbot import pathtree
from dprep import build_dataset

In [2]:
# Root directory for dataset artifacts, looked up under the "dataset" key of the
# project's path tree (presumably a filesystem path string — confirm in src.chatbot).
save_dir = pathtree().get("dataset")

# Build the raw dataset from the "ltdataset" folder inside that root.
ltdataset_path = os.path.join(save_dir, "ltdataset")
dataset = build_dataset(ltdataset_path)

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def pre_data(examples, target_size=(600, 600)):
    """Convert a batch of images into fixed-size RGB float tensors.

    Intended for use with a batched ``Dataset.map`` call.

    Args:
        examples: Batch dict with parallel ``"image"`` and ``"label"`` lists.
            Each image may be a PIL image, a numpy array, or ``None``/any other
            type (which is replaced by an all-black placeholder).
        target_size: ``(width, height)`` passed to PIL's ``resize``.

    Returns:
        Dict with ``"image"``: a list of CPU tensors of shape
        ``(3, height, width)`` and ``"label"``: the labels, unchanged and in
        the same order.

    Notes:
        Processing is best-effort: any failure on a sample is reported and the
        sample is replaced by a zero tensor so the batch keeps its length.
    """
    # Imported locally so the function stays self-contained when it is shipped
    # to worker processes (it is mapped with num_proc > 1).
    import torch

    # PIL's resize takes (width, height), while tensors are laid out as
    # (channels, height, width); name both so every shape below agrees.
    width, height = target_size[0], target_size[1]

    results = {"image": [], "label": []}
    for i, (image, label) in enumerate(zip(examples["image"], examples["label"])):
        try:
            if image is None or not isinstance(image, (PILImage.Image, np.ndarray)):
                print(f"Sample {i}: Invalid input {type(image)}, using default image")
                image = np.zeros((height, width, 3), dtype=np.uint8)
            if isinstance(image, np.ndarray):
                # The PIL module is imported as ``PILImage`` in this notebook.
                image = PILImage.fromarray(image)

            image = image.convert('RGB').resize((width, height))
            # Tensors are kept on the CPU: the mapped output is stored by the
            # dataset, so a GPU round-trip here would only add overhead.
            image_tensor = transforms.ToTensor()(image)
        except Exception as e:
            print(f"Error at sample {i}: {e}")
            # Same (C, H, W) layout as a successfully processed sample.
            image_tensor = torch.zeros(3, height, width)

        results["image"].append(image_tensor)
        results["label"].append(label)
    return results

# Directory holding the cached, already-preprocessed splits.
pre_dir = os.path.join(save_dir, "preprocessed")
os.makedirs(pre_dir, exist_ok=True)

train_cache = os.path.join(pre_dir, "train_preprocessed")
test_cache = os.path.join(pre_dir, "test_preprocessed")
cached_entries = os.listdir(pre_dir)

if "train_preprocessed" in cached_entries and "test_preprocessed" in cached_entries:
    # Both splits were preprocessed on an earlier run: reload them.
    train_data = load_from_disk(train_cache)
    test_data = load_from_disk(test_cache)
else:
    # At least one split is missing: preprocess both and persist the results.
    map_options = {"batched": True, "batch_size": 64, "num_proc": 4}
    train_data = dataset['train'].map(pre_data, **map_options)
    test_data = dataset['test'].map(pre_data, **map_options)
    train_data.save_to_disk(train_cache)
    test_data.save_to_disk(test_cache)

Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
# Build (or reload) dense image/label tensors for both splits, then wrap them
# in DataLoaders.
tensor_files = ["train_images.pt", "train_labels.pt", "test_images.pt", "test_labels.pt"]


def _split_to_tensors(split):
    """Stack one preprocessed split into an ``(images, labels)`` tensor pair."""
    # Rows are walked once, for the images only; values that are not already
    # tensors are converted with torch.as_tensor.
    images = torch.stack([
        row["image"] if isinstance(row["image"], torch.Tensor) else torch.as_tensor(row["image"])
        for row in split
    ])
    # Labels are read as a whole column so the image payloads are not decoded
    # a second time just to collect the labels.
    labels = torch.tensor(list(split["label"]), dtype=torch.long)
    return images, labels


# List the cache directory once instead of once per expected file name.
existing_files = set(os.listdir(pre_dir))

if not all(fname in existing_files for fname in tensor_files):
    train_images, train_labels = _split_to_tensors(train_data)
    test_images, test_labels = _split_to_tensors(test_data)

    print("Train images shape:", train_images.shape, "dtype:", train_images.dtype)

    # Save tensors using torch.save
    torch.save(train_images, os.path.join(pre_dir, 'train_images.pt'))
    torch.save(train_labels, os.path.join(pre_dir, 'train_labels.pt'))
    torch.save(test_images, os.path.join(pre_dir, 'test_images.pt'))
    torch.save(test_labels, os.path.join(pre_dir, 'test_labels.pt'))
else:
    # Load tensors using torch.load
    # NOTE(review): torch.load unpickles its input; only point it at cache
    # files produced by this notebook.
    train_images = torch.load(os.path.join(pre_dir, 'train_images.pt'))
    train_labels = torch.load(os.path.join(pre_dir, 'train_labels.pt'))
    test_images = torch.load(os.path.join(pre_dir, 'test_images.pt'))
    test_labels = torch.load(os.path.join(pre_dir, 'test_labels.pt'))
    print("Loaded train images shape:", train_images.shape, "dtype:", train_images.dtype)

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(train_images, train_labels)
test_dataset = TensorDataset(test_images, test_labels)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000021FDC5B9590>>
Traceback (most recent call last):
  File "c:\Users\trtie\anaconda3\envs\Astorine\Lib\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\trtie\anaconda3\envs\Astorine\Lib\threading.py", line 1501, in enumerate
    def enumerate():
    
KeyboardInterrupt: 
