In [None]:
import os
import glob
import random
import shutil
from PIL import Image
import numpy as np
import torch
from torchvision import models, transforms
from torch.utils.data import DataLoader




# class Hotdog_NotHotdog(torch.utils.data.Dataset):
#     def __init__(self, train=True, transform=None, data_path='/content/drive/MyDrive/datasets/hotdog_nothotdog', train_ratio=0.8):
#         'Initialization'
#         self.transform = transform

#         # Load all images from both train and test directories
#         all_image_paths = glob.glob(os.path.join(data_path, '*/*/*.jpg'))

#         # Split the images into classes
#         image_classes = {os.path.split(os.path.split(path)[0])[
#             1]: [] for path in all_image_paths}
#         for path in all_image_paths:
#             class_name = os.path.split(os.path.split(path)[0])[1]
#             image_classes[class_name].append(path)

#         # Shuffle and split the dataset based on the specified ratio
#         self.image_paths = []
#         for class_name, paths in image_classes.items():
#             random.shuffle(paths)  # Shuffle the images for randomness
#             split_index = int(len(paths) * train_ratio)
#             if train:
#                 self.image_paths.extend(paths[:split_index])  # Training data
#             else:
#                 self.image_paths.extend(paths[split_index:])  # Testing data

#         self.name_to_label = {c: id for id,
#                               c in enumerate(image_classes.keys())}

#     def __len__(self):
#         'Returns the total number of samples'
#         return len(self.image_paths)

#     def __getitem__(self, idx):
#         'Generates one sample of data'
#         image_path = self.image_paths[idx]

#         image = Image.open(image_path)
#         c = os.path.split(os.path.split(image_path)[0])[1]
#         y = self.name_to_label[c]
#         X = self.transform(image)
#         return X, y


class Hotdog_NotHotdog(torch.utils.data.Dataset):
    """Image-folder dataset laid out as ``<data_path>/<train|test>/<class>/*.jpg``.

    Class names are the sub-directory names of the chosen split; labels are
    their indices in alphabetical order.
    """

    def __init__(self, train, transform=None, data_path='./data/hotdog_nothotdog',
                 return_image_only=False):
        """Index the image files of one split.

        Args:
            train: If true, read the ``train`` sub-directory, else ``test``.
            transform: Optional callable applied to each RGB PIL image.
                ``None`` leaves the PIL image untouched.
            data_path: Root directory containing the ``train``/``test`` folders.
            return_image_only: If true, ``__getitem__`` returns only the image
                (no label), which is convenient for computing image statistics.
        """
        self.transform = transform
        self.return_image_only = return_image_only
        split_dir = os.path.join(data_path, 'train' if train else 'test')
        class_names = sorted(
            os.path.basename(d)
            for d in glob.glob(os.path.join(split_dir, '*'))
            if os.path.isdir(d)
        )
        self.name_to_label = {
            name: label for label, name in enumerate(class_names)}
        # glob order is filesystem dependent; sort so indices are reproducible.
        self.image_paths = sorted(
            glob.glob(os.path.join(split_dir, '*', '*.jpg')))

    def __len__(self):
        """Return the total number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)``, or just ``image`` when the dataset was built
            with ``return_image_only=True``.
        """
        image_path = self.image_paths[idx]

        # convert() forces the pixel data to be read, so the file can be
        # closed right away; it also guarantees three channels for grayscale
        # or CMYK JPEGs.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
        if self.return_image_only:
            return image

        class_name = os.path.basename(os.path.dirname(image_path))
        return image, self.name_to_label[class_name]


def compute_pca_from_dataset(dataset):
    """Compute the PCA of RGB pixel colours over a collection of images.

    The 3x3 colour covariance is accumulated image by image, so the pixels of
    the whole dataset never have to be held in memory at once.

    Args:
        dataset: Iterable of images. Each item may be a PIL image (converted
            to RGB), an array-like whose values reshape to ``(-1, 3)``
            (channel-last RGB), or an ``(image, label)`` tuple, in which case
            the label is ignored. ``uint8`` pixel data is rescaled to
            ``[0, 1]`` so the eigenvalues are on the scale used by
            ``apply_pca_color_jitter``; other dtypes are used as given.

    Returns:
        ``(eigenvalues, eigenvectors)`` of the colour covariance matrix as
        produced by ``np.linalg.eigh``: real eigenvalues in ascending order
        and the matching unit eigenvectors as columns.

    Raises:
        ValueError: If the dataset provides fewer than two pixels.
    """
    count = 0
    mean = np.zeros(3)
    scatter = np.zeros((3, 3))  # running sum of centred outer products

    for item in dataset:
        # Accept (image, label) samples; a scalar second element marks a label.
        if isinstance(item, tuple) and len(item) == 2 and np.ndim(item[1]) == 0:
            item = item[0]

        # PIL images expose convert(); force three channels before reshaping.
        convert = getattr(item, "convert", None)
        if callable(convert):
            item = convert("RGB")

        raw = np.asarray(item)
        pixels = raw.reshape(-1, 3).astype(np.float64)
        if raw.dtype == np.uint8:
            pixels /= 255.0

        n_new = pixels.shape[0]
        if n_new == 0:
            continue

        # Merge this image's statistics into the running ones (pairwise
        # mean/covariance update), which stays accurate without a second pass.
        chunk_mean = pixels.mean(axis=0)
        centred = pixels - chunk_mean
        delta = chunk_mean - mean
        total = count + n_new
        scatter += centred.T @ centred + \
            np.outer(delta, delta) * (count * n_new / total)
        mean += delta * (n_new / total)
        count = total

    if count < 2:
        raise ValueError(
            "compute_pca_from_dataset needs at least two pixels to estimate "
            "a covariance matrix")

    cov_matrix = scatter / (count - 1)  # unbiased estimate, like np.cov
    # The covariance matrix is symmetric, so eigh is the appropriate solver
    # and guarantees real-valued output.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    return eigenvalues, eigenvectors


def apply_pca_color_jitter(image, eigenvalues, eigenvectors, std=0.1):
    """Add a random colour shift along the PCA axes of the RGB distribution.

    One Gaussian coefficient is drawn per principal component; the shift
    ``eigenvectors @ (alpha * eigenvalues)`` is added to every pixel.

    Args:
        image: PIL image (converted to RGB) or an ``(H, W, 3)`` array with
            values in ``[0, 255]``.
        eigenvalues: The three eigenvalues of the colour covariance. The
            image is scaled to ``[0, 1]`` before the shift is added, so the
            eigenvalues must be expressed on that scale too.
        eigenvectors: 3x3 matrix with the matching eigenvectors as columns.
        std: Standard deviation of the Gaussian coefficients.

    Returns:
        A new RGB ``PIL.Image`` with the jitter applied.
    """
    if isinstance(image, Image.Image):
        # Guarantee three channels so the 3-vector shift broadcasts correctly.
        image = image.convert("RGB")

    # np.linalg.eig may hand back complex dtypes; only the real part is
    # meaningful for a symmetric covariance matrix.
    eigenvalues = np.real(np.asarray(eigenvalues))
    eigenvectors = np.real(np.asarray(eigenvectors))

    # Draw random variables from a Gaussian, one per principal component
    alpha = np.random.normal(0, std, 3)
    jitter = np.dot(eigenvectors, alpha * eigenvalues)

    img_array = np.asarray(image, dtype=np.float64) / 255.0  # scale to [0, 1]
    img_jittered = np.clip(img_array + jitter, 0, 1)

    # Round instead of truncating: a plain uint8 cast would darken pixels
    # whose value lands just below an integer after the float round trip.
    return Image.fromarray(np.rint(img_jittered * 255).astype(np.uint8))


# Size of the thumbnails used to estimate colour statistics for PCA jitter.
_PCA_THUMBNAIL_SIZE = (64, 64)


def _ensure_rgb(img):
    """Return *img* as a three-channel RGB PIL image."""
    return img.convert("RGB")


def _pca_thumbnail(img):
    """Return a small RGB float array in [0, 1] for colour statistics."""
    small = img.convert("RGB").resize(_PCA_THUMBNAIL_SIZE)
    return np.asarray(small, dtype=np.float64) / 255.0


class _RandomRescale:
    """Resize a PIL image to a random width, keeping its aspect ratio.

    Implemented as a class rather than a lambda so the transform can be
    pickled when DataLoader workers are started with the "spawn" method.
    """

    def __init__(self, min_scale, max_scale):
        self.min_scale = min_scale
        self.max_scale = max_scale

    def __call__(self, img):
        width, height = img.size
        # A single random draw drives both sides so the image is not distorted.
        new_width = random.randint(self.min_scale, self.max_scale)
        new_height = max(1, round(height * new_width / width))
        return img.resize((new_width, new_height))


class _PCAColorJitter:
    """Picklable wrapper that applies ``apply_pca_color_jitter`` to an image."""

    def __init__(self, eigenvalues, eigenvectors):
        self.eigenvalues = eigenvalues
        self.eigenvectors = eigenvectors

    def __call__(self, img):
        return apply_pca_color_jitter(img, self.eigenvalues, self.eigenvectors)


def load_and_transform_dataset(image_resize: int = 224,
                               batch_size: int = 64,
                               train_ratio: float = 0.8,
                               recalculate_normalization: bool = False,
                               data_path='/content/drive/MyDrive/datasets/hotdog_nothotdog',
                               use_pca_jitter: bool = False,
                               num_workers: int = 3):
    """Build the train/test datasets and their DataLoaders.

    Training images are randomly rescaled, optionally colour-jittered,
    resized to 256x256, randomly flipped and rotated, converted to tensors
    and normalized. Test images are resized to 256x256, converted to tensors
    and normalized with the same statistics as the training images.

    Args:
        image_resize: Currently unused; the output size is fixed at 256x256.
        batch_size: Batch size for both loaders.
        train_ratio: Currently unused; the split comes from the ``train`` and
            ``test`` sub-directories of ``data_path``.
        recalculate_normalization: Currently unused; fixed mean/std values
            are applied.
        data_path: Root directory of the dataset.
        use_pca_jitter: If true, estimate the colour PCA of the training set
            (on downsized copies, to keep the pixel matrix small) and add PCA
            colour jitter to the training transform. Disabled by default
            because it requires a full pass over the training images.
        num_workers: Number of worker processes for each DataLoader.

    Returns:
        ``(trainset, testset, train_loader, test_loader)``.
    """
    min_scale = 256
    max_scale = 480
    output_size = (256, 256)

    normalize = transforms.Normalize(
        mean=[0.5244, 0.4443, 0.3621],
        std=[0.2679, 0.2620, 0.2733],
    )

    train_steps = [_ensure_rgb, _RandomRescale(min_scale, max_scale)]

    if use_pca_jitter:
        stats_set = Hotdog_NotHotdog(
            train=True, transform=_pca_thumbnail, data_path=data_path)
        # Each sample is (pixels, label); only the pixels matter here. They
        # are already scaled to [0, 1], the scale the jitter is applied on.
        eigenvalues, eigenvectors = compute_pca_from_dataset(
            stats_set[i][0] for i in range(len(stats_set)))
        train_steps.append(_PCAColorJitter(eigenvalues, eigenvectors))

    train_steps += [
        transforms.Resize(output_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        normalize,
    ]
    train_transform = transforms.Compose(train_steps)

    test_transform = transforms.Compose([
        _ensure_rgb,
        transforms.Resize(output_size),
        transforms.ToTensor(),
        # Evaluation inputs must be normalized exactly like training inputs.
        normalize,
    ])

    trainset = Hotdog_NotHotdog(
        train=True, transform=train_transform, data_path=data_path)
    testset = Hotdog_NotHotdog(
        train=False, transform=test_transform, data_path=data_path)
    train_loader = DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_loader = DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return trainset, testset, train_loader, test_loader
