In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

def load_sign_language_dataset(dataset_path, image_size=(64, 64)):
    """Load a folder-per-label image dataset into NumPy arrays.

    The dataset root is expected to contain one subfolder per label
    (e.g. ``A``, ``B``, ...); every readable image inside a subfolder is
    resized and tagged with that subfolder's name. Entries in the root
    that are not directories are ignored.

    Args:
        dataset_path: Path to the dataset root directory.
        image_size: Target size passed to ``cv2.resize`` as its ``dsize``
            argument. Defaults to ``(64, 64)``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays with one entry per
        successfully loaded image, or ``(None, None)`` when
        ``dataset_path`` is not an existing directory. When no image could
        be loaded, both arrays are empty.
    """
    # Bail out early so callers can distinguish "bad path" from "no images".
    if not os.path.isdir(dataset_path):
        print(f"Error: Dataset path '{dataset_path}' not found.")
        return None, None

    images = []
    labels = []

    print("Loading dataset...")
    # Each subfolder name is used verbatim as the label of its images.
    for label_folder in sorted(os.listdir(dataset_path)):
        subfolder_path = os.path.join(dataset_path, label_folder)

        # Stray files next to the label folders are not part of the dataset.
        if not os.path.isdir(subfolder_path):
            continue

        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order. Sorting keeps the sample order stable across runs and
        # machines, so a seeded train/test split stays reproducible.
        for image_filename in sorted(os.listdir(subfolder_path)):
            image_path = os.path.join(subfolder_path, image_filename)

            try:
                # Pass cv2.IMREAD_GRAYSCALE as a second argument here to
                # load single-channel images instead.
                image = cv2.imread(image_path)

                if image is None:
                    # imread signals an unreadable file by returning None.
                    print(f"Warning: Could not read image {image_path}. Skipping.")
                    continue

                # A uniform size is required to stack the images into one array.
                image = cv2.resize(image, image_size)
            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the load.
                print(f"Error processing image {image_path}: {e}")
                continue

            images.append(image)
            labels.append(label_folder)

    print(f"Dataset loaded successfully. Found {len(images)} images.")

    # Convert lists to numpy arrays for efficient processing
    return np.array(images), np.array(labels)


# --- Example Usage ---
if __name__ == '__main__':
    # ==================================================================
    # Root folder of the dataset: one subfolder per sign label.
    # The raw-string prefix (r"...") keeps the Windows backslashes literal.
    DATASET_PATH = r"D:\LTIMintree_project\Dataset"
    # ==================================================================

    # Load the data; (None, None) means the dataset path does not exist.
    X, y = load_sign_language_dataset(DATASET_PATH)

    if X is not None and y is not None:
        print("\n--- Data Summary ---")
        print(f"Shape of images array (X): {X.shape}")
        print(f"Shape of labels array (y): {y.shape}")

        # Print some sample labels to verify
        print(f"\nFirst 5 labels: {y[:5]}")
        print(f"Last 5 labels: {y[-5:]}")

        if len(X) < 2:
            # train_test_split cannot produce two non-empty parts from
            # fewer than two samples, so report it instead of crashing.
            print("\nNot enough images to split into training and testing sets.")
        else:
            print("\nSplitting data into training and testing sets...")

            try:
                # Prefer a stratified split so every label keeps roughly the
                # same proportion in the training and testing sets.
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2, random_state=42, stratify=y
                )
            except ValueError:
                # Stratification is impossible when some label has too few
                # images (e.g. a subfolder with a single image). Fall back to
                # a plain random split, which may be slightly unbalanced.
                print("Note: stratified split not possible; using a plain random split.")
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2, random_state=42
                )

            print(f"Training data shape: {X_train.shape}")
            print(f"Testing data shape: {X_test.shape}")
            print(f"Training labels shape: {y_train.shape}")
            print(f"Testing labels shape: {y_test.shape}")



Loading dataset...
Dataset loaded successfully. Found 20084 images.

--- Data Summary ---
Shape of images array (X): (20084, 64, 64, 3)
Shape of labels array (y): (20084,)

First 5 labels: ['A' 'A' 'A' 'A' 'A']
Last 5 labels: ['Z' 'Z' 'Z' 'Z' 'Z']

Splitting data into training and testing sets...
Training data shape: (16067, 64, 64, 3)
Testing data shape: (4017, 64, 64, 3)
Training labels shape: (16067,)
Testing labels shape: (4017,)
