In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

In [2]:
# Source folder (one sub-folder per building) and destination root for the splits
data_dir, dataset_dir = "./data", "./dataset"

In [3]:
# Create the top-level train/val/test folders under the dataset root.
# exist_ok=True keeps this cell safe to re-run.
for split_name in ('train', 'val', 'test'):
    os.makedirs(os.path.join(dataset_dir, split_name), exist_ok=True)

In [4]:
# Per-split bookkeeping: split_counts[split][building_name] -> number of images.
# Each split gets its own (initially empty) dict.
split_counts = {split_name: {} for split_name in ('train', 'val', 'test')}

In [5]:
# Split every building folder in data_dir into train/val/test (70/15/15),
# copy the files into dataset_dir/<split>/<building_name>/ and record the
# per-split counts in split_counts.
for building_name in os.listdir(data_dir):
    building_path = os.path.join(data_dir, building_name)

    # Only sub-directories are treated as buildings; skip stray files
    if not os.path.isdir(building_path):
        continue

    # List the folder once. os.listdir() returns entries in arbitrary order,
    # so sort them: the fixed random_state below only yields the same split on
    # every run/machine if the input order is stable. (Destination folders are
    # never cleared, so an unstable split could leave one image in two splits
    # after a re-run.)
    image_files = sorted(
        f for f in os.listdir(building_path)
        if os.path.isfile(os.path.join(building_path, f))
    )

    # Count the total number of images before dataset creation
    total_images_before = len(image_files)
    print(f"Building '{building_name}' - Before dataset creation: {total_images_before} images")

    # Split images into train, val, and test sets (70/15/15).
    # train_test_split raises ValueError when a folder holds too few files for
    # every split to be non-empty; skip that folder instead of aborting the
    # whole run. The split is computed before any directory is created so a
    # skipped building leaves no empty class folders behind.
    try:
        train_files, temp_files = train_test_split(image_files, test_size=0.3, random_state=42)
        val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)  # 0.5 * 0.3 = 0.15
    except ValueError as err:
        print(f"Building '{building_name}' - Skipped, could not split: {err}")
        continue

    files_by_split = {'train': train_files, 'val': val_files, 'test': test_files}

    # Copy (not move) files into the appropriate directories; the originals in
    # data_dir are left untouched.
    for split, files in files_by_split.items():
        split_dir = os.path.join(dataset_dir, split, building_name)
        os.makedirs(split_dir, exist_ok=True)
        for file in files:
            shutil.copy(os.path.join(building_path, file), os.path.join(split_dir, file))

    # Update the counts only after every copy succeeded, so split_counts never
    # holds a building in one split but not the others.
    for split, files in files_by_split.items():
        split_counts[split][building_name] = len(files)

    # Total assigned across the three splits (derived from the split lists,
    # not from a re-scan of the destination folders)
    total_images_after = len(train_files) + len(val_files) + len(test_files)
    print(f"Building '{building_name}' - After dataset creation: {total_images_after} images")


Building 'McCool Hall' - Before dataset creation: 1344 images
Building 'McCool Hall' - After dataset creation: 1344 images
Building 'Simrall Hall' - Before dataset creation: 1132 images
Building 'Simrall Hall' - After dataset creation: 1132 images
Building 'Student Union' - Before dataset creation: 1296 images
Building 'Student Union' - After dataset creation: 1296 images
Building 'Swalm Hall' - Before dataset creation: 1349 images
Building 'Swalm Hall' - After dataset creation: 1349 images
Building 'Walker Hall' - Before dataset creation: 1251 images
Building 'Walker Hall' - After dataset creation: 1251 images
Building 'Carpenter Hall' - Before dataset creation: 1177 images
Building 'Carpenter Hall' - After dataset creation: 1177 images
Building 'Butler Hall' - Before dataset creation: 1167 images
Building 'Butler Hall' - After dataset creation: 1167 images
Building 'McCain Hall' - Before dataset creation: 1277 images
Building 'McCain Hall' - After dataset creation: 1277 images
Buildi

In [6]:
# Report, for every split, how many images each building contributed
split_order = ('train', 'val', 'test')

print("\nNumber of images in each split by building:")
for split in split_order:
    counts_for_split = split_counts[split]
    print(f"\n{split.capitalize()} split:")
    for building_name, count in counts_for_split.items():
        print(f"  {building_name}: {count} images")

# Report the combined train + val + test count per building.
# Buildings are taken from the train split's keys.
print("\nTotal number of images per building after dataset creation:")
for building_name in split_counts['train']:
    total_images = sum(split_counts[name][building_name] for name in split_order)
    print(f"  {building_name}: {total_images} total images")

print("\nDataset creation complete.")


Number of images in each split by building:

Train split:
  McCool Hall: 940 images
  Simrall Hall: 792 images
  Student Union: 907 images
  Swalm Hall: 944 images
  Walker Hall: 875 images
  Carpenter Hall: 823 images
  Butler Hall: 816 images
  McCain Hall: 893 images
  Lee Hall: 877 images
  Old Main: 936 images

Val split:
  McCool Hall: 202 images
  Simrall Hall: 170 images
  Student Union: 194 images
  Swalm Hall: 202 images
  Walker Hall: 188 images
  Carpenter Hall: 177 images
  Butler Hall: 175 images
  McCain Hall: 192 images
  Lee Hall: 188 images
  Old Main: 201 images

Test split:
  McCool Hall: 202 images
  Simrall Hall: 170 images
  Student Union: 195 images
  Swalm Hall: 203 images
  Walker Hall: 188 images
  Carpenter Hall: 177 images
  Butler Hall: 176 images
  McCain Hall: 192 images
  Lee Hall: 188 images
  Old Main: 201 images

Total number of images per building after dataset creation:
  McCool Hall: 1344 total images
  Simrall Hall: 1132 total images
  Student U