In [1]:
import os
import random
import shutil

# Specify the folder paths
source_folder = 'CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Training'
destination_folder = 'CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Training_Sampled'

# Sampling configuration
MAX_IMAGES_PER_CLASS = 5000  # cap applied to every class; smaller classes are kept whole
RANDOM_SEED = 42  # fixed so that re-running the cell selects the same images
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def list_image_files(folder, extensions=IMAGE_EXTENSIONS):
    """Return the sorted names of the image files located directly in ``folder``.

    Names starting with '._' (macOS metadata files) and entries that are not
    regular files are left out. The result is sorted because ``os.listdir``
    reports entries in arbitrary order, which would otherwise make a seeded
    sample differ between runs or machines.

    Raises:
        FileNotFoundError: if ``folder`` does not exist (raised by ``os.listdir``).
    """
    return sorted(
        name for name in os.listdir(folder)
        if name.endswith(extensions)
        and not name.startswith('._')
        and os.path.isfile(os.path.join(folder, name))
    )


def group_by_class(filenames):
    """Group file names by the class label embedded in each name.

    The label is the third dot-separated field of the text that follows the
    last '-' in the name, e.g. 'ImageButton' for
    '12-android.widget.ImageButton.png'.
    NOTE(review): the naming scheme is inferred from the parsing expression;
    for a name such as '12.android.widget.ImageButton.png' (no '-') the label
    would be 'widget' -- confirm against the real data.

    Args:
        filenames: iterable of file names (not paths).

    Returns:
        A ``(class_dict, unlabeled)`` tuple: ``class_dict`` maps each label to
        the list of file names carrying it (input order preserved) and
        ``unlabeled`` lists the names that have no usable label field.
    """
    class_dict = {}
    unlabeled = []
    for filename in filenames:
        fields = filename.split('-')[-1].split('.')
        # The last field is the file extension, so a real label at index 2
        # requires at least four fields; shorter names are reported, not guessed.
        if len(fields) < 4:
            unlabeled.append(filename)
            continue
        class_dict.setdefault(fields[2], []).append(filename)
    return class_dict, unlabeled


def select_per_class(class_dict, max_per_class=MAX_IMAGES_PER_CLASS, rng=None):
    """Pick at most ``max_per_class`` file names from every class and shuffle them.

    Args:
        class_dict: mapping of class label to list of file names.
        max_per_class: upper bound on the number of names taken from one class;
            classes with fewer names contribute all of them.
        rng: object providing ``sample`` and ``shuffle`` (e.g. ``random.Random(seed)``).
            Defaults to the global ``random`` module, i.e. unseeded behaviour.

    Returns:
        A new shuffled list with the selected names; ``class_dict`` is not modified.
    """
    rng = random if rng is None else rng
    selected = []
    for images in class_dict.values():
        if len(images) > max_per_class:
            selected.extend(rng.sample(images, max_per_class))
        else:
            selected.extend(images)
    rng.shuffle(selected)
    return selected


# Run the sampling only when the cell itself is executed, not when this code is imported.
if __name__ == "__main__":
    # The source is listed before the destination is created, so a wrong
    # source path fails without leaving an empty destination folder behind.
    image_files = list_image_files(source_folder)

    class_dict, unlabeled_files = group_by_class(image_files)
    if unlabeled_files:
        print(f"Skipped {len(unlabeled_files)} file(s) without a class label in their name.")

    # Seeded generator + sorted listing => the same files are chosen on every run,
    # so re-running the cell does not grow the sample beyond the per-class cap.
    # NOTE: files left in the destination by earlier, unseeded runs are not removed.
    selected_images = select_per_class(class_dict, MAX_IMAGES_PER_CLASS, random.Random(RANDOM_SEED))

    # Copy the selected and shuffled images to the destination folder
    os.makedirs(destination_folder, exist_ok=True)
    for img in selected_images:
        src_path = os.path.join(source_folder, img)
        dst_path = os.path.join(destination_folder, img)
        shutil.copy(src_path, dst_path)  # Use shutil.move(src_path, dst_path) if you want to move instead of copy

    # Print the total number of selected images
    print(f"Total number of selected images: {len(selected_images)}")
    print(f"Images have been shuffled and copied to {destination_folder}.")


Total number of selected images: 30518
Images have been shuffled and copied to CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Training_Sampled.


In [3]:
import os
import random
import shutil

# Specify the folder paths
source_folder = 'CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Test'
destination_folder = 'CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Test_Sampled'


def move_images(source, destination, extensions=('.png', '.jpg', '.jpeg')):
    """Move every image located directly in ``source`` into ``destination``.

    No sampling takes place: all names ending in one of ``extensions`` are
    moved, except those starting with '._' (macOS metadata files).

    Args:
        source: folder to take the images from.
        destination: folder to move them to; created if it does not exist.
        extensions: tuple of accepted file-name suffixes (case-sensitive).

    Returns:
        The sorted list of file names that were moved. Running the function
        again on the same folders therefore returns an empty list.

    Raises:
        FileNotFoundError: if ``source`` does not exist (raised by ``os.listdir``).
    """
    # List the source first so that a wrong source path fails before the
    # destination folder is created.
    image_names = sorted(
        name for name in os.listdir(source)
        if name.endswith(extensions) and not name.startswith('._')
    )

    # exist_ok avoids the separate existence check and its race window
    os.makedirs(destination, exist_ok=True)

    for name in image_names:
        shutil.move(os.path.join(source, name), os.path.join(destination, name))
    return image_names


# Run the move only when the cell itself is executed, not when this code is imported.
if __name__ == "__main__":
    image_files = move_images(source_folder, destination_folder)

    # Print the total number of moved images
    print(f"Total number of images moved: {len(image_files)}")
    print(f"Images have been moved to {destination_folder}.")


Total number of images moved: 19086
Images have been moved to CNN-Data-Final/CNN-Evaluation/Partitioned-Organic-Data-Split/Test_Sampled.
