# Preprocess Labelled Image Data for use in YOLO Model

This notebook contains a collection of functions/scripts to preprocess labelled images of target host trees for object detection.
Included are functions to generate summaries of bounding boxes/ remove images with large bounding boxes (i.e. those for classification, not detection)



In [1]:

# Import Libraries
import tqdm
from glob import glob
import datetime
import numpy as np
import pandas as pd
import csv
import os
import io
import shutil
import random

# Image Libraries
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.image import imread
from IPython.display import clear_output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Process and modify class labels in .txt files

In YOLO models, all labels must match a corresponding image and should start from an index of 0.

In one case, the "tree" class has the label 3.

All instances of the "Ailanthus" should have the label 1.

For positive Ailanthus labels, change the class value in each txt label file to the class index 1

In [2]:
# Count all unique class indices in the dataset (first value in labels.txt files)

# Path to the directory containing .txt files
#txt_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_2200ailanthus_10000trees_images_labels_dec23/all_labels'

txt_directory = r'C:\Users\talake2\Desktop\auto_arborist_cvpr2022_v015\yolov5\datasets\autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324\all_labels'

# Dictionary to store counts of each class
class_counts = {}

# Iterate through .txt files and modify the content
for filename in os.listdir(txt_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_directory, filename)
        with open(txt_file_path, 'r') as file:
            # Read the content of the file
            lines = file.readlines()
            #print(lines)
        
        all_class_indices = []
        for line in lines:
            values = line.split(' ') # split values by space delimiter
            class_index = values[0]

            # Count occurrences of each class
            if class_index in class_counts:
                class_counts[class_index] += 1
            else:
                class_counts[class_index] = 1

# Print unique class indices and their counts
print(f'Class Indices in Labels: {np.unique(list(class_counts.keys()))}')
print(f'Number of Labels per Class: {class_counts}')

Class Indices in Labels: ['\n' '0' '1']
Number of Labels per Class: {'0': 22156, '\n': 29479, '1': 7323}


In [None]:
# Remove unwanted labels and label files
# Here, we remove any lines that do not start with class 0 (Trees) and class 3 (Ailanthus)

txt_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_images_labels_dec23/labels'

# Iterate through .txt files and modify the content
for filename in os.listdir(txt_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_directory, filename)
        with open(txt_file_path, 'r') as file:
            # Read the content of the file
            lines = file.readlines()

        modified_lines = []  # Modify the first value of each line
        for line in lines:
            values = line.split()  # Split values by space delimiter

            # Check if the line starts with '0', keep it; otherwise, remove it
            if values[0] == '0':
                modified_lines.append(line)
            if values[0] == '3':
                modified_lines.append(line)

        # Write the modified content back to the file
        with open(txt_file_path, 'w') as file:
            file.writelines(modified_lines)

        print(f"Modified {filename}")


In [None]:
# Modify Class Labels for Trees to begin with 0 and Ailanthus to begin with index 1

# Example Label: Ailanthus_altissima_112403.txt 1 0.497737556561086 0.4819004524886878 0.9864253393665159 0.9547511312217195

# Path to the directory containing .txt files
txt_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/googlestreetview_ailanthus_images_labels_dec23/labels'

# Iterate through .txt files and modify the content
for filename in os.listdir(txt_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_directory, filename)
        with open(txt_file_path, 'r') as file:
            # Read the content of the file
            lines = file.readlines()
            #print(lines)

        modified_lines = []  # Modify the first value of each line
        for line in lines:
            values = line.split()  # Split values by space delimiter
            #print(values[0])
            if values[0] == '0':  # Check if the line contains the class label index specific to the speciesChange the first class index value to '1'
                #print("Tree")
                values[0] = '1' # Change the label index to the correct index
                modified_lines.append(' '.join(values) + os.linesep) #append each line with a Unix '/n' newline LF character
            elif values[0] == '1':
                values[0] = '0'
                #print("Ailanthus")
                modified_lines.append(' '.join(values) + os.linesep) #append each line with a Unix '/n' newline LF character
                

        #print(modified_lines)
        # Write the modified content back to the file
        with open(txt_file_path, 'w') as file:
            file.writelines(modified_lines)

        print(f"Modified {filename}")




# Get Images from Labels by Filename

In LabelStudio, export labels as the "YOLO" Format. 
A .TXT format is created for each image file. Each txt file contains annotations for the corresponding image file, that is object class, object coordinates, height & width.

As the .TXT format label files are named identically to the .JPG images, search through an image directory and pull the images to a new folder.


In [None]:

# Path to the directory containing .txt files
# This is the export directory from LabelStudio, often named 'project-number-at-year-month-day.zip'
labels_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_images_labels_dec23/labels'

# Network drive or local directory containing images to search for using labels
network_drive = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/auto_arborist_jpegs/jpegs_streetlevel_genus_idx_label/ailanthus/images'

# Destination directory to copy the images
destination_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_images_labels_dec23/images'

# Iterate through .txt files and copy corresponding images
for filename in os.listdir(labels_directory):
    if filename.endswith('.txt'):
        # Extract the image file name without extension from the .txt file
        image_filename = os.path.splitext(filename)[0] + '.jpeg'
        
        # Check if the corresponding image file exists in the network drive
        source_image_path = os.path.join(network_drive, image_filename)
        if os.path.exists(source_image_path):
            # Copy the image file to the destination directory
            destination_image_path = os.path.join(destination_directory, image_filename)
            shutil.copy2(source_image_path, destination_image_path)
            print(f"Copied: {image_filename}")

print("Copying completed.")





# Resize image dimension 

In [None]:
# Resize iNaturalist images to match dimensions of autoarborist dataset
# Input iNaturalist image size: XXX x XXX pixels
# Target image size: 768 x 1152 pixels

import cv2

# Path to the directory containing .jpg images
image_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_images_labels_dec23/test/images/'

txt_directory = r'C:/Users/talake2/Desktop/ailanthus_labelstudio_nov1323/labels'

# New dimensions for the resized image
new_width = 768
new_height = 768

# Iterate through .jpg files and their corresponding .txt files
for filename in os.listdir(image_directory):
    if filename.endswith('.jpeg'):
        # Load the image
        image_path = os.path.join(image_directory, filename)
        img = cv2.imread(image_path)
        
        # Resize the image
        resized_img = cv2.resize(img, (new_width, new_height))
    
        # Save resized image
        cv2.imwrite(image_path, resized_img)
        
        print(f"Resized and saved: {filename}")

print("Resizing completed.")






# Subset Autoarborist Tree Images and Labels for Training and Testing

Randomly select Images of Trees from Autoarborist and Ailanthus (From Autoarborist) Combined into Training and Testing Directories

Images are randomly shuffled and sampled to avoid duplicating images in both training and testing directories.

Files are copied from their respective source directories to a common target directory containing the training and testing data.



In [4]:


# Subset Autoarborist Tree Images and Labels for Training and Testing

# Define whether to copy images and labels to the training or testing directory, and how many to copy
num_files_per_source = 7174 #total number of files in /images or /labels

# Directory Structure
# YOLO Model Datasets Directory -> 'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/'
# Create Directory for Your Dataset -> 'autoarborist_ailanthus_images_nov2723/'
# Create Training, Testing, and (optionally: Validation) Directories -> test/ train/
# Within Each TTV Directory, Create /images (.jpegs) and /labels (.txt) to train the YOLO model


# Source 1: Trees Data
# Autoarborist Trees Images Directory (All Autoarborist tree images)
autoarb_image_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_full/train/images/'
# Autoarborist Trees Labels Directory (All Autoarborist tree labels)
autoarb_label_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_full/train/labels/'


# Source 2: Ailanthus Data
# Autoarborist Ailanthus Images Directory
autoarb_ailanthus_image_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/all_images/'
# Autoarborist Ailanthus Labels Directory
autoarb_ailanthus_label_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/all_labels/'


# Target: YoloV5 Datasets with Trees and Ailanthus Data
#### Training Directories
# Target image directory
yolo_training_image_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/train/images/'
# Target label directory
yolo_training_label_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/train/labels/'


#### Testing Directories
# Target image directory
yolo_testing_image_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/test/images/'
# Target label directory
yolo_testing_label_directory = f'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_googlestreetview_images_labels_7300ailanthus_22000trees_jan324/test/labels/'


# Get a list of all image files in the source directories
trees_image_files = [filename for filename in os.listdir(autoarb_image_directory) if filename.endswith((".jpg", ".jpeg"))]
ailanthus_image_files = [filename for filename in os.listdir(autoarb_ailanthus_image_directory) if filename.endswith((".jpg", ".jpeg"))]

# Shuffle the lists of image files
random.shuffle(trees_image_files)
random.shuffle(ailanthus_image_files)

# Sample an equal number of images from each source directory
selected_trees_images = random.sample(trees_image_files, num_files_per_source)
selected_ailanthus_images = random.sample(ailanthus_image_files, num_files_per_source)

# Split the selected images into training and testing sets
split_index = int(0.8 * num_files_per_source)  # 80% for training, 20% for testing
trees_training_images = selected_trees_images[:split_index]
trees_testing_images = selected_trees_images[split_index:]

ailanthus_training_images = selected_ailanthus_images[:split_index]
ailanthus_testing_images = selected_ailanthus_images[split_index:]

def copy_files(source_image_directory, source_label_directory, target_image_directory, target_label_directory, files):
    for file in files:
        # Extract the file name without extension to find the corresponding label file
        file_name_without_extension, _ = os.path.splitext(file)
        label_file = file_name_without_extension + ".txt"

        # Create source and target file paths
        source_image_path = os.path.join(source_image_directory, file)
        source_label_path = os.path.join(source_label_directory, label_file)
        target_image_path = os.path.join(target_image_directory, file)
        target_label_path = os.path.join(target_label_directory, label_file)

        #print(source_image_path)
        #print(source_label_path)
        #print(target_image_path)
        #print(target_label_path)
        
        # Check if the label file exists before copying
        if os.path.exists(source_label_path):
            if os.path.exists(source_image_path):
                #print("Path Exists, Copying Data")
                # Copy matching image and label files to the target directories
                shutil.copy2(source_image_path, target_image_path)
                shutil.copy2(source_label_path, target_label_path)
        else:
            pass
            #print(f"Label file not found for {file}. Skipping.")


# Training Data
# Copy trees training images and labels
#copy_files(autoarb_image_directory, autoarb_label_directory, yolo_training_image_directory, yolo_training_label_directory, trees_training_images)
#print(f"Copying {len(trees_training_images)} training images and labels of Trees: complete.")

# Copy ailanthus training images and labels
copy_files(autoarb_ailanthus_image_directory, autoarb_ailanthus_label_directory, yolo_training_image_directory, yolo_training_label_directory, ailanthus_training_images)
print(f"Copying {len(ailanthus_training_images)} training images and labels of Ailanthus: complete.")

# Testing Data
# Copy trees training images and labels
#copy_files(autoarb_image_directory, autoarb_label_directory, yolo_testing_image_directory, yolo_testing_label_directory, trees_testing_images)
#print(f"Copying {len(trees_testing_images)} testing images and labels of Trees: complete.")

# Copy aialanthus training images and labels
copy_files(autoarb_ailanthus_image_directory, autoarb_ailanthus_label_directory, yolo_testing_image_directory, yolo_testing_label_directory, ailanthus_testing_images)
print(f"Copying {len(ailanthus_testing_images)} testing images and labels of Ailanthus: complete.")



Copying 5739 training images and labels of Ailanthus: complete.
Copying 1435 testing images and labels of Ailanthus: complete.


In [None]:
print(len(trees_training_images))
print(len(trees_testing_images))
print(len(ailanthus_training_images))
print(len(ailanthus_testing_images))

# Experiments with Curriculum Learning

Non-randomly select Images of Trees from Autoarborist and Ailanthus (From Autoarborist) Combined into Training and Testing Directories

From available genera of trees - some genera are likely to be "easier" or "harder" to classify/misclassify as Ailanthus.

Images are randomly shuffled and sampled to avoid duplicating images in both training and testing directories.

Files are copied from their respective source directories to a common target directory containing the training and testing data.

In [None]:
# Select Specific Trees Genera From Autoarborist to Include For Training based on Difficulty of Sample (Experiment: Easy vs Hard Samples)

# 'Hard' Genera are those that are often misclassified as Ailanthus, or those with prior knowledge to be Ailanthus 'lookalikes'

# 'Easy' Genera are those that are not frequently misclassified as Ailanthus, or those with prior knowledge to not be visually similar to Ailanthus

# Ailanthus is included as an 'easy' example as it should always be included during training

genera_list = [
    'melia', 'cassia', 'jacaranda', 'cupaniopsis', 'erythrina', 'albizia', 'ceiba', 'harpephyllum',
    'schefflera', 'catalpa', 'viburnum', 'juglans', 'styphnolobium', 'koelreuteria', 'sassafras',
    'pterocarya', 'calodendrum', 'paulownia', 'ficus', 'heteromeles', 'pittosporum', 'hymenosporum',
    'castanea', 'lyonothamnus', 'feijoa', 'robinia', 'sambucus', 'gymnocladus', 'eriobotrya', 'persea',
    'maclura', 'parkinsonia', 'broussonetia', 'livistona', 'handroanthus', 'maackia', 'casimiroa',
    'lophostemon', 'psidium', 'phellodendron', 'tristania', 'grevillea', 'strelitzia', 'tabebuia',
    'sophora', 'syzygium', 'eucalyptus', 'tsuga', 'elaeagnus', 'dracaena', 'bauhinia', 'rhus', 'platanus',
    'corymbia', 'aesculus', 'carya', 'libocedrus', 'punica', 'morus', 'musa', 'xylosma', 'cotoneaster',
    'dodonaea', 'yucca', 'umbellularia', 'ilex', 'cordyline', 'magnolia', 'acacia', 'sapium', 'euonymus',
    'leucaena', 'pistacia', 'nerium', 'brachychiton', 'diospyros', 'chionanthus', 'tipuana', 'ceratonia',
    'juniperus', 'rhamnus', 'citrus', 'hibiscus', 'corylus', 'thuja', 'salix', 'callistemon', 'tristaniopsis',
    'chamaerops', 'stenocarpus', 'myoporum', 'chamaecyparis', 'araucaria', 'taxus', 'arbutus', 'gleditsia',
    'camellia', 'halesia', 'archontophoenix', 'celtis', 'alnus', 'cupressus', 'olea', 'populus', 'cornus',
    'agonis', 'laburnum', 'melaleuca', 'cladrastis', 'ostrya', 'cotinus', 'cedrus', 'cocos', 'prunus',
    'cercis', 'sorbus', 'cycas', 'taxodium', 'eugenia', 'ceanothus', 'sequoia', 'calocedrus', 'syringa',
    'schinus', 'abies', 'lagunaria', 'ligustrum', 'pseudotsuga', 'ginkgo', 'podocarpus', 'rhaphiolepis',
    'picea', 'platycladus', 'angophora', 'rhododendron', 'leptospermum', 'hamamelis', 'casuarina', 'eucommia',
    'acer', 'maytenus', 'pyrus', 'solanum', 'triadica', 'malus', 'photinia', 'zelkova', 'metasequoia',
    'lagerstroemia', 'trachycarpus', 'geijera', 'liriodendron', 'fagus', 'liquidambar', 'butia', 'betula',
    'quercus', 'afrocarpus', 'ulmus', 'cinnamomum', 'sequoiadendron', 'fraxinus', 'pyracantha', 'pinus',
    'chilopsis', 'ziziphus', 'laurus', 'washingtonia', 'jubaea', 'cryptomeria', 'amelanchier', 'phoenix',
    'escallonia', 'crataegus', 'carpinus', 'nothofagus', 'nyssa', 'stewartia', 'arctostaphylos', 'euphorbia',
    'metrosideros', 'tamarix', 'syagrus', 'brahea', 'larix', 'clerodendrum', 'oxydendrum', 'cercidiphyllum',
    'dypsis', 'caragana', 'styrax', 'tilia', 'davidia', 'parrotia'
]

data_list = [
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard', 'Hard',
    'Hard', 'Hard', 'Hard', 'Hard', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy',
    'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy', 'Easy'
]

# Combine genera and criteria list into dict
genera_difficulty_dict = dict(zip(genera_list, data_list))




In [None]:
genera_difficulty_dict

In [None]:

# Set the path to the root directory containing genera of trees with images and labels
root_dir = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/jpegs_genus_idx_label'

# Set the path to the target directory for easy genera
# This directory contains images and labels of tree genera that are "easy" to classify
target_easy_training_dir = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_3600imgstrain_5ktrees_easy_hard_images_labels_dec1123/train_easy'
target_easy_testing_dir = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_3600imgstrain_5ktrees_easy_hard_images_labels_dec1123/test_easy'

# Set the path to the target directory for hard genera
# This directory contains images and labels of tree genera that are both "easy" and "hard" to classify
target_hard_training_dir = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_3600imgstrain_5ktrees_easy_hard_images_labels_dec1123/train_easy_hard'
target_hard_testing_dir = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist_ailanthus_3600imgstrain_5ktrees_easy_hard_images_labels_dec1123/test_easy_hard'

# Define the ratio of samples for testing
test_ratio = 0.2  # 20% of the samples will be used for testing

# Define the maximum number of samples to copy to the training/testing directories per genera
max_samples_per_genera = 10

# Iterate through genera directories to sample N genera per directory for training/testing
for genus, difficulty in genera_difficulty_dict.items():
    
    print(f'{genus} is {difficulty}')
    
    # Setup directories to each genera
    genera_dir_images = os.path.join(root_dir, genus, 'images') # Directory containing genera images
    genera_dir_labels = os.path.join(root_dir, genus, 'labels') # Directory containing genera labels
    
    # Create the target directories for training and testing images and labels, with both "easy" and "hard" genera cases
    target_easy_training_images_dir = os.path.join(target_easy_training_dir, 'images')
    target_easy_training_labels_dir = os.path.join(target_easy_training_dir, 'labels')
    target_easy_testing_images_dir = os.path.join(target_easy_testing_dir, 'images')
    target_easy_testing_labels_dir = os.path.join(target_easy_testing_dir, 'labels')
    
    target_hard_training_images_dir = os.path.join(target_hard_training_dir, 'images')
    target_hard_training_labels_dir = os.path.join(target_hard_training_dir, 'labels')
    target_hard_testing_images_dir = os.path.join(target_hard_testing_dir, 'images')
    target_hard_testing_labels_dir = os.path.join(target_hard_testing_dir, 'labels')
    
    os.makedirs(target_easy_training_images_dir, exist_ok=True)
    os.makedirs(target_easy_training_labels_dir, exist_ok=True)
    os.makedirs(target_easy_testing_images_dir, exist_ok=True)
    os.makedirs(target_easy_testing_labels_dir, exist_ok=True)
    
    os.makedirs(target_hard_training_images_dir, exist_ok=True)
    os.makedirs(target_hard_training_labels_dir, exist_ok=True)
    os.makedirs(target_hard_testing_images_dir, exist_ok=True)
    os.makedirs(target_hard_testing_labels_dir, exist_ok=True)
    
    # List all label files in one genera directory
    label_files = [f for f in os.listdir(genera_dir_labels) if f.endswith('.txt')]
    
    # Shuffle the list to randomly select samples for testing
    random.shuffle(label_files)
    
    # Calculate the number of samples for testing based on the ratio
    num_testing_samples = int(len(label_files) * test_ratio)
    
    # Split the samples into training and testing sets
    training_files = label_files[num_testing_samples:]
    testing_files = label_files[:num_testing_samples]
    
    print(f'Total Training Files: {len(training_files)}')
    print(f'Total Testing Files: {len(testing_files)}')
    
    # Initialize counters for each genera
    easy_training_counter = hard_training_counter = 0
    
    # Iterate through training files and copy images and labels to training directories
    for filename in training_files: # filename is the label .txt file
        image_filename = os.path.splitext(filename)[0] + '.jpeg'
        
        # Build source and target paths for images and labels
        image_source_path = os.path.join(genera_dir_images, image_filename) #the full path to the image
        label_source_path = os.path.join(genera_dir_labels, filename) #the full path to the label
        
        # Cases where the genera is 'hard' to classify, move to the 'hard' directory
        if difficulty == 'Hard':
            # Check if the maximum number of samples has been reached for this genera
            if hard_training_counter <= max_samples_per_genera:
                # For Hard Samples, copy to only the hard directory
                _ = shutil.copy2(image_source_path, target_hard_training_images_dir)
                _ = shutil.copy2(label_source_path, target_hard_training_labels_dir)
                hard_training_counter += 1
        else:
            # Check if the maximum number of samples has been reached for this genera
            if easy_training_counter <= max_samples_per_genera:
                # For Easy samples, copy to both easy and hard training directories
                _ = shutil.copy2(image_source_path, target_easy_training_images_dir)
                _ = shutil.copy2(label_source_path, target_easy_training_labels_dir)
                _ = shutil.copy2(image_source_path, target_hard_training_images_dir)
                _ = shutil.copy2(label_source_path, target_hard_training_labels_dir)
                easy_training_counter += 1
        
    print(f'Copied Training {hard_training_counter} samples to Hard, {easy_training_counter} samples to Easy and Hard for {genus}')
        
    # Initialize counters for each genera
    easy_training_counter = hard_training_counter = 0
    
    # Iterate through testing files and copy images and labels to testing directories
    for filename in testing_files:
        image_filename = os.path.splitext(filename)[0] + '.jpeg'
        
        # Build source and target paths for images and labels
        image_source_path = os.path.join(genera_dir_images, image_filename)
        label_source_path = os.path.join(genera_dir_labels, filename)
        
        if difficulty == 'Hard':
            # Check if the maximum number of samples has been reached for this genera
            if hard_training_counter <= max_samples_per_genera:
                # For Hard samples, copy only to hard training directories
                _ = shutil.copy2(image_source_path, target_hard_testing_images_dir)
                _ = shutil.copy2(label_source_path, target_hard_testing_labels_dir)
                hard_training_counter += 1
        else:
            # Check if the maximum number of samples has been reached for this genera
            if easy_training_counter <= max_samples_per_genera:
                # For Easy samples, copy to both easy and hard training directories
                _ = shutil.copy2(image_source_path, target_easy_testing_images_dir)
                _ = shutil.copy2(label_source_path, target_easy_testing_labels_dir)
                _ = shutil.copy2(image_source_path, target_hard_testing_images_dir)
                _ = shutil.copy2(label_source_path, target_hard_testing_labels_dir)
                easy_training_counter += 1
              
    print(f'Copied Testing {hard_training_counter} samples to Hard, {easy_training_counter} samples to Easy and Hard for {genus}')
          
print("Images and labels copied successfully.")
    



In [None]:
# 

# Rename/Shuffle Image/Label Filenames


In [None]:
import os
import random
import string

def generate_random_name(length=12):
    """Generate a random alphanumeric name."""
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

def rename_files(image_directory, label_directory):
    # Iterate through .txt files in the label directory
    for filename in os.listdir(label_directory):
        if filename.endswith('.txt'):
            label_path = os.path.join(label_directory, filename)

            # Construct the path for the corresponding image file
            image_filename, _ = os.path.splitext(filename)
            image_path = os.path.join(image_directory, image_filename + '.jpeg')

            # Generate a unique random name for both image and label files
            random_name = generate_random_name()
            file_extension = os.path.splitext(image_path)[1]

            # Construct the new paths
            new_image_path = os.path.join(image_directory, random_name + file_extension)
            new_label_path = os.path.join(label_directory, random_name + '.txt')

            # Rename the image file
            os.rename(image_path, new_image_path)
            print(f'Renamed: {image_path} to {new_image_path}')

            # Rename the label file
            os.rename(label_path, new_label_path)
            print(f'Renamed: {label_path} to {new_label_path}')

# Example usage
images_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/inaturalist_ailanthus_positives_nov1323/test/images'
labels_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/inaturalist_ailanthus_positives_nov1323/test/labels'
rename_files(images_directory, labels_directory)



# Sort Labels by Bounding Box Size


In [None]:
# Summarize distribution of bounding boxes dataset


# Path to the directory containing .txt files
#txt_directory = r'C:/Users/talake2/Desktop/ailanthus_labelstudio_nov1323/labels/'
txt_directory = r'C:/Users/talake2/Desktop/auto_arborist_cvpr2022_v015/yolov5/datasets/autoarborist/test/labels/'

# Counters for boxes with width and height over 0.9
count_width_over_0_8 = 0
count_height_over_0_8 = 0

# Iterate through .txt files and modify the content
for filename in os.listdir(txt_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_directory, filename)
        with open(txt_file_path, 'r') as file:
            # Read the content of the file
            lines = file.readlines()
            
            # Iterate through lines in the file
            for line in lines:
                values = line.split()
                
                # Get bounding box data from each box
                obj_class = int(values[0])  # Assuming class is an integer
                x_value = float(values[1])
                y_value = float(values[2])
                w_value = float(values[3])
                h_value = float(values[4])
                
                # Check conditions for width and height
                if w_value > 0.8:
                    count_width_over_0_8 += 1
                
                if h_value > 0.8:
                    count_height_over_0_8 += 1
                    

# Print the results
print(f"Number of boxes with width over 0.8: {count_width_over_0_8}")
print(f"Number of boxes with height over 0.8: {count_height_over_0_8}")
            

            



In [None]:
# Remove labels for images with bounding boxes > 0.95 width and height

# Path to the directory containing .txt files
txt_directory = r'C:/Users/talake2/Desktop/ailanthus_labelstudio_nov1323/labels/'

# Temporary list to store files to be removed
files_to_remove = []

# Iterate through .txt files and identify files to be removed
for filename in os.listdir(txt_directory):
    if filename.endswith('.txt'):
        txt_file_path = os.path.join(txt_directory, filename)
        with open(txt_file_path, 'r') as file:
            # Read the content of the file
            lines = file.readlines()
            
            # Check conditions for width and height
            for line in lines:
                values = line.split()
                w_value = float(values[3])
                h_value = float(values[4])
                
                if w_value > 0.5: #if bounding box is very wide, add as candidate to remove
                    files_to_remove.append(txt_file_path)
                
                if h_value > 0.5: #if bounding box is very tall, add as candidate to remove
                    files_to_remove.append(txt_file_path)
                    break




In [None]:
# Remove the identified files

for file_to_remove in files_to_remove:
    os.remove(file_to_remove)

# Print the removed files
print(f"Removed {len(files_to_remove)} files.")