In [1]:
import os
import random
import cv2
from PIL import Image, ImageOps
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
# Input and output locations (relative paths)
data_root_directory = "../../pre_data/"
target_directory = "../data/"

# Source dataset folders keyed by dataset name. Every value keeps its trailing
# slash because later cells append sub-paths with plain string concatenation.
datasets = {
    dataset_name: data_root_directory + relative_path
    for dataset_name, relative_path in (
        ('wheat', "wheat/gwhd_2021/"),
        ('sunflower_pcvp', "sunflower/pcvp/"),
        ('sunflower_scvp', "sunflower/scvp/"),
        ('soy', "soy_picked/"),
    )
}

# Make sure every output folder exists before anything is written into it
for output_subdirectory in ("images", "labels", "predict"):
    os.makedirs(target_directory + output_subdirectory, exist_ok=True)

In [3]:
# Class name -> numeric class id written into the generated label files.
# NOTE(review): the "sunfolwer" key is a misspelling of "sunflower"; it is kept
# unchanged here because other code may look it up under this exact spelling.
classes = dict(
    barley=0,
    soy=1,
    sunfolwer=2,
    wheat=3,
)

In [4]:
# Function to copy and resize images
def copy_and_resize_image(source_file, destination_dir, new_name, new_size=None, format='JPEG'):
    """Re-save an image into ``destination_dir``, optionally resizing it.

    The image is transposed according to its EXIF orientation before saving.
    Failures while reading, converting or writing the image are reported with
    ``print`` and otherwise swallowed, so a single bad file does not abort a
    whole batch.

    Args:
        source_file: Path of the image to read.
        destination_dir: Folder to write into; created if it does not exist.
        new_name: File name (including extension) of the saved copy.
        new_size: Optional ``(width, height)`` to resize to; ``None`` keeps the
            original size.
        format: Pillow format name used for saving (default ``'JPEG'``).
    """
    os.makedirs(destination_dir, exist_ok=True)

    try:
        with Image.open(source_file) as img:
            fixed_image = ImageOps.exif_transpose(img)
            if new_size is not None:
                fixed_image = fixed_image.resize(new_size, Image.LANCZOS)

            # JPEG cannot store alpha or palette images; without this conversion
            # such inputs fail in save() and the copy is silently never written.
            if (
                isinstance(format, str)
                and format.upper() == "JPEG"
                and fixed_image.mode not in ("L", "RGB", "CMYK")
            ):
                fixed_image = fixed_image.convert("RGB")

            new_file_path = os.path.join(destination_dir, new_name)
            fixed_image.save(new_file_path, format=format)
    except Exception as e:
        print(f"Error converting {source_file}: {e}")


# Function to parse XML files
def parse_xml(file_path):
    """Read the bounding boxes stored in an annotation XML file.

    Every ``<object>`` element that has a ``<bndbox>`` child contributes one
    box built from its ``xmin``/``ymin``/``xmax``/``ymax`` values. Objects
    without a ``<bndbox>`` are skipped.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A list of ``(x_min, y_min, width, height)`` integer tuples, in document
        order.
    """
    def _as_int(text):
        # Coordinates can be written as "12" or as "12.0"/"12.7"; plain int()
        # rejects the latter, so fall back to truncating the float value.
        try:
            return int(text)
        except ValueError:
            return int(float(text))

    root = ET.parse(file_path).getroot()

    boxes = []
    for obj in root.findall('object'):
        bndbox = obj.find('bndbox')
        if bndbox is None:
            # Nothing to convert for an object that carries no box.
            continue

        x_min, y_min, x_max, y_max = (
            _as_int(bndbox.findtext(tag)) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        boxes.append((x_min, y_min, x_max - x_min, y_max - y_min))

    return boxes


# Function to adjust bounding boxes
def adjust_bboxes(bboxes, original_size, new_size):
    """Rescale ``(x_min, y_min, width, height)`` boxes to a resized image.

    Args:
        bboxes: Iterable of 4-item boxes ``(x_min, y_min, width, height)``.
        original_size: ``(width, height)`` the boxes currently refer to.
        new_size: ``(width, height)`` the boxes should refer to afterwards.

    Returns:
        A list of 4-tuples of ints; each value is scaled by the matching axis
        ratio and truncated with ``int``.
    """
    scale_y = new_size[1] / original_size[1]
    scale_x = new_size[0] / original_size[0]

    return [
        (
            int(left * scale_x),
            int(top * scale_y),
            int(box_width * scale_x),
            int(box_height * scale_y),
        )
        for left, top, box_width, box_height in bboxes
    ]


# Function to create the new dataset
def make_new_dataset(dataset, data_dict, file_counter, new_size=None):
    """Copy a dataset's images into the target folder and write their labels.

    Each image is saved as ``image_<n>.png`` under ``<target_directory>/images``
    and gets a one-row label file ``image_<n>.csv`` under
    ``<target_directory>/labels`` with the layout::

        image_<n>.png,<class>,x y w h;x y w h 

    (boxes are separated by ``;`` and the last box is followed by a space).

    Args:
        dataset: Root folder of the source dataset.
        data_dict: Dict with parallel lists under ``'image_names'``,
            ``'classes'`` and ``'bboxes'``.
        file_counter: Index to use for the first written file.
        new_size: Optional ``(width, height)``; when given, images are resized
            and boxes rescaled to match.

    Returns:
        The counter value following the last file that was written.
    """
    def find_image_path(dataset, image_name):
        # The source datasets keep their images in differently named folders.
        possible_dirs = ["images/", "train/images/", "test/images/", "valid/images/", ""]

        for directory in possible_dirs:
            source_file_path = os.path.join(dataset, directory, image_name)
            if os.path.exists(source_file_path):
                return source_file_path
        return None

    destination_directory = os.path.join(target_directory, "images")
    labels_directory = os.path.join(target_directory, "labels")
    os.makedirs(labels_directory, exist_ok=True)

    entries = zip(data_dict['image_names'], data_dict['classes'], data_dict['bboxes'])
    for image_name, class_id, bboxes in entries:
        source_file_path = find_image_path(dataset, image_name)
        if source_file_path is None:
            # Without this guard Image.open(None) below would abort the whole run.
            # The counter is not advanced, so output numbering stays contiguous.
            print(f"Image {image_name} not found under {dataset}; skipping.")
            continue

        new_file_name = "image_" + str(file_counter)

        # Resize image and adjust bounding boxes
        copy_and_resize_image(source_file_path, destination_directory, new_file_name + ".png", new_size, format='PNG')

        # Adjust bounding boxes
        with Image.open(source_file_path) as img:
            original_size = img.size

        new_bboxes = adjust_bboxes(bboxes, original_size, new_size) if new_size is not None else bboxes

        # Build the box field: values separated by spaces, boxes by ";".
        last_index = len(new_bboxes) - 1
        box_fields = []
        for index, bbox in enumerate(new_bboxes):
            for j, value in enumerate(bbox):
                separator = ";" if j == 3 and index != last_index else " "
                box_fields.append(str(value) + separator)

        # Save labels to CSV. Each label file belongs to exactly one image, so it
        # is overwritten ("w"); appending would duplicate the row on every re-run.
        label_path = os.path.join(labels_directory, new_file_name + ".csv")
        with open(label_path, "w") as f:
            f.write(new_file_name + ".png," + str(class_id) + "," + "".join(box_fields) + "\n")

        file_counter += 1

    return file_counter

# Function to pick random images from a list of lines
def pick_wheat_images(file, size):
    """Randomly pick ``size`` rows from a wheat annotation CSV.

    The first line of the file is treated as a header and skipped. Each
    remaining row is split on ``,``: column 0 is the image name and column 1
    holds ``;``-separated boxes written as ``x_min y_min x_max y_max``. Boxes
    are converted to ``[x_min, y_min, width, height]``. A row whose box column
    is missing or cannot be parsed as integers yields an empty box list.

    Args:
        file: Path of the CSV file.
        size: Number of rows to sample; ``random.sample`` raises ``ValueError``
            if it exceeds the number of available rows.

    Returns:
        Dict with parallel lists under ``'image_names'``, ``'classes'`` and
        ``'bboxes'``.
    """
    with open(file, 'r') as f:
        # Drop the header and ignore blank lines so they can never be sampled.
        lines = [line for line in f.readlines()[1:] if line.strip()]
    random_images = random.sample(lines, size)

    labels_dict = {'image_names': [], 'classes': [], 'bboxes': []}

    for image in random_images:
        info = image.strip().split(",")
        image_name = info[0]

        try:
            bboxes_split = [list(map(int, bbox.split())) for bbox in info[1].split(";")]
            for bbox in bboxes_split:
                # Corner format -> width/height format.
                bbox[2] = bbox[2] - bbox[0]
                bbox[3] = bbox[3] - bbox[1]
        except (ValueError, IndexError):
            # Non-numeric box text, too few numbers in a box, or no box column.
            bboxes_split = []

        labels_dict['image_names'].append(image_name)
        labels_dict['classes'].append(classes['wheat'])
        labels_dict['bboxes'].append(bboxes_split)

    return labels_dict

# Function to convert YOLO format to COCO format
def convert_yolov7_to_coco_boxes(label_file, image_width, image_height):
    """Convert a normalised YOLO label file into pixel ``(x, y, w, h)`` boxes.

    Each whitespace-separated row starts with a class id (ignored here):

    * 5 tokens: ``class cx cy w h`` - a centre-format box.
    * any other length: ``class x1 y1 x2 y2 ...`` - a point list, converted to
      the axis-aligned box enclosing all points.

    Rows with fewer than 3 tokens (blank lines, class-only rows) hold no
    complete coordinate pair and are skipped.

    Args:
        label_file: Path of the label text file.
        image_width: Image width in pixels, used to de-normalise x values.
        image_height: Image height in pixels, used to de-normalise y values.

    Returns:
        A list of ``(x_min, y_min, width, height)`` float tuples in pixels.
    """
    coco_boxes = []
    with open(label_file, 'r') as f:
        for line in f:
            data = line.strip().split()
            if len(data) < 3:
                # min()/max() over an empty coordinate list would raise and make
                # the caller discard every box of this image.
                continue

            if len(data) == 5:
                center_x, center_y = float(data[1]) * image_width, float(data[2]) * image_height
                width, height = float(data[3]) * image_width, float(data[4]) * image_height
                x = center_x - width / 2
                y = center_y - height / 2
                coco_boxes.append((x, y, width, height))
            else:
                points = [float(p) for p in data[1:]]
                x_coords, y_coords = points[0::2], points[1::2]
                x_min, y_min = min(x_coords) * image_width, min(y_coords) * image_height
                x_max, y_max = max(x_coords) * image_width, max(y_coords) * image_height
                coco_boxes.append((x_min, y_min, x_max - x_min, y_max - y_min))
    return coco_boxes



In [5]:
# Function to extract soy files
def extract_soy_files(directory_path):
    """Collect soy annotations from every ``.xml`` file in a folder.

    Each XML file is parsed with ``parse_xml``; the matching image name is the
    XML file name with its ``xml`` suffix replaced by ``jpg``.

    Args:
        directory_path: Folder containing the annotation XML files.

    Returns:
        Dict with parallel lists under ``'image_names'``, ``'classes'`` and
        ``'bboxes'``.
    """
    image_names, class_ids, all_bboxes = [], [], []

    xml_filenames = (entry for entry in os.listdir(directory_path) if entry.endswith('.xml'))
    for xml_filename in xml_filenames:
        parsed_boxes = parse_xml(os.path.join(directory_path, xml_filename))
        image_names.append(xml_filename[:-len("xml")] + "jpg")
        class_ids.append(classes['soy'])
        all_bboxes.append(parsed_boxes)

    return {'image_names': image_names, 'classes': class_ids, 'bboxes': all_bboxes}


# Function to extract sunflower files
def extract_sunflower_files(directory, sub_directory, pcvp=True, counter=None, new_size=None):
    """Collect sunflower annotations from one split of a YOLO-style dataset.

    For every label file in ``<directory>/<sub_directory>`` the matching image
    is ``<directory>/images/<label name with its last 3 chars replaced by jpg>``.

    * Empty label file: when ``pcvp`` is false the image is copied (as PNG) to
      ``<target_directory>/predict`` and ``counter[0]`` is incremented;
      otherwise the file is ignored.
    * Non-empty label file: when ``pcvp`` is true it is used only if the first
      character of its first line is ``"3"``; when ``pcvp`` is false it is
      always used. Used files have all their rows converted to pixel boxes.

    Any exception raised while handling a file is printed and that file is
    skipped.

    Args:
        directory: Split root containing the labels folder and ``images/``.
        sub_directory: Name of the labels folder inside ``directory``.
        pcvp: Selects the filtering behaviour described above.
        counter: One-element list holding the next "predict" file index; it is
            mutated in place and must be provided when ``pcvp`` is false.
        new_size: Optional ``(width, height)`` applied to copied predict images.

    Returns:
        Dict with parallel lists under ``'image_names'``, ``'classes'`` and
        ``'bboxes'``.
    """
    labels_dict = {
        'image_names': [],
        'classes': [],
        'bboxes': []
    }

    labels_directory = os.path.join(directory, sub_directory)
    # Joined rather than concatenated so it works with or without a trailing
    # slash on ``directory``.
    images_directory = os.path.join(directory, "images")

    for filename in os.listdir(labels_directory):
        file = os.path.join(labels_directory, filename)
        image_name = filename[:-3] + "jpg"
        image_file = os.path.join(images_directory, image_name)
        try:
            # Read the label rows up front so the handle is closed before any
            # image work starts.
            with open(file, 'r') as f:
                contents = f.readlines()

            if not contents:
                # File is empty: the image has no annotations.
                if not pcvp:
                    new_file_name = "image_" + str(counter[0])
                    destination_directory = os.path.join(target_directory, "predict")
                    copy_and_resize_image(image_file, destination_directory, new_file_name + ".png", new_size, format='PNG')
                    counter[0] += 1
                continue

            if pcvp and contents[0][0] != "3":
                # Only files whose first row starts with "3" are kept in pcvp mode.
                continue

            image = cv2.imread(image_file)
            if image is None:
                print(f"Image {image_file} not found or unable to read.")
                continue

            # The pixel size is needed to de-normalise the label coordinates.
            height, width = image.shape[:2]
            bboxes = convert_yolov7_to_coco_boxes(file, width, height)

            labels_dict['image_names'].append(image_name)
            # 2 is the sunflower id in the module-level ``classes`` mapping.
            labels_dict['classes'].append(2)
            labels_dict['bboxes'].append(bboxes)
        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue

    return labels_dict

In [10]:
# yolov7 input image size (left commented out, so no resizing is requested here)
# new_size = (448, 448)

# dataset_files_counter: next index for files written to the dataset (plain int,
#   reassigned from make_new_dataset's return value).
# predict_files_counter: next index for files written to "predict"; wrapped in a
#   list so extract_sunflower_files can increment it in place.
dataset_files_counter, predict_files_counter = 0, [0]

In [11]:
# Process wheat dataset
# pick_wheat_images samples rows with the global `random` generator; seeding it
# here makes a fresh "Restart & Run All" pick the same wheat images every time.
WHEAT_SAMPLING_SEED = 42
random.seed(WHEAT_SAMPLING_SEED)

wheat_train_images_file = os.path.join(datasets['wheat'], "competition_train.csv")
wheat_test_images_file = os.path.join(datasets['wheat'], "competition_test.csv")
wheat_val_images_file = os.path.join(datasets['wheat'], "competition_val.csv")

# 300 sampled images from the train list
wheat_train_dict = pick_wheat_images(wheat_train_images_file, 300)
dataset_files_counter = make_new_dataset(datasets['wheat'], wheat_train_dict, dataset_files_counter)
print(f"Wheat train dataset processed. Total files: {dataset_files_counter}")

# 100 sampled images from the validation list
wheat_val_dict = pick_wheat_images(wheat_val_images_file, 100)
dataset_files_counter = make_new_dataset(datasets['wheat'], wheat_val_dict, dataset_files_counter)
print(f"Wheat validation dataset processed. Total files: {dataset_files_counter}")

# 100 sampled images from the test list
wheat_test_dict = pick_wheat_images(wheat_test_images_file, 100)
dataset_files_counter = make_new_dataset(datasets['wheat'], wheat_test_dict, dataset_files_counter)
print(f"Wheat test dataset processed. Total files: {dataset_files_counter}")

Wheat train dataset processed. Total files: 300
Wheat validation dataset processed. Total files: 400
Wheat test dataset processed. Total files: 500


In [12]:
# Process soy dataset
# The soy annotations are XML files stored in the dataset's "labels" folder.
soy_labels_directory = os.path.join(datasets['soy'], "labels")
soy_data_dict = extract_soy_files(soy_labels_directory)
dataset_files_counter = make_new_dataset(
    datasets['soy'],
    soy_data_dict,
    dataset_files_counter,
)
print(f"Soy dataset processed. Total files: {dataset_files_counter}")

Soy dataset processed. Total files: 606


In [13]:
# Process sunflower datasets
# Parse the three PCVP splits in train -> test -> valid order.
pcvp_train_parsed, pcvp_test_parsed, pcvp_val_parsed = (
    extract_sunflower_files(datasets['sunflower_pcvp'] + pcvp_split, "labels", pcvp=True, counter=predict_files_counter)
    for pcvp_split in ("train/", "test/", "valid/")
)

# Fold the test and validation entries into the train dict (in place), keeping
# the three parallel lists aligned.
for pcvp_extra_split in (pcvp_test_parsed, pcvp_val_parsed):
    for pcvp_label_key in ('image_names', 'classes', 'bboxes'):
        pcvp_train_parsed[pcvp_label_key].extend(pcvp_extra_split[pcvp_label_key])

sunflower_pcvp_dict = pcvp_train_parsed

dataset_files_counter = make_new_dataset(
    datasets['sunflower_pcvp'],
    sunflower_pcvp_dict,
    dataset_files_counter,
)
print(f"Sunflower PCVP dataset processed. Total files: {dataset_files_counter}")

Sunflower PCVP dataset processed. Total files: 804


In [14]:
# Parse the two SCVP splits in train -> test order. With pcvp=False, images whose
# label file is empty are copied to the "predict" folder as a side effect.
scvp_train_parsed, scvp_test_parsed = (
    extract_sunflower_files(datasets['sunflower_scvp'] + scvp_split, "labels", pcvp=False, counter=predict_files_counter)
    for scvp_split in ("train/", "test/")
)

# Fold the test entries into the train dict (in place), keeping the three
# parallel lists aligned.
for scvp_label_key in ('image_names', 'classes', 'bboxes'):
    scvp_train_parsed[scvp_label_key].extend(scvp_test_parsed[scvp_label_key])

sunflower_scvp_dict = scvp_train_parsed
dataset_files_counter = make_new_dataset(
    datasets['sunflower_scvp'],
    sunflower_scvp_dict,
    dataset_files_counter,
)

print(f"Sunflower SCVP dataset processed. Total files: {dataset_files_counter}")
print(f"Sunflower SCVP predict files: {predict_files_counter[0]}")

Sunflower SCVP dataset processed. Total files: 921
Sunflower SCVP predict files: 643
