In [2]:
import wandb

In [3]:
# Start a Weights & Biases run under the "AgroSkyAI" project and keep a handle
# to the run's config object for storing hyperparameters.
wandb.init(project="AgroSkyAI")
config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mobidarefolu[0m ([33mfoluobidare[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [1]:
import random
import os
import cv2
from PIL import Image

In [2]:
def copy_image(source_file, destination_dir, new_name="", format='JPEG'):
    """
    Converts an image to the specified format and saves it with a new name.

    The operation is best-effort: any failure while reading, converting or
    writing the image is reported with ``print`` and swallowed, so a single bad
    file does not abort a batch. The function always returns ``None``.

    Args:
        source_file (str): The path to the source image file.
        destination_dir (str): The directory where the image will be saved.
            It is created (including parents) when it does not exist.
        new_name (str): The new name for the saved image file (should have the
            correct extension). When empty, the source file's base name is
            reused with an extension derived from ``format``.
        format (str): The format to convert to ('JPEG' or 'PNG').
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(destination_dir, exist_ok=True)

    try:
        format_name = (format or "").upper()

        if not new_name:
            # An empty name would make the save target the directory itself.
            known_extensions = {'JPEG': '.jpg', 'PNG': '.png'}
            extension = known_extensions.get(format_name, '.' + format_name.lower())
            stem = os.path.splitext(os.path.basename(source_file))[0]
            new_name = stem + extension

        new_file_path = os.path.join(destination_dir, new_name)

        with Image.open(source_file) as img:
            output_image = img
            # The JPEG writer rejects modes with alpha or a palette (RGBA, LA,
            # P, ...), so those images are converted to RGB before saving.
            jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
            if format_name == 'JPEG' and img.mode not in jpeg_modes:
                output_image = img.convert('RGB')
            output_image.save(new_file_path, format=format)
    except Exception as e:
        print(f"Error converting {source_file}: {e}")

In [3]:
"""
barley - 0
soy - 1
sunflower - 2
wheat - 3
image id, class, bbox string, 
"""

'\nbarley - 0\nsoy - 1\nsunflower - 2\nwheat - 3\nimage id, class, bbox string, \n'

In [4]:
# Root folder holding the raw, per-crop source datasets.
data_root_directory = "../../pre_data/"

# --- Wheat ---
# gwd_dataset = data_root_directory + "wheat/global-wheat-detection/"
gwdh_2021 = f"{data_root_directory}wheat/gwhd_2021/"

# --- Sunflower ---
pcvp_dataset = f"{data_root_directory}sunflower/pcvp/"
scvp_dataset = f"{data_root_directory}sunflower/scvp/"

# --- Soybean ---
soynet_dataset = f"{data_root_directory}soybean/soynet/"
bycbh73438_soy_dataset = f"{data_root_directory}soybean/bycbh73438-1/"

# Output folder of the combined dataset built by this notebook.
target_directory = "../data/"

In [5]:
# Planned composition of the combined dataset:
#   - 300 images from gwdh_2021 (wheat)
#   - 200 images from scvp_dataset, 100 images from pcvp_dataset (sunflower)
#   - 100 images from soynet_dataset, 100 images from bycbh73438_soy_dataset (soybean)
# NOTE(review): the recorded cell outputs further down show 198 pcvp and 117 scvp
# images were actually collected -- confirm which numbers are the intended targets.

# Labels and boxes will be saved in a csv file with bboxes having the COCO format [x_min, y_min, width, height].
# CSV row layout: image_name, class, box string

# Running index used to name the files written to the combined dataset ("image_<n>").
dataset_files_counter = 0
# Kept in a one-element list so extract_sunflower_files can advance it in place
# (it does `counter[0] += 1` for every unlabeled image it copies to predict/).
predict_files_counter = [0]

In [6]:
# Scratch check: splitting on ";" leaves a trailing empty string when the
# input itself ends with ";" -- relevant when parsing ';'-separated box strings.
qq = "a;b;c;".split(";")
qq

['a', 'b', 'c', '']

In [7]:
def make_new_dataset(dataset, data_dict, file_counter):
    """
    Copy the selected images of one source dataset into the combined dataset
    and write one label CSV per image.

    Each image is saved as ``<target_directory>/images/image_<n>.png`` and its
    label as ``<target_directory>/labels/image_<n>.csv`` holding a single row:
    ``image_<n>.png,<class>,<boxes>`` where boxes are separated by ``;`` and
    the values inside a box by single spaces.

    An entry is skipped (no label written, counter not advanced) when its
    source image cannot be located or the PNG was not produced, so labels never
    point at images that do not exist.

    Args:
        dataset (str): Root directory of the source dataset.
        data_dict (dict): Parallel lists under the keys 'image_names',
            'classes' and 'bboxes' (one list of boxes per image).
        file_counter (int): Index to use for the first file written.

    Returns:
        int: The next unused file index (``file_counter`` plus the number of
        images actually written).
    """
    def find_image_path(dataset, image_name):
        """Return the first existing path of image_name under the known image folders, else None."""
        possible_dirs = ["images/", "train/images/", "test/images/", "valid/images/"]

        for directory in possible_dirs:
            source_file_path = os.path.join(dataset, directory, image_name)
            if os.path.exists(source_file_path):
                return source_file_path
        return None

    for i, image_name in enumerate(data_dict['image_names']):
        new_file_name = "image_" + str(file_counter)

        source_file_path = find_image_path(dataset, image_name)
        if source_file_path is None:
            print(f"Image {image_name} not found under {dataset}; skipping.")
            continue

        destination_directory = os.path.join(target_directory, "images/")
        labels_directory = os.path.join(target_directory, "labels/")

        copy_image(source_file_path, destination_directory, new_file_name + ".png", format='PNG')
        # copy_image reports failures without raising, so confirm the PNG exists
        # before emitting a label for it.
        if not os.path.exists(os.path.join(destination_directory, new_file_name + ".png")):
            print(f"Image {image_name} could not be converted; skipping.")
            continue

        # Empty boxes (e.g. produced by a trailing ';' in a source box string)
        # are dropped; join() puts separators only between items, whatever
        # the number of values in a box.
        bbox_string = ";".join(
            " ".join(str(value) for value in bbox)
            for bbox in data_dict['bboxes'][i]
            if len(bbox) > 0
        )

        os.makedirs(labels_directory, exist_ok=True)
        # "w" rather than "a": the file belongs to exactly one image, so a
        # re-run must replace the row instead of appending a duplicate.
        with open(os.path.join(labels_directory, new_file_name + ".csv"), "w") as f:
            f.write(f"{new_file_name}.png,{data_dict['classes'][i]},{bbox_string}\n")

        file_counter += 1

    return file_counter

In [8]:
# picking images from gwdh_2021
def pick_gwdh_images(file, size, seed=None):
    """
    Randomly sample rows from a gwdh_2021 annotation CSV and parse them.

    The first line of the file is treated as a header. Every remaining
    non-blank line is expected to look like ``image_name,boxes[,...]`` where
    ``boxes`` is a ';'-separated list of whitespace-separated values.

    NOTE(review): box values are passed through unchanged -- confirm that the
    source CSV already uses the [x_min, y_min, width, height] layout that the
    combined dataset's labels are documented to use.

    Args:
        file (str): Path to the annotation CSV.
        size (int): Number of rows to sample (without replacement).
        seed (int | None): Optional seed for a reproducible sample. When None
            the module-level ``random`` state is used.

    Returns:
        dict: Parallel lists under 'image_names', 'classes' (always 3, wheat)
        and 'bboxes' (one tuple of boxes per image; each box is a list whose
        purely-numeric fields are converted to int).

    Raises:
        ValueError: If ``size`` exceeds the number of data rows in the file.
    """
    with open(file, 'r') as f:
        # Drop the header, and ignore blank lines so they can neither be
        # sampled nor crash the column split below.
        lines = [line for line in f.readlines()[1:] if line.strip()]

    if size > len(lines):
        raise ValueError(
            f"Cannot sample {size} images from {file}: only {len(lines)} rows available"
        )

    sampler = random.Random(seed) if seed is not None else random
    random_images = sampler.sample(lines, size)

    labels_dict = {
        'image_names': [],
        'classes': [],
        'bboxes': []
    }

    for image in random_images:
        info = image.rstrip("\r\n").split(",")
        image_name = info[0]

        bboxes = []
        for bbox_string in info[1].split(";"):
            fields = bbox_string.split()
            if not fields:
                # A trailing ';' (or an empty column) yields an empty segment,
                # which is not a box.
                continue
            bboxes.append([int(x) if x.isdigit() else x for x in fields])

        labels_dict['image_names'].append(image_name)
        labels_dict['classes'].append(3)
        labels_dict['bboxes'].append(tuple(bboxes))

    assert len(labels_dict['image_names']) == len(labels_dict['classes']) == len(labels_dict['bboxes']), "ERROR in parsing"
    return labels_dict
        
# Annotation CSVs of the three gwdh_2021 splits.
gwdh_2021_test_images_file = gwdh_2021 + "competition_test.csv"
gwdh_2021_val_images_file = gwdh_2021 + "competition_val.csv"
gwdh_2021_train_images_file = gwdh_2021 + "competition_train.csv"

# Sample 210 train + 60 test + 30 val rows (300 wheat images in total).
gwdh_2021_train_parsed = pick_gwdh_images(gwdh_2021_train_images_file, 210)
gwdh_2021_test_parsed = pick_gwdh_images(gwdh_2021_test_images_file, 60)
gwdh_2021_val_parsed = pick_gwdh_images(gwdh_2021_val_images_file, 30)

# Fold the test sample, then the val sample, into the train dict in place.
for parsed_split in (gwdh_2021_test_parsed, gwdh_2021_val_parsed):
    for field in ('image_names', 'classes', 'bboxes'):
        gwdh_2021_train_parsed[field].extend(parsed_split[field])

# Same object as gwdh_2021_train_parsed, now holding all three samples.
wheat_data_dict = gwdh_2021_train_parsed
assert len(wheat_data_dict['image_names']) == 300, "ERROR in parsing"

In [11]:
# Copy the sampled wheat images and write their labels into the combined dataset.
# The counter is reassigned to the value returned by make_new_dataset, so
# re-running this cell continues the numbering instead of restarting it.
dataset_files_counter = make_new_dataset(gwdh_2021, wheat_data_dict, dataset_files_counter)
print(dataset_files_counter)

300


In [12]:
# picking images from pcvp_dataset
def convert_yolov7_to_coco_boxes(label_file, image_width, image_height):
    """
    Converts YOLOv7 format labels to COCO format bounding boxes.

    Each non-blank line is ``class_id v1 v2 ...`` with values normalized to
    [0, 1]. A line with exactly four values is read as a box
    (center_x, center_y, width, height); any other line is read as a polygon
    (x1 y1 x2 y2 ...) and replaced by its axis-aligned bounding box.
    Blank lines are ignored. The class id is validated but not returned.

    Args:
        label_file (str): Path to the YOLOv7 format label file.
        image_width (int): Width of the original image.
        image_height (int): Height of the original image.

    Returns:
        list: List of bounding boxes in COCO format
        ([x_min, y_min, width, height] in pixels), one per annotation line.

    Raises:
        ValueError: If a line has a non-integer class id, a non-numeric
            coordinate, or no coordinates at all.
    """
    coco_boxes = []

    with open(label_file, 'r') as f:
        for line in f:
            data = line.strip().split()
            if not data:
                # Blank line (commonly a trailing newline at the end of the file).
                continue

            # Parsed only to reject lines that do not start with a class id.
            int(data[0])

            if len(data) == 5:
                # YOLOv7 format bounding box
                center_x = float(data[1]) * image_width
                center_y = float(data[2]) * image_height
                width = float(data[3]) * image_width
                height = float(data[4]) * image_height

                # Convert from center coordinates to top-left coordinates
                x = center_x - width / 2
                y = center_y - height / 2

                coco_boxes.append([x, y, width, height])
            else:
                # Segmentation mask, convert to bounding box
                points = [float(p) for p in data[1:]]
                x_coords = points[0::2]  # Extract x coordinates
                y_coords = points[1::2]  # Extract y coordinates

                x_min = min(x_coords) * image_width
                y_min = min(y_coords) * image_height
                x_max = max(x_coords) * image_width
                y_max = max(y_coords) * image_height

                width = x_max - x_min
                height = y_max - y_min

                coco_boxes.append([x_min, y_min, width, height])

    return coco_boxes


def extract_sunflower_files(directory, sub_directory, pcvp=True, counter=None):
    """
    Collect sunflower annotations from a YOLO-style dataset split.

    Every file in ``<directory>/<sub_directory>`` is read as a label file whose
    image is ``<directory>/images/<label stem>.jpg``.

    * ``pcvp=True``: only label files whose first line starts with source
      class "3" are kept; all other files are skipped.
    * ``pcvp=False``: every non-empty label file is kept. Label files without
      any annotation are treated as unlabeled images and, when ``counter`` is
      given, copied to ``<target_directory>/predict/image_<n>.png``.

    Kept images are tagged with class id 2, the sunflower id of this notebook's
    label map (barley 0, soy 1, sunflower 2, wheat 3).

    NOTE(review): for a kept file, every annotation line is converted to a box
    regardless of its own class id -- confirm that is intended for pcvp files
    that mix classes.

    Files that cannot be processed are reported with ``print`` and skipped.

    Args:
        directory (str): Root directory of the split (contains ``images/``).
        sub_directory (str): Name of the label folder inside ``directory``.
        pcvp (bool): Selects the filtering rule described above.
        counter (list | None): One-element list holding the next index for
            files copied to ``predict/``; incremented in place.

    Returns:
        dict: Parallel lists under 'image_names', 'classes' and 'bboxes'.
    """
    sunflower_class_id = 2      # target id in the combined dataset
    pcvp_source_class = "3"     # class id that marks the wanted files in pcvp

    labels_dict = {
        'image_names': [],
        'classes': [],
        'bboxes': []
    }

    new_directory = os.path.join(directory, sub_directory)
    # os.listdir order is arbitrary; sorting keeps the output (and the
    # numbering derived from it) reproducible between runs.
    for filename in sorted(os.listdir(new_directory)):
        file = os.path.join(new_directory, filename)
        image_name = os.path.splitext(filename)[0] + ".jpg"
        image_file = os.path.join(directory, "images", image_name)
        try:
            with open(file, 'r') as f:
                contents = f.readlines()

            if not any(line.strip() for line in contents):
                # No annotations: for non-pcvp data this is an unlabeled image
                # that is set aside for prediction.
                if not pcvp and counter is not None:
                    new_file_name = "image_" + str(counter[0])
                    destination_directory = os.path.join(target_directory, "predict/")
                    copy_image(image_file, destination_directory, new_file_name + ".png", format='PNG')
                    counter[0] += 1
                continue

            # Compare the whole first token so classes such as "30" do not
            # pass as "3".
            if pcvp and contents[0].split()[:1] != [pcvp_source_class]:
                continue

            image = cv2.imread(image_file)
            if image is None:
                print(f"Image {image_file} not found or unable to read.")
                continue

            height, width = image.shape[:2]
            bboxes = convert_yolov7_to_coco_boxes(file, width, height)

            labels_dict['image_names'].append(image_name)
            labels_dict['classes'].append(sunflower_class_id)
            labels_dict['bboxes'].append(bboxes)
        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue

    return labels_dict


# The pcvp dataset contains 198 sunflower images in total across its splits.
pcvp_test_images_dir = pcvp_dataset + "test/"
pcvp_val_images_dir = pcvp_dataset + "valid/"
pcvp_train_images_dir = pcvp_dataset + "train/"

pcvp_train_parsed = extract_sunflower_files(pcvp_train_images_dir, "labels")
pcvp_test_parsed = extract_sunflower_files(pcvp_test_images_dir, "labels")
pcvp_val_parsed = extract_sunflower_files(pcvp_val_images_dir, "labels")

# Fold the test split, then the validation split, into the train dict in place.
for parsed_split in (pcvp_test_parsed, pcvp_val_parsed):
    for field in ('image_names', 'classes', 'bboxes'):
        pcvp_train_parsed[field].extend(parsed_split[field])

# Same object as pcvp_train_parsed, now holding all three splits.
pvcp_data_dict = pcvp_train_parsed

# All three parallel lists must contain exactly the 198 expected entries.
assert {len(pvcp_data_dict[field]) for field in ('image_names', 'classes', 'bboxes')} == {198}, "ERROR in parsing"

In [13]:
# Sanity check: number of pcvp sunflower images collected.
len(pvcp_data_dict['image_names'])

198

In [14]:
scvp_test_images_dir = scvp_dataset + "test/"
scvp_train_images_dir = scvp_dataset + "train/"

# Each *_parsed name now matches the split it was read from (they used to be
# swapped); train is processed first, like the other datasets in this notebook.
# pcvp=False keeps every annotated file and copies unannotated images to
# predict/, advancing predict_files_counter in place -- so re-running this cell
# continues that numbering rather than restarting it.
scvp_train_parsed = extract_sunflower_files(scvp_train_images_dir, "labels", False, predict_files_counter)
scvp_test_parsed = extract_sunflower_files(scvp_test_images_dir, "labels", False, predict_files_counter)

# Fold the test split into the train dict in place.
scvp_train_parsed['image_names'].extend(scvp_test_parsed['image_names'])
scvp_train_parsed['classes'].extend(scvp_test_parsed['classes'])
scvp_train_parsed['bboxes'].extend(scvp_test_parsed['bboxes'])


# Same object as scvp_train_parsed, now holding both splits.
svcp_data_dict = scvp_train_parsed

In [15]:
# Sanity check: number of annotated scvp sunflower images collected.
len(svcp_data_dict['image_names'])

117

In [19]:
# Add the pcvp sunflower images to the combined dataset, continuing the file
# numbering from the current counter value.
dataset_files_counter = make_new_dataset(pcvp_dataset, pvcp_data_dict, dataset_files_counter)
print(dataset_files_counter)

498


In [20]:
# Add the scvp sunflower images to the combined dataset, continuing the file
# numbering from the current counter value.
dataset_files_counter = make_new_dataset(scvp_dataset, svcp_data_dict, dataset_files_counter)
print(dataset_files_counter)

615


In [54]:
# dataset_files_counter = 300