In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The config import further down (drive.MyDrive.roboflow_configs...) reads from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install roboflow

Collecting roboflow
  Downloading roboflow-1.1.49-py3-none-any.whl.metadata (9.7 kB)
Collecting idna==3.7 (from roboflow)
  Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting python-dotenv (from roboflow)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting filetype (from roboflow)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading roboflow-1.1.49-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.9/80.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: filetype, python-dotenv, idna, roboflow
  Attempting uninstall: idna
    Found existing installation: idna 3.10
    

In [3]:
import os
import cv2
import numpy as np
import pandas as pd
import yaml
import plotly.express as px
from tqdm import tqdm

from roboflow import Roboflow

from drive.MyDrive.roboflow_configs.nk39_config import (
    API_KEY,
    WORKSPACE,
    UA_FIELDS_EVENING1_PROJECT_ID,
    UA_FIELDS_HEALTHY_DEFICIENCY_VERSION
  )

In [4]:
# Create the Roboflow client with the API key imported from the config module.
rf = Roboflow(api_key=API_KEY)
# Resolve workspace -> project -> dataset version from the imported config values.
project = rf.workspace(WORKSPACE).project(UA_FIELDS_EVENING1_PROJECT_ID)
version = project.version(UA_FIELDS_HEALTHY_DEFICIENCY_VERSION)
# Download that version in the "yolov8" export format; in the recorded run it was
# extracted into the folder ukrainian_fields_evening1-1.
dataset = version.download("yolov8")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in ukrainian_fields_evening1-1 to yolov8:: 100%|██████████| 6982/6982 [00:00<00:00, 28928.72it/s]





Extracting Dataset Version Zip to ukrainian_fields_evening1-1 in yolov8:: 100%|██████████| 220/220 [00:00<00:00, 3210.13it/s]


In [5]:
# !rm -r /content/Tomato-Leaf-Diseases-3

In [6]:
# Step 1: Locate the downloaded dataset and read its data.yaml
base_dir = "/content/ukrainian_fields_evening1-1"
yaml_file = os.path.join(base_dir, "data.yaml")
with open(yaml_file, 'r') as f:
    data_yaml = yaml.safe_load(f)

# Class names (indexed by the integer class id found in label files) and the 'nc' entry
class_names = data_yaml['names']
num_classes = data_yaml['nc']

# Step 2: Start with an empty list of records; the loop below fills it
data = []

# Define function to calculate image entropy
def calculate_entropy(image_path):
    """Return the entropy (base-2) of an image's grayscale intensity histogram.

    The file at ``image_path`` is read as grayscale, a 256-bin histogram over
    the range [0, 256) is computed and normalised to probabilities, and the
    negative sum of ``p * log2(p + 1e-10)`` is returned.

    Returns ``None`` when OpenCV cannot read the image.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        return None

    counts = cv2.calcHist([gray], [0], None, [256], [0, 256])
    probabilities = counts / counts.sum()

    # The 1e-10 offset keeps log2 finite for bins with zero probability
    log_terms = np.log2(probabilities + 1e-10)
    return -np.sum(probabilities * log_terms)

# Step 3: Iterate through each split (train, valid, test) and calculate entropy
for split in ['train', 'valid', 'test']:
    images_dir = os.path.join(base_dir, split, 'images')
    labels_dir = os.path.join(base_dir, split, 'labels')

    if not os.path.exists(images_dir):
        continue  # Skip missing directories

    # Sorted so the record order (and the printed sample) is the same on every run
    for image_file in sorted(os.listdir(images_dir)):
        if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_file)
        label_path = os.path.join(labels_dir, os.path.splitext(image_file)[0] + '.txt')

        # An image without a label file contributes no records, so skip it
        # before paying for the image read
        if not os.path.exists(label_path):
            continue

        # Unique class IDs in the label file; whitespace-only lines are ignored
        # (they would otherwise raise IndexError on line.split()[0])
        with open(label_path, 'r') as f:
            class_labels = sorted({int(line.split()[0]) for line in f if line.strip()})
        if not class_labels:
            continue

        # Calculate entropy once per image (None if the image cannot be read)
        entropy = calculate_entropy(image_path)

        # One record per (image, class present in the image)
        for class_id in class_labels:
            data.append({
                'image_path': image_path,
                'class_label': class_names[class_id],
                'entropy': entropy
            })

# Step 4: Create DataFrame from collected data
# Explicit columns so the frame keeps its columns even when no records were collected
dataset_df = pd.DataFrame(data, columns=['image_path', 'class_label', 'entropy'])
print(f"Total records: {len(dataset_df)}")
print("Sample data:")
print(dataset_df.head())

# Step 5: Visualize entropy distribution using Plotly histogram
fig = px.histogram(
    dataset_df,
    x='entropy',
    color='class_label',
    title="Histogram Distribution of Image Complexity by Class (UAFieldsEvening1 - YOLOv8 format)",
    labels={'entropy': 'Image Entropy', 'class_label': 'Class'},
    nbins=50,  # Adjust bins for granularity
    barmode='overlay',  # Bars are overlaid
    opacity=0.75,  # Transparency for overlapping bars
    hover_data={'class_label': True, 'entropy': ':.2f'}  # Hover info
)
fig.update_layout(
    xaxis_title="Image Entropy",
    yaxis_title="Count",
    legend_title="Class"
)
fig.show()


Total records: 128
Sample data:
                                          image_path  class_label   entropy
0  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency  6.839197
1  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency  6.820806
2  /content/ukrainian_fields_evening1-1/train/ima...  Late Blight  7.775542
3  /content/ukrainian_fields_evening1-1/train/ima...      Healthy  6.810052
4  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency  7.299069


In [7]:
# Start a fresh record list for this cell; the loop below appends one dict per
# (image, class) with brightness and contrast values.
data = []

# Define function to calculate image brightness (average intensity)
def calculate_brightness(image_path):
    """Return the mean grayscale pixel intensity of the image at ``image_path``.

    Returns ``None`` when OpenCV cannot read the image.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    return None if gray is None else np.mean(gray)

# Define function to calculate image contrast (standard deviation of intensity)
def calculate_contrast(image_path):
    """Return the standard deviation of grayscale pixel intensity for an image.

    Returns ``None`` when OpenCV cannot read the image at ``image_path``.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    return None if gray is None else np.std(gray)

# Step 3: Iterate through each split (train, valid, test) and calculate lighting conditions
for split in ['train', 'valid', 'test']:
    images_dir = os.path.join(base_dir, split, 'images')
    labels_dir = os.path.join(base_dir, split, 'labels')

    if not os.path.exists(images_dir):
        continue  # Skip missing directories

    # Sorted so the record order (and the printed sample) is the same on every run
    for image_file in sorted(os.listdir(images_dir)):
        if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_file)
        label_path = os.path.join(labels_dir, os.path.splitext(image_file)[0] + '.txt')

        # An image without a label file contributes no records, so skip it
        # before paying for the image reads
        if not os.path.exists(label_path):
            continue

        # Unique class IDs in the label file; whitespace-only lines are ignored
        # (they would otherwise raise IndexError on line.split()[0])
        with open(label_path, 'r') as f:
            class_labels = sorted({int(line.split()[0]) for line in f if line.strip()})
        if not class_labels:
            continue

        # Calculate brightness and contrast once per image (None if unreadable)
        brightness = calculate_brightness(image_path)
        contrast = calculate_contrast(image_path)

        # One record per (image, class present in the image)
        for class_id in class_labels:
            data.append({
                'image_path': image_path,
                'class_label': class_names[class_id],
                'brightness': brightness,
                'contrast': contrast
            })

# Step 4: Create DataFrame from collected data
# Explicit columns so the frame keeps its columns even when no records were collected
dataset_df = pd.DataFrame(data, columns=['image_path', 'class_label', 'brightness', 'contrast'])
print(f"Total records: {len(dataset_df)}")
print("Sample data:")
print(dataset_df.head())

# Step 5: Visualize brightness and contrast distribution using Plotly histograms
fig1 = px.histogram(
    dataset_df,
    x='brightness',
    color='class_label',
    title="Brightness Distribution of Images by Class(UAEvening1)",
    labels={'brightness': 'Average Brightness', 'class_label': 'Class'},
    nbins=50,
    barmode='overlay',
    opacity=0.75,
    hover_data={'class_label': True, 'brightness': ':.2f'}
)
fig1.show()

fig2 = px.histogram(
    dataset_df,
    x='contrast',
    color='class_label',
    title="Contrast Distribution of Images by Class(UAEvening1)",
    labels={'contrast': 'Image Contrast', 'class_label': 'Class'},
    nbins=50,
    barmode='overlay',
    opacity=0.75,
    hover_data={'class_label': True, 'contrast': ':.2f'}
)
fig2.show()


Total records: 128
Sample data:
                                          image_path  class_label  brightness  \
0  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency   53.743645   
1  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency   56.841677   
2  /content/ukrainian_fields_evening1-1/train/ima...  Late Blight   98.212473   
3  /content/ukrainian_fields_evening1-1/train/ima...      Healthy   57.162925   
4  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency  105.985007   

    contrast  
0  29.517369  
1  32.511219  
2  62.524852  
3  34.194170  
4  73.122110  


In [8]:
# Start a fresh record list for this cell; the loop below appends one dict per
# (image, class) with the foreground-to-background ratio.
data = []

# Function to load YOLOv8 labels (bounding box data)
def load_yolo_labels(label_path): # TODO: find out why result is so strange
    """Read a YOLO label file and return its non-blank lines.

    Args:
        label_path: Path to the ``.txt`` label file.

    Returns:
        A list with the file's lines (line endings preserved), excluding
        whitespace-only lines so callers can index the split fields safely.
        An empty list is returned when the file does not exist, because
        callers in this notebook invoke this function without checking for
        the file first.
    """
    try:
        with open(label_path, 'r') as f:
            return [line for line in f if line.strip()]
    except FileNotFoundError:
        return []

# Function to calculate foreground-to-background ratio
def calculate_fg_bg_ratio(image_path, label_path, threshold=127):
    """Return the ratio of foreground pixels to background pixels for an image.

    A pixel is foreground when its grayscale value is above ``threshold``
    OR when it lies inside any bounding box listed in the label file.

    NOTE(review): because of the threshold step, bright pixels outside every
    box also count as foreground. This may be the cause of the "strange"
    results mentioned in the TODO on load_yolo_labels — confirm. Pass
    ``threshold=None`` to count only the bounding boxes as foreground.

    Args:
        image_path: Path to the image file.
        label_path: Path to the label file with lines of the form
            ``class x_center y_center width height`` (normalised coordinates).
            A missing file is treated as "no boxes".
        threshold: Grayscale threshold for the initial binary mask, or
            ``None`` to start from an all-background mask. Defaults to 127.

    Returns:
        ``foreground / background`` as a float, ``0`` when the mask has no
        background pixels at all, or ``None`` when the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None

    height, width = image.shape[:2]

    if threshold is None:
        binary_mask = np.zeros((height, width), dtype=np.uint8)
    else:
        # Convert to grayscale and threshold into a 0/255 mask
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary_mask = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Guard here as well so a missing label file never raises
    labels = load_yolo_labels(label_path) if os.path.exists(label_path) else []
    for label in labels:
        parts = label.split()
        if len(parts) < 5:
            continue  # Blank or malformed line: no bounding box to draw

        x_center, y_center, w, h = (float(value) for value in parts[1:5])

        # Convert to pixel coordinates and clamp to the image. A negative
        # index would otherwise be read by numpy as "from the end" and paint
        # the wrong region (or nothing at all).
        x_min = max(0, min(width, int((x_center - w / 2) * width)))
        x_max = max(0, min(width, int((x_center + w / 2) * width)))
        y_min = max(0, min(height, int((y_center - h / 2) * height)))
        y_max = max(0, min(height, int((y_center + h / 2) * height)))

        # Set the bounding box area to foreground
        binary_mask[y_min:y_max, x_min:x_max] = 255

    foreground_area = int(np.count_nonzero(binary_mask == 255))
    background_area = int(np.count_nonzero(binary_mask == 0))
    return foreground_area / background_area if background_area > 0 else 0

# Step 3: Iterate through each split (train, valid, test) and calculate foreground-to-background ratio
for split in ['train', 'valid', 'test']:
    images_dir = os.path.join(base_dir, split, 'images')
    labels_dir = os.path.join(base_dir, split, 'labels')

    if not os.path.exists(images_dir):
        continue  # Skip missing directories

    # Sorted so the record order (and the printed sample) is the same on every run
    for image_file in sorted(os.listdir(images_dir)):
        if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_file)
        label_path = os.path.join(labels_dir, os.path.splitext(image_file)[0] + '.txt')

        # The ratio helper opens the label file, so check for it first.
        # An image without labels contributes no records anyway.
        if not os.path.exists(label_path):
            continue

        # Unique class IDs in the label file; whitespace-only lines are ignored
        # (they would otherwise raise IndexError on line.split()[0])
        with open(label_path, 'r') as f:
            class_labels = sorted({int(line.split()[0]) for line in f if line.strip()})
        if not class_labels:
            continue

        # Calculate the ratio once per image (None if the image cannot be read)
        fg_bg_ratio = calculate_fg_bg_ratio(image_path, label_path)

        # One record per (image, class present in the image)
        for class_id in class_labels:
            data.append({
                'image_path': image_path,
                'class_label': class_names[class_id],
                'fg_bg_ratio': fg_bg_ratio
            })

# Step 4: Create DataFrame from collected data
# Explicit columns so the frame keeps its columns even when no records were collected
dataset_df = pd.DataFrame(data, columns=['image_path', 'class_label', 'fg_bg_ratio'])
print(f"Total records: {len(dataset_df)}")
print("Sample data:")
print(dataset_df.head())

# Step 5: Visualize foreground-to-background ratio distribution using Plotly histogram
fig = px.histogram(
    dataset_df,
    x='fg_bg_ratio',
    color='class_label',
    title="Histogram Distribution of Foreground-to-Background Ratios by Class (YOLOv8 format)",
    labels={'fg_bg_ratio': 'Foreground-to-Background Ratio', 'class_label': 'Class'},
    nbins=50,  # Adjust bins for granularity
    barmode='overlay',  # Bars are overlaid
    opacity=0.75,  # Transparency for overlapping bars
    hover_data={'class_label': True, 'fg_bg_ratio': ':.2f'}  # Hover info
)
fig.update_layout(
    xaxis_title="Foreground-to-Background Ratio",
    yaxis_title="Count",
    legend_title="Class"
)
fig.show()


Total records: 128
Sample data:
                                          image_path  class_label  fg_bg_ratio
0  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency     0.320511
1  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency     1.248634
2  /content/ukrainian_fields_evening1-1/train/ima...  Late Blight     0.800337
3  /content/ukrainian_fields_evening1-1/train/ima...      Healthy     0.294687
4  /content/ukrainian_fields_evening1-1/train/ima...   Deficiency     2.553643


In [9]:
# Start a fresh record list for this cell; the loop below appends one dict per
# bounding box with its pixel width, height and area.
data = []

# Function to calculate bounding box size metrics (width, height, area)
def calculate_box_size_metrics(image_path, label_path):
    """Return pixel widths, heights and areas of every bounding box of an image.

    Args:
        image_path: Path to the image; it is read only to obtain its pixel
            dimensions.
        label_path: Path to the label file with lines of the form
            ``class x_center y_center width height`` (normalised coordinates).
            A missing file is treated as "no boxes".

    Returns:
        A tuple ``(widths, heights, areas)`` of three equally long lists of
        ints, one entry per bounding box, or ``(None, None, None)`` when the
        image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None, None, None

    # Get image dimensions (height, width)
    height, width = image.shape[:2]

    # Guard here as well so a missing label file never raises
    labels = load_yolo_labels(label_path) if os.path.exists(label_path) else []

    widths = []
    heights = []
    areas = []

    for label in labels:
        parts = label.split()
        if len(parts) < 5:
            continue  # Blank or malformed line: no bounding box to measure

        x_center, y_center, w, h = (float(value) for value in parts[1:5])

        # Convert the normalized corner coordinates to integer pixel values
        x_min = int((x_center - w / 2) * width)
        x_max = int((x_center + w / 2) * width)
        y_min = int((y_center - h / 2) * height)
        y_max = int((y_center + h / 2) * height)

        # Width, height and area of the bounding box in pixels
        width_b = x_max - x_min
        height_b = y_max - y_min

        widths.append(width_b)
        heights.append(height_b)
        areas.append(width_b * height_b)

    return widths, heights, areas

# Step 3: Iterate through each split (train, valid, test) and calculate bounding box size metrics
for split in ['train', 'valid', 'test']:
    images_dir = os.path.join(base_dir, split, 'images')
    labels_dir = os.path.join(base_dir, split, 'labels')

    if not os.path.exists(images_dir):
        continue  # Skip missing directories

    # Sorted so the record order (and the printed sample) is the same on every run
    for image_file in sorted(os.listdir(images_dir)):
        if not image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_file)
        label_path = os.path.join(labels_dir, os.path.splitext(image_file)[0] + '.txt')

        # The metrics helper opens the label file, so check for it first.
        # An image without labels has no boxes to record.
        if not os.path.exists(label_path):
            continue

        # Calculate bounding box size metrics (width, height, area)
        widths, heights, areas = calculate_box_size_metrics(image_path, label_path)
        if widths is None:
            continue  # Image could not be read; zip(None, ...) would raise TypeError

        # Record the metrics for each bounding box
        for width, height, area in zip(widths, heights, areas):
            data.append({
                'image_path': image_path,
                'width': width,
                'height': height,
                'area': area
            })

# Step 4: Create DataFrame from collected data
# Explicit columns so the frame keeps its columns even when no records were collected
dataset_df = pd.DataFrame(data, columns=['image_path', 'width', 'height', 'area'])
print(f"Total records: {len(dataset_df)}")
print("Sample data:")
print(dataset_df.head())

# Step 5: Visualize bounding box size metrics using Plotly histograms

# Width Distribution
fig_width = px.histogram(
    dataset_df,
    x='width',
    title="Bounding Box Width Distribution(UAEvening1)",
    labels={'width': 'Bounding Box Width'},
    nbins=50,
    opacity=0.75,
)
fig_width.update_layout(
    xaxis_title="Bounding Box Width",
    yaxis_title="Count"
)
fig_width.show()

# Height Distribution
fig_height = px.histogram(
    dataset_df,
    x='height',
    title="Bounding Box Height Distribution(UAEvening1)",
    labels={'height': 'Bounding Box Height'},
    nbins=50,
    opacity=0.75,
)
fig_height.update_layout(
    xaxis_title="Bounding Box Height",
    yaxis_title="Count"
)
fig_height.show()

# Area Distribution
fig_area = px.histogram(
    dataset_df,
    x='area',
    title="Bounding Box Area Distribution(UAEvening1)",
    labels={'area': 'Bounding Box Area'},
    nbins=50,
    opacity=0.75,
)
fig_area.update_layout(
    xaxis_title="Bounding Box Area",
    yaxis_title="Count"
)
fig_area.show()


Total records: 640
Sample data:
                                          image_path  width  height   area
0  /content/ukrainian_fields_evening1-1/train/ima...     87     108   9396
1  /content/ukrainian_fields_evening1-1/train/ima...     94     121  11374
2  /content/ukrainian_fields_evening1-1/train/ima...    118      95  11210
3  /content/ukrainian_fields_evening1-1/train/ima...     73      50   3650
4  /content/ukrainian_fields_evening1-1/train/ima...    172      84  14448
