In [None]:
import yaml

# Dataset description (class names, class count, image folders) of the
# existing training data.
file_path = '/content/drive/MyDrive/Colabim/bdp/training_data/data.yaml'

try:
    with open(file_path, 'r') as yaml_file:
        yaml_content = yaml.safe_load(yaml_file)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except yaml.YAMLError as err:
    print(f"Error parsing YAML file: {err}")
else:
    # Reached only when the file was opened and parsed without error.
    # NOTE: on either error above `yaml_content` stays undefined, so later
    # cells that read it will fail with a NameError.
    print(yaml_content)

{'names': ['crow', 'pigeon', 'plane', 'seagull', 'stork', 'swallow', 'unknown_bird'], 'nc': 7, 'path': '/content/drive/MyDrive/Colabim/bdp/training_data', 'train': '/content/drive/MyDrive/Colabim/bdp/training_data/train/images', 'val': '/content/drive/MyDrive/Colabim/bdp/training_data/val/images'}


In [None]:
cp -r /content/drive/MyDrive/Colabim/bdp/training_data /content/drive/MyDrive/Colabim/bdp__

In [None]:
pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.216-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.216-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.216 ultralytics-thop-2.0.17


## Filter bounding boxes by size

### Subtask:
Iterate through the relabeled label files, calculate the bounding box area in pixels for each entry, and keep only those with an area of 80 square pixels or more.

**Reasoning**:
Iterate through each relabeled label file, calculate the bounding box area using the provided image dimensions, filter out bounding boxes smaller than the specified threshold, and store the filtered lines.

In [None]:
import os

# Drops every bounding box whose pixel area is below `area_threshold`.
#
# Input:  `relabeled_data` ({label file name: [YOLO label lines]}), which is
#         built by the "Relabel classes" cell -- that cell must have been run
#         before this one.
# Output: `filtered_relabeled_data`, same layout, holding only the kept lines.
filtered_relabeled_data = {}

# Every image is assumed to have this resolution; the normalized YOLO sizes
# are scaled by it.  TODO confirm all frames really are 1920x1080.
image_width = 1920
image_height = 1080
area_threshold = 80  # minimum box area, in square pixels


def _box_area_in_pixels(fields):
    """Return the pixel area of one YOLO label split into its five fields.

    Layout: <class_id> <center_x> <center_y> <width> <height>, with the four
    geometry values normalized to the image size.  All five fields are parsed
    so that a non-numeric value anywhere raises ValueError, even though only
    width and height enter the area.
    """
    int(fields[0])    # class id -- parsed for validation only
    float(fields[1])  # center x -- parsed for validation only
    float(fields[2])  # center y -- parsed for validation only
    box_width_px = float(fields[3]) * image_width
    box_height_px = float(fields[4]) * image_height
    return box_width_px * box_height_px


for file_name, lines in relabeled_data.items():
    kept_lines = []
    for line in lines:
        fields = line.strip().split()

        if len(fields) != 5:
            print(f"Warning: Skipping malformed line with incorrect number of parts in file {file_name}: {line.strip()}")
            continue

        try:
            box_area_px = _box_area_in_pixels(fields)
        except ValueError:
            print(f"Warning: Skipping malformed line in file {file_name}: {line.strip()}")
            continue

        if box_area_px >= area_threshold:
            kept_lines.append(line)

    filtered_relabeled_data[file_name] = kept_lines

# Show up to five kept lines of the first file as a quick sanity check.
if filtered_relabeled_data:
    sample_file_name = next(iter(filtered_relabeled_data))
    print(f"Sample filtered data for {sample_file_name}:")
    for kept_line in filtered_relabeled_data[sample_file_name][:5]:
        print(kept_line)

Sample filtered data for frame_000165.txt:
6 0.106526 0.014102 0.016875 0.017852
6 0.161758 0.256528 0.025526 0.012778
6 0.041156 0.132102 0.025729 0.009537
6 0.155328 0.246898 0.017969 0.018148
6 0.934201 0.091556 0.017495 0.013500


## Save filtered label files

### Subtask:
Overwrite the label files in the new data directory with the filtered content.

**Reasoning**:
Iterate through the filtered_relabeled_data dictionary and write the filtered content to the original label files.

In [None]:
import os

# Folder holding the training label files of the new dataset.
labels_dir = os.path.join('/content/drive/MyDrive/Colabim/bdp/KARGA3_m', 'labels', 'train')

# Replace each label file's content with its filtered lines, one label per
# line.  A file whose list is empty is truncated to zero bytes.
for file_name, lines in filtered_relabeled_data.items():
    file_path = os.path.join(labels_dir, file_name)
    with open(file_path, 'w') as label_file:
        label_file.writelines(line + '\n' for line in lines)

print("Filtered data successfully written to original label files.")

Filtered data successfully written to original label files.


## Summary:

### Data Analysis Key Findings

* Bounding boxes with an area less than 80 square pixels have been filtered out from the label files in `/content/drive/MyDrive/Colabim/bdp/KARGA3_m/labels/train`.
* The original label files have been overwritten with the filtered content.

### Insights or Next Steps

* The label files in the new dataset now only contain bounding boxes that meet the specified size threshold, further preparing them for integration with the existing training data.
* The next step would be to integrate these processed label files and their corresponding image files into the existing training data structure or pipeline.

In [None]:
# Unpack the new KARGA3_m dataset into the project folder.
# `-o` overwrites already-extracted files without asking, so re-running the
# notebook does not stop at unzip's interactive "replace?" prompt.
# NOTE: any file under the target folder that is also in the archive is reset
# to its archived content, including label files rewritten by later cells
# (assuming the archive ships them -- the recorded listing is truncated).
get_ipython().system('unzip -o /content/drive/MyDrive/Colabim/KARGA3_m.zip -d /content/drive/MyDrive/Colabim/bdp/KARGA3_m')

Archive:  /content/drive/MyDrive/Colabim/KARGA3_m.zip
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/train.txt  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/data.yaml  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000300.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000645.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000735.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000720.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000180.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000090.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000630.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000315.png  
 extracting: /content/drive/MyDrive/Colabim/bdp/KARGA3_m/images/train/frame_000165.png  
 extrac

## Load class names

### Subtask:
Read the class names from the `data.yaml` file in the existing training data.


**Reasoning**:
Extract the class names from the loaded YAML content.



In [None]:
# Class names of the existing training data, read from the 'names' entry of
# the YAML content loaded into `yaml_content`.  The class-mapping cells call
# `.index()` on this value, so its position order defines the target indices.
existing_class_names = yaml_content['names']
print(existing_class_names)

['crow', 'pigeon', 'plane', 'seagull', 'stork', 'swallow', 'unknown_bird']


## Identify new data label files

### Subtask:
Locate all the label files (e.g., `.txt` files) in the newly unzipped data directory (`/content/drive/MyDrive/Colabim/bdp/KARGA3_m`).


**Reasoning**:
List the contents of the `labels/train` folder inside the newly unzipped data directory and keep only the file names ending in `.txt`; these are the label files to process.



In [None]:
import os

# Collect the names of all label files (*.txt) of the new dataset.
new_data_dir = '/content/drive/MyDrive/Colabim/bdp/KARGA3_m'
labels_dir = os.path.join(new_data_dir, 'labels', 'train')
all_files = os.listdir(labels_dir)
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order -- and therefore the "sample" file shown by later cells -- reproducible.
new_label_files = sorted(f for f in all_files if f.endswith('.txt'))
print(new_label_files)

['frame_000165.txt', 'frame_000345.txt', 'frame_000705.txt', 'frame_000675.txt', 'frame_000330.txt', 'frame_000240.txt', 'frame_000180.txt', 'frame_000090.txt', 'frame_000315.txt', 'frame_000300.txt', 'frame_000645.txt']


## Determine class mapping

### Subtask:
Understand the mapping between the class indices in the new data's label files and the class names. This might require examining a `data.yaml` file within the new data (if one exists) or getting this information from the user.


**Reasoning**:
Check for the existence of `data.yaml` in the new data directory, load it if it exists, extract the class names, and compare them to the existing class names to determine the mapping.



In [None]:
import os
import yaml

# Build `class_mapping` ({class index in the new data: class index in the
# existing data}) by matching class names between the two data.yaml files.
new_data_dir = '/content/drive/MyDrive/Colabim/bdp/KARGA3_m'
new_data_yaml_path = os.path.join(new_data_dir, 'data.yaml')

if os.path.exists(new_data_yaml_path):
    try:
        with open(new_data_yaml_path, 'r') as file:
            new_yaml_content = yaml.safe_load(file)

        # An empty or non-mapping YAML document has no 'names' entry to read.
        if isinstance(new_yaml_content, dict):
            new_class_names = new_yaml_content.get('names')
        else:
            new_class_names = None

        if new_class_names is not None:
            print("New data class names:", new_class_names)
            print("Existing data class names:", existing_class_names)

            # 'names' may be a list (index = position) or a dict
            # {index: name}.  Enumerating a dict would yield its keys instead
            # of the names, so the dict form is iterated through .items().
            if isinstance(new_class_names, dict):
                indexed_names = new_class_names.items()
            else:
                indexed_names = enumerate(new_class_names)

            class_mapping = {}
            for new_idx, new_name in indexed_names:
                try:
                    existing_idx = existing_class_names.index(new_name)
                    class_mapping[new_idx] = existing_idx
                except ValueError:
                    # Classes unknown to the existing data are reported and
                    # left out of the mapping.
                    print(f"Warning: Class '{new_name}' from new data not found in existing classes.")

            print("Class mapping (new_index: existing_index):", class_mapping)

        else:
            print("Error: 'names' key not found in the new data's data.yaml file.")
            print("Manual input is needed for class mapping.")

    except yaml.YAMLError as e:
        print(f"Error parsing new data's data.yaml file: {e}")
        print("Manual input is needed for class mapping.")
else:
    print(f"Error: data.yaml not found in {new_data_dir}.")
    print("Manual input is needed for class mapping.")


New data class names: {0: 'pigeon', 1: 'plane', 2: 'unknown_bird', 3: 'stork', 4: 'crow', 5: 'seagull', 6: 'swallow'}
Existing data class names: ['crow', 'pigeon', 'plane', 'seagull', 'stork', 'swallow', 'unknown_bird']
Class mapping (new_index: existing_index): {}


**Reasoning**:
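The previous output shows that `names` in the new data's `data.yaml` is a dictionary mapping class index to class name (`{0: 'pigeon', ...}`) rather than a list of strings. Enumerating a dictionary yields its keys, not its values, so the lookup compared integer indices against the existing class names, never found a match, and produced an empty mapping. The mapping logic must iterate over the dictionary's items when `names` is a dictionary, and keep the enumerate-based logic when it is a list.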



In [None]:
# Map each class index used by the new data to the index the same class name
# has in the existing data.
if isinstance(new_class_names, dict):
    # Dictionary form: {index: name}
    indexed_names = list(new_class_names.items())
else:
    # List form: the position in the list is the index
    indexed_names = list(enumerate(new_class_names))

class_mapping = {}
for source_idx, class_name in indexed_names:
    if class_name in existing_class_names:
        class_mapping[source_idx] = existing_class_names.index(class_name)
    else:
        # Unknown classes are reported and left out of the mapping.
        print(f"Warning: Class '{class_name}' from new data not found in existing classes.")

print("Corrected Class mapping (new_index: existing_index):", class_mapping)


Corrected Class mapping (new_index: existing_index): {0: 1, 1: 2, 2: 6, 3: 4, 4: 0, 5: 3, 6: 5}


## Relabel classes

### Subtask:
Iterate through each label file in the new data. For each bounding box entry, update the class index based on the mapping determined in the previous step and the class names loaded from the existing `data.yaml`.


**Reasoning**:
Iterate through each label file, read its contents, update the class indices based on the `class_mapping`, and store the relabeled content in a dictionary.



In [None]:
# Rewrites the class index at the start of every label line according to
# `class_mapping` and collects the result per file in `relabeled_data`
# ({label file name: [relabeled lines]}).  Nothing is written to disk here.
#
# Lines that cannot be remapped (blank, non-integer class index, or an index
# missing from the mapping) are reported and kept as they are, stripped of
# surrounding whitespace.
relabeled_data = {}

for file_name in new_label_files:
    file_path = os.path.join(labels_dir, file_name)
    converted_lines = []

    with open(file_path, 'r') as label_file:
        for raw_line in label_file:
            text = raw_line.strip()
            fields = text.split()

            if not fields:
                print(f"Warning: Skipping empty or malformed line in file {file_name}: {text}")
                converted_lines.append(text)
                continue

            try:
                source_class = int(fields[0])
            except ValueError:
                # First field is not an integer class index.
                print(f"Warning: Skipping malformed line in file {file_name}: {text}")
                converted_lines.append(text)
                continue

            if source_class not in class_mapping:
                print(f"Warning: Class index {source_class} in file {file_name} not found in mapping.")
                converted_lines.append(text)
                continue

            # Swap in the target index; the geometry fields stay untouched.
            fields[0] = str(class_mapping[source_class])
            converted_lines.append(" ".join(fields))

    relabeled_data[file_name] = converted_lines

# Show up to five relabeled lines of the first file as a quick sanity check.
if relabeled_data:
    sample_file_name = next(iter(relabeled_data))
    print(f"Sample relabeled data for {sample_file_name}:")
    for relabeled_line in relabeled_data[sample_file_name][:5]:
        print(relabeled_line)

Sample relabeled data for frame_000165.txt:
6 0.106526 0.014102 0.016875 0.017852
6 0.161758 0.256528 0.025526 0.012778
6 0.041156 0.132102 0.025729 0.009537
6 0.155328 0.246898 0.017969 0.018148
6 0.934201 0.091556 0.017495 0.013500


## Save modified label files

### Subtask:
Overwrite the original label files in the new data with the relabeled content.


**Reasoning**:
Iterate through the relabeled_data dictionary and write the relabeled content to the original label files.



In [None]:
# Replace each label file under `labels_dir` with its relabeled lines, one
# label per line.
# NOTE: this overwrites the same files the relabel cell reads from, so running
# the relabel cell again afterwards would apply the class mapping a second
# time to indices that are already converted.
for file_name, lines in relabeled_data.items():
    target_path = os.path.join(labels_dir, file_name)
    with open(target_path, 'w') as label_file:
        label_file.write("".join(f"{line}\n" for line in lines))

print("Relabeled data successfully written to original label files.")

Relabeled data successfully written to original label files.
