In [24]:
# import needed libraries 
import json
import pandas as pd
from pathlib import Path
from collections import Counter

In [26]:
def json_loader():
    """
    Load a COCO-style JSON file and convert it into a Python dictionary.

    Returns:
        dict: The content of the JSON file as a Python dictionary.
    """
    # Open the JSON file in read mode
    with open('../dataset/data/_annotations.coco.json', 'r') as f:
        # Parse the JSON file and convert it to a Python dictionary
        data = json.load(f)
        return data

# Call the function to load JSON data
new_data = json_loader()

# Print the top-level keys of the JSON dictionary
print(new_data.keys())

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])


In [27]:
def json_df(json_data):
    """
    Convert COCO-style JSON data into three Pandas DataFrames: categories, images, and annotations.

    Args:
        json_data (dict): JSON data loaded from a COCO-style annotation file.

    Returns:
        tuple: A tuple containing three DataFrames in the order:
            - df_categories: DataFrame of categories
            - df_images: DataFrame of images
            - df_annotations: DataFrame of annotations
    """
    # using keys: ['info', 'licenses', 'categories', 'images', 'annotations']
    # Convert the 'categories' section into a DataFrame
    df_categories = pd.DataFrame(json_data['categories'])

    # Convert the 'images' section into a DataFrame
    df_images = pd.DataFrame(json_data['images'])

    # Convert the 'annotations' section into a DataFrame
    df_annotations = pd.DataFrame(json_data['annotations'])

    return df_categories, df_images, df_annotations

# Call the function to convert JSON data into DataFrames
df_categories, df_images, df_annotations = json_df(new_data)

# Preview the DataFrames
print("Categories DataFrame:")
display(df_categories)

print("\nImages DataFrame:")
display(df_images)

print("\nAnnotations DataFrame:")
display(df_annotations)

    

Categories DataFrame:


Unnamed: 0,id,name,supercategory
0,0,wildfire,none
1,1,fire,wildfire



Images DataFrame:


Unnamed: 0,id,license,file_name,height,width,date_captured,extra
0,0,1,cl6e1qges001kgk555z158f33_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6e1qges001kgk555z158f33_2_FALSE_CO...
1,1,1,cl6kgm9qr002yc455g5qs87kz_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6kgm9qr002yc455g5qs87kz_2_FALSE_CO...
2,2,1,cl6odzs5e002oao55heuig0a6_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6odzs5e002oao55heuig0a6_1_TRUE_COL...
3,3,1,cl6b6pieh007rl455fwvb73gr_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b6pieh007rl455fwvb73gr_1_TRUE_COL...
4,4,1,cl6b5k38g003zl455bgtd1317_4_FALSE_COLOR__URBAN...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b5k38g003zl455bgtd1317_4_FALSE_CO...
...,...,...,...,...,...,...,...
495,495,1,cl6b5yjxa005nl45516k465kb_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b5yjxa005nl45516k465kb_1_TRUE_COL...
496,496,1,cl6opv5oo007xao553de9g7wd_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6opv5oo007xao553de9g7wd_2_FALSE_CO...
497,497,1,cl6cteplg00hkl4551scr5fko_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6cteplg00hkl4551scr5fko_1_TRUE_COL...
498,498,1,cl6b652mi0068l4552keu9loj_6_SWIR_jpg.rf.113908...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b652mi0068l4552keu9loj_6_SWIR.jpg'}



Annotations DataFrame:


Unnamed: 0,id,image_id,category_id,bbox,area,segmentation,iscrowd
0,0,1,1,"[277, 207, 568.95, 591.79]",336698.921,[],0
1,1,2,1,"[800, 0, 157.45, 413.38]",65086.681,[],0
2,2,2,1,"[1062, 96, 58.37, 63.91]",3730.427,[],0
3,3,2,1,"[452, 442, 410.09, 350.6]",143777.554,[],0
4,4,2,1,"[0, 163, 336.16, 375.08]",126086.893,[],0
...,...,...,...,...,...,...,...
954,954,496,1,"[80, 30, 416.05, 183.75]",76449.188,[],0
955,955,496,1,"[0, 345, 140.33, 146.25]",20523.263,[],0
956,956,497,1,"[389, 361, 333, 340.52]",113393.160,[],0
957,957,498,1,"[510, 270, 222.6, 245.95]",54748.470,[],0


##                                      2-A Exploration & nettoyage des données 

In [28]:
def check_nan(df):
    """
    Check for missing values (NaN) in a DataFrame and return only the columns with at least one NaN.

    Args:
        df (pd.DataFrame): The DataFrame to check for missing values.

    Returns:
        pd.Series: A series containing only the columns with NaN values and their counts.
    """
    # Count NaN values per column
    nan_counts = df.isna().sum()
    
    # Filter only columns where NaN count > 0
    nan_columns = nan_counts[nan_counts > 0]
    
    return nan_columns

#  usage
clean_anno = check_nan(df_annotations)
clean_images = check_nan(df_images)
clean_categ = check_nan(df_categories)

# Only print the result if it contains data, for a cleaner output


# Annotations DataFrame
if not clean_anno.empty:
    print("Annotations NaN columns:")
    print(clean_anno)
else:
    print("No NaN values in Annotations DataFrame")
    
    
# Images DataFrame
if not clean_images.empty:
    print("Images NaN columns:")
    print(clean_images)
else:
    print("No NaN values in Images DataFrame")
    
    
# Categories DataFrame
if not clean_categ.empty:
    print("Categories NaN columns:")
    print(clean_categ)
else:
    print("No NaN values in Categories DataFrame")



No NaN values in Annotations DataFrame
No NaN values in Images DataFrame
No NaN values in Categories DataFrame


In [29]:
def total_number(df):
    """
    Calculate the total number of rows in a DataFrame.

    This can be used to count the total number of images or annotations,
    depending on which DataFrame we pass.

    Args:
        df (pd.DataFrame): The DataFrame to count rows from.

    Returns:
        int: Total number of rows in the DataFrame.
    """
    total = len(df)
    return total

num_images = total_number(df_images)
print(f"Total number of images: {num_images}")

num_annotations = total_number(df_annotations)
print(f"Total number of annotations: {num_annotations}")


Total number of images: 500
Total number of annotations: 959


In [30]:
def categories_names(df):
    """
    Return a list of category names from a categories DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the 'name' column for categories.

    Returns:
        list: List of category names.
    """
    category_name = df['name'].tolist()
    return category_name

category_list = categories_names(df_categories)
print("Category names:")
print(category_list)


Category names:
['wildfire', 'fire']


In [31]:
def images_per_category(df_annotations, df_categories):
    """
    Calculate the number of unique images per category.

    Args:
        df_annotations (pd.DataFrame): DataFrame containing 'image_id' and 'category_id'.
        df_categories (pd.DataFrame): DataFrame containing 'id' and 'name' for categories.

    Returns:
        pd.Series: Number of unique images per category.
    """
    # Merge annotations with categories
    merged = df_annotations.merge(df_categories, left_on='category_id', right_on='id')
    #display(merged)
    # Drop duplicate image-category pairs
    unique_images = merged[['image_id', 'name']].drop_duplicates()
    
    # Count images per category
    count_per_category = unique_images.groupby('name').size()
    return count_per_category

# Example usage
images_count = images_per_category(df_annotations, df_categories)
print(images_count)


name
fire    493
dtype: int64


In [32]:
def annotations_statistics(df_annotations):
    """
    Compute the number of annotations per image and provide summary statistics.

    Args:
        df_annotations (pd.DataFrame): DataFrame containing annotations with a column 'image_id'.

    Returns:
        
            - annotations_per_image (pd.Series): Number of annotations for each image.
            - stats (pd.Series): Summary statistics of annotations per image (count, mean, min, max, quartiles).
    """
    # Count the number of annotations per image
    annotations_per_image = df_annotations.groupby('image_id').size()

    # Compute summary statistics
    stats = annotations_per_image.describe()

    return annotations_per_image, stats


# Example usage
annotations_per_image, stats = annotations_statistics(df_annotations)

print("Number of annotations per image (preview):")
print(annotations_per_image.head())

print("\nSummary statistics of annotations per image:")
print(stats)


Number of annotations per image (preview):
image_id
1    1
2    4
3    1
4    1
5    2
dtype: int64

Summary statistics of annotations per image:
count    493.000000
mean       1.945233
std        1.405218
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       10.000000
dtype: float64


##                          2-b Vérification des incohérences dans les données 


In [33]:
def cntr(folder_path):
    """
    Count file extensions in a folder.

    Args:
        folder_path (str): Path to the folder.

    Returns:
        Counter: Counts of each file extension in the folder.
    """
    # p = Path(folder_path)
    # exts_list = []

    # # Loop over each file in the folder
    
    # for f in p.iterdir():
    #     if f.is_file():  # Only process files
    #         ext = f.suffix.lower().lstrip('.')  # Get extension without dot
    #         exts_list.append(ext)

    # # Count how many times each extension appears
    # return Counter(exts_list)
    
    p = Path(folder_path)
    exts = [f.suffix.lower().lstrip('.') for f in p.iterdir() if f.is_file()]
    return Counter(exts)

exts = cntr("../dataset")   
print(exts)

Counter({'': 1})


In [10]:
# p = Path("../dataset")
# n_names = []
# for n in p.iterdir(): 
#     if n.is_file():
#         n_names.append(n.name)
#     else:
#         # i can delete this part to make the code more flixible 
#         raise FileNotFoundError(f" file {n} is not exist")        
# # display(n_names)
# img_names = df_images['file_name'].to_list()
# # print("@" * 50)
# # display(img_names)

# folder_images = set(n_names)
# names_list = set(img_names)

# missing_image_folder = names_list - folder_images
# print(missing_image_folder)

# missing_in_json = folder_images - names_list
# print(missing_in_json)

In [34]:
def verify_images(df: pd.DataFrame, column_name: str, folder_path: str):
    """
    Verify consistency between image names in a DataFrame and actual images in a folder.
    """
    p = Path(folder_path)

    # Valid image extensions
    valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

    # Collect only image file names in the folder
    n_names = [n.name for n in p.iterdir() if n.is_file() and n.suffix.lower() in valid_exts]

    # Get image names from DataFrame column
    img_names = df[column_name].to_list()

    # Convert both to sets
    set_folder = set(n_names)
    set_df = set(img_names)

    # Compare
    missing_in_folder = set_df - set_folder
    missing_in_df = set_folder - set_df

    return missing_in_folder, missing_in_df


# Example usage
missing_in_folder, missing_in_df = verify_images(df_images, "file_name", "../dataset/data")

print("Images in DataFrame but not in folder:", missing_in_folder)
print("Images in folder but not in DataFrame:", missing_in_df)


Images in DataFrame but not in folder: {'cl6e2kygp002egk55asrs2brz_1_TRUE_COLOR_jpg.rf.2a198a37785242fb6db3ef21c0ffdcad.jpg', 'cl6b5x63r005il4551chxdt93_2_FALSE_COLOR_jpg.rf.8c4fb06b6812031edb1ad781ec14d2b9.jpg', 'cl6kfx47x001tc45578ts0yz6_2_FALSE_COLOR_jpg.rf.5a9501946306fabdad807a6acb61ccbe.jpg', 'cl6kf5xzo000gc4552qc8hhcc_2_FALSE_COLOR_jpg.rf.8fbfb16c6d86076d860904397218acd5.jpg', 'cl6e3enfn003sgk554uim9wo6_1_TRUE_COLOR_jpg.rf.beaf169c01f4596cb24c1c4628ef21f1.jpg', 'cl6b5myi60048l45530d5anq4_1_TRUE_COLOR_jpg.rf.1b9e229b9d66a61c1d9a410fd8ad6f23.jpg', 'cl6e1qges001kgk555z158f33_2_FALSE_COLOR_jpg.rf.479904c9e54c6ba121689341598bf3ed.jpg'}
Images in folder but not in DataFrame: set()


In [35]:
def get_images_without_annotations(images_folder, annotations_file):
    """
    Return a list of images that have no annotations.
    """
    # Load COCO file
    with open(annotations_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Get all image IDs that have at least one annotation
    annotated_image_ids = {ann["image_id"] for ann in data["annotations"]}

    # Map image IDs to file names
    id_to_name = {img["id"]: img["file_name"] for img in data["images"]}
    annotated_images = {id_to_name[iid] for iid in annotated_image_ids}

    # Valid image extensions
    valid_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

    # Get all images in the folder
    folder = Path(images_folder)
    all_images = {f.name for f in folder.iterdir() if f.is_file() and f.suffix.lower() in valid_exts}

    # Find images without annotations
    images_without_ann = all_images - annotated_images

    return list(images_without_ann)

# Example usage
result = get_images_without_annotations("../dataset", "../dataset/data/_annotations.coco.json")

if result:
    display("Images without annotations are:", list(result))
else:
    print("There are no images without annotations.")


There are no images without annotations.


In [36]:
# Créer une liste d’id unique dans les annotations (pandas unique)
def unique_ids(df):
    
    unique_ids = df_annotations['image_id'].unique()

    unique_ids_list = unique_ids.tolist()
    return unique_ids_list
unique_ids_list = unique_ids(df_annotations)

print("Unique IDs:", unique_ids_list)


Unique IDs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,

In [37]:
images_without_annotations = df_annotations[~df_annotations['image_id'].isin(unique_ids_list)]
print(images_without_annotations)

Empty DataFrame
Columns: [id, image_id, category_id, bbox, area, segmentation, iscrowd]
Index: []


In [38]:
from PIL import Image
img = Image.open("../dataset/data/cl6az56oy0003f05554mh92wy_2_FALSE_COLOR_jpg.rf.80849284d2ee075dcfd0d457f4c20b7a.jpg")
#img.show()
print("Width, Height:", img.size)

Width, Height: (1200, 860)


In [39]:
# Écrire une fonction qui détecte les valeurs aberrantes pour les annotations (ex : hauteur == 0 et largeur != 0, etc)
df_annotations['bbox']

0      [277, 207, 568.95, 591.79]
1        [800, 0, 157.45, 413.38]
2        [1062, 96, 58.37, 63.91]
3       [452, 442, 410.09, 350.6]
4        [0, 163, 336.16, 375.08]
                  ...            
954      [80, 30, 416.05, 183.75]
955      [0, 345, 140.33, 146.25]
956       [389, 361, 333, 340.52]
957     [510, 270, 222.6, 245.95]
958     [645, 519, 87.51, 126.38]
Name: bbox, Length: 959, dtype: object

In [40]:

def detect_bbox_outliers(df: pd.DataFrame):
    """
    Detect outliers in the 'bbox' column:
    - width or height = 0
    - width or height < 0
    Returns a DataFrame of the outlier rows.
    """
    widths = df['bbox'].apply(lambda x: x[2])
    heights = df['bbox'].apply(lambda x: x[3])

    condition = (widths <= 0) | (heights <= 0)

    outliers = df[condition]

    return outliers
outlier_annotations = detect_bbox_outliers(df_annotations)

print("Outlier annotations:")
print(outlier_annotations)


Outlier annotations:
Empty DataFrame
Columns: [id, image_id, category_id, bbox, area, segmentation, iscrowd]
Index: []


In [None]:
# def delete_images(images_folder_path, images_list):
#     """
#     Delete images from a folder based on a list of image file names.

#     Parameters:
#     images_folder_path (str or Path): Path to the folder containing images
#     images_list (list): List of image file names to delete
#     """
#     # Ensure we have a Path object
#     images_folder = Path(images_folder_path)

#     # Loop through each image name
#     for img_name in images_list:
#         img_path = images_folder / img_name  # Full path to the image
#         if img_path.exists():               # Check if the file exists
#             img_path.unlink()               # Delete the file
#             print(f"Deleted {img_name}")
#         else:
#             print(f"{img_name} not found")
# delete_pics = delete_images("../dataset/data", result)


In [None]:
# def update_coco_file(coco_file_path, deleted_images):
#     """
#     Remove entries of deleted images and their annotations from a COCO JSON file.
    
#     Parameters:
#     coco_file_path (str): Path to the COCO JSON file
#     deleted_images (list): List of deleted image file names
#     """
#     # Load the COCO JSON file
#     with open(coco_file_path, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     # Find IDs of deleted images
#     deleted_ids = {img["id"] for img in data["images"] if img["file_name"] in deleted_images}

#     # Remove deleted images from "images"
#     data["images"] = [img for img in data["images"] if img["id"] not in deleted_ids]

#     # Remove annotations for deleted images
#     data["annotations"] = [ann for ann in data["annotations"] if ann["image_id"] not in deleted_ids]

#     # Save the updated JSON
#     with open(coco_file_path, "w", encoding="utf-8") as f:
#         json.dump(data, f, ensure_ascii=False, indent=4)

#     print("COCO JSON updated successfully!")


In [None]:
# # Delete the images
# delete_images("../dataset", result)

# # Update the COCO JSON using the same list
# update_coco_file("../dataset/data/_annotations.coco.json", result)


COCO JSON updated successfully!


In [49]:
def clean_coco(coco_file, images_dir):
    images_dir = Path(images_dir)

    with open(coco_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    existing_images = {f.name for f in images_dir.iterdir() if f.is_file()}

    coco_images = {img["file_name"] for img in data["images"]}

    missing = coco_images - existing_images
    print("Images missing from folder:", missing)

    valid_ids = {img["id"] for img in data["images"] if img["file_name"] in existing_images}
    data["images"] = [img for img in data["images"] if img["id"] in valid_ids]
    data["annotations"] = [ann for ann in data["annotations"] if ann["image_id"] in valid_ids]

    with open(coco_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print("COCO file cleaned successfully!")

clean_coco("../dataset/data/_annotations.coco.json", "../dataset/data")


Images missing from folder: set()
COCO file cleaned successfully!


In [None]:
import fiftyone as fo
import fiftyone.zoo as foz


dataset = fo.Dataset.from_dir(
    dataset_dir="../dataset",
    dataset_type=fo.types.COCODetectionDataset,
    labels_path="../dataset/data/_annotations.coco.json"
)
session = fo.launch_app(dataset)


 100% |█████████████████| 493/493 [373.9ms elapsed, 0s remaining, 1.3K samples/s]      



Welcome to

███████╗██╗███████╗████████╗██╗   ██╗ ██████╗ ███╗   ██╗███████╗
██╔════╝██║██╔════╝╚══██╔══╝╚██╗ ██╔╝██╔═══██╗████╗  ██║██╔════╝
█████╗  ██║█████╗     ██║    ╚████╔╝ ██║   ██║██╔██╗ ██║█████╗
██╔══╝  ██║██╔══╝     ██║     ╚██╔╝  ██║   ██║██║╚██╗██║██╔══╝
██║     ██║██║        ██║      ██║   ╚██████╔╝██║ ╚████║███████╗
╚═╝     ╚═╝╚═╝        ╚═╝      ╚═╝    ╚═════╝ ╚═╝  ╚═══╝╚══════╝ v1.8.0

If you're finding FiftyOne helpful, here's how you can get involved:

|
|  ⭐⭐⭐ Give the project a star on GitHub ⭐⭐⭐
|  https://github.com/voxel51/fiftyone
|
|  🚀🚀🚀 Join the FiftyOne Discord community 🚀🚀🚀
|  https://community.voxel51.com/
|

