In [None]:
# import needed libraries 
import json
import pandas as pd

In [None]:
def json_loader(path='../dataset/_annotations.coco.json'):
    """
    Load a COCO-style JSON file and convert it into a Python dictionary.

    Args:
        path (str or os.PathLike, optional): Location of the annotation file.
            Defaults to '../dataset/_annotations.coco.json', so existing
            calls without arguments keep reading the same file.

    Returns:
        dict: The content of the JSON file as a Python dictionary
            (whatever top-level value the file holds; an object for COCO files).

    Raises:
        FileNotFoundError: If no file exists at `path`.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding (which is not UTF-8 everywhere).
    with open(path, 'r', encoding='utf-8') as f:
        # Parse the JSON file and hand the resulting object straight back
        return json.load(f)

# Load the annotation file once; later cells reuse the result as `new_data`
new_data = json_loader()

# Print the top-level keys of the JSON dictionary to see which sections it provides
print(new_data.keys())

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])


In [None]:
def json_df(json_data):
    """
    Build one Pandas DataFrame per tabular section of COCO-style JSON data.

    Args:
        json_data (dict): JSON data loaded from a COCO-style annotation file.
            It must contain the 'categories', 'images' and 'annotations' keys;
            a missing key raises KeyError.

    Returns:
        tuple: Three DataFrames, in this order:
            - df_categories: built from json_data['categories']
            - df_images: built from json_data['images']
            - df_annotations: built from json_data['annotations']
    """
    # Of the top-level keys ['info', 'licenses', 'categories', 'images', 'annotations'],
    # only these three sections are turned into DataFrames.
    section_keys = ('categories', 'images', 'annotations')

    # The tuple order follows section_keys, which is the documented return order
    return tuple(pd.DataFrame(json_data[key]) for key in section_keys)

# Build the three DataFrames from the loaded JSON data
df_categories, df_images, df_annotations = json_df(new_data)

# Preview each DataFrame under its own heading
for heading, frame in (
    ("Categories DataFrame:", df_categories),
    ("\nImages DataFrame:", df_images),
    ("\nAnnotations DataFrame:", df_annotations),
):
    print(heading)
    display(frame)

    

Categories DataFrame:


Unnamed: 0,id,name,supercategory
0,0,wildfire,none
1,1,fire,wildfire



Images DataFrame:


Unnamed: 0,id,license,file_name,height,width,date_captured,extra
0,0,1,cl6e1qges001kgk555z158f33_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6e1qges001kgk555z158f33_2_FALSE_CO...
1,1,1,cl6kgm9qr002yc455g5qs87kz_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6kgm9qr002yc455g5qs87kz_2_FALSE_CO...
2,2,1,cl6odzs5e002oao55heuig0a6_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6odzs5e002oao55heuig0a6_1_TRUE_COL...
3,3,1,cl6b6pieh007rl455fwvb73gr_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b6pieh007rl455fwvb73gr_1_TRUE_COL...
4,4,1,cl6b5k38g003zl455bgtd1317_4_FALSE_COLOR__URBAN...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b5k38g003zl455bgtd1317_4_FALSE_CO...
...,...,...,...,...,...,...,...
495,495,1,cl6b5yjxa005nl45516k465kb_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b5yjxa005nl45516k465kb_1_TRUE_COL...
496,496,1,cl6opv5oo007xao553de9g7wd_2_FALSE_COLOR_jpg.rf...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6opv5oo007xao553de9g7wd_2_FALSE_CO...
497,497,1,cl6cteplg00hkl4551scr5fko_1_TRUE_COLOR_jpg.rf....,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6cteplg00hkl4551scr5fko_1_TRUE_COL...
498,498,1,cl6b652mi0068l4552keu9loj_6_SWIR_jpg.rf.113908...,860,1200,2025-09-14T12:06:19+00:00,{'name': 'cl6b652mi0068l4552keu9loj_6_SWIR.jpg'}



Annotations DataFrame:


Unnamed: 0,id,image_id,category_id,bbox,area,segmentation,iscrowd
0,0,1,1,"[277, 207, 568.95, 591.79]",336698.921,[],0
1,1,2,1,"[800, 0, 157.45, 413.38]",65086.681,[],0
2,2,2,1,"[1062, 96, 58.37, 63.91]",3730.427,[],0
3,3,2,1,"[452, 442, 410.09, 350.6]",143777.554,[],0
4,4,2,1,"[0, 163, 336.16, 375.08]",126086.893,[],0
...,...,...,...,...,...,...,...
954,954,496,1,"[80, 30, 416.05, 183.75]",76449.188,[],0
955,955,496,1,"[0, 345, 140.33, 146.25]",20523.263,[],0
956,956,497,1,"[389, 361, 333, 340.52]",113393.160,[],0
957,957,498,1,"[510, 270, 222.6, 245.95]",54748.470,[],0


## 2 - Exploration & nettoyage des données

In [18]:
def check_nan(df):
    """
    Report which columns of a DataFrame contain missing values (NaN).

    Args:
        df (pd.DataFrame): The DataFrame to inspect.

    Returns:
        pd.Series: NaN count per column, restricted to the columns that have
            at least one NaN (empty when there are none).
    """
    # Count NaN per column, then keep only the strictly positive counts
    return df.isna().sum().loc[lambda counts: counts > 0]

# Usage: despite the `clean_*` names, each variable holds the NaN-count Series
# returned by check_nan for that DataFrame (empty when nothing is missing).
clean_anno = check_nan(df_annotations)
clean_images = check_nan(df_images)
clean_categ = check_nan(df_categories)

# Report every DataFrame through one loop instead of three copy-pasted
# if/else blocks. The counts are only printed when there are any, for a
# cleaner output.
for label, nan_columns in (
    ("Annotations", clean_anno),
    ("Images", clean_images),
    ("Categories", clean_categ),
):
    if nan_columns.empty:
        print(f"No NaN values in {label} DataFrame")
    else:
        print(f"{label} NaN columns:")
        print(nan_columns)



No NaN values in Annotations DataFrame
No NaN values in Images DataFrame
No NaN values in Categories DataFrame


In [None]:
def total_number(df):
    """
    Return how many rows a DataFrame has.

    Passing the images DataFrame gives the number of images; passing the
    annotations DataFrame gives the number of annotations.

    Args:
        df (pd.DataFrame): The DataFrame whose rows are counted.

    Returns:
        int: Total number of rows in the DataFrame.
    """
    return len(df)

# Row counts of the images and annotations DataFrames
num_images = total_number(df_images)
num_annotations = total_number(df_annotations)

print(f"Total number of images: {num_images}")
print(f"Total number of annotations: {num_annotations}")


Total number of images: 500
Total number of annotations: 959


In [None]:
def categories_names(df):
    """
    Extract the category names from a categories DataFrame.

    Args:
        df (pd.DataFrame): DataFrame with a 'name' column holding the category names.

    Returns:
        list: The values of the 'name' column, in row order.
    """
    names_column = df['name']
    return names_column.tolist()

# Collect the category names and print them under a heading
category_list = categories_names(df_categories)
print("Category names:", category_list, sep="\n")


Category names:
['wildfire', 'fire']


In [19]:
def images_per_category(df_annotations, df_categories):
    """
    Count, for each category name, how many distinct images carry it.

    Args:
        df_annotations (pd.DataFrame): DataFrame containing 'image_id' and 'category_id'.
        df_categories (pd.DataFrame): DataFrame containing 'id' and 'name' for categories.

    Returns:
        pd.Series: Number of unique images per category, indexed by category name.
    """
    return (
        df_annotations
        # Attach the category name to every annotation
        .merge(df_categories, left_on='category_id', right_on='id')
        # An image counts once per category, however many annotations it has
        .loc[:, ['image_id', 'name']]
        .drop_duplicates()
        # One row per (image, category) pair remains, so group size = image count
        .groupby('name')
        .size()
    )

# Example usage: count the distinct images for each category name.
# A category that no annotation refers to is absent from the result, because
# the merge inside images_per_category keeps matched rows only.
images_count = images_per_category(df_annotations, df_categories)
print(images_count)


name
fire    493
dtype: int64


In [None]:
def annotations_statistics(df_annotations):
    """
    Count the annotations attached to each image and summarise those counts.

    Args:
        df_annotations (pd.DataFrame): DataFrame containing annotations with a column 'image_id'.

    Returns:
        tuple: A pair made of:
            - annotations_per_image (pd.Series): Number of annotations for each image,
              indexed by 'image_id'.
            - stats (pd.Series): Output of describe() on those counts
              (count, mean, std, min, quartiles, max).
    """
    # One group per image; the group size is that image's annotation count
    per_image_counts = df_annotations.groupby('image_id').size()

    # Return the counts together with their descriptive statistics
    return per_image_counts, per_image_counts.describe()


# Example usage: per-image annotation counts plus their summary statistics
annotations_per_image, stats = annotations_statistics(df_annotations)

# Each heading is followed by its value on the next line
print("Number of annotations per image (preview):", annotations_per_image.head(), sep="\n")
print("\nSummary statistics of annotations per image:", stats, sep="\n")


Number of annotations per image (preview):
image_id
1    1
2    4
3    1
4    1
5    2
dtype: int64

Summary statistics of annotations per image:
count    493.000000
mean       1.945233
std        1.405218
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       10.000000
dtype: float64
