In [22]:
import pandas as pd
import numpy as np
from PIL import Image
import os
import ast
from analyze_masks import analyze_masks_and_list_exceptions

def categorize_data(data, column):
    """Label each row "small", "medium" or "large" by tercile of ``column``.

    The 33rd and 66th percentiles of ``data[column]`` are the cut points:
    values at or below the 33rd percentile are "small", values above it up to
    and including the 66th percentile are "medium", and values above the 66th
    percentile are "large". Rows that match none of these conditions (for
    example a missing/NaN value) are labelled "unknown".

    Args:
        data: DataFrame containing ``column``. It is modified in place: a
            ``f"{column}_category"`` column is added or overwritten.
        column: Name of a numeric column in ``data``.

    Returns:
        The same ``data`` object that was passed in.
    """
    values = data[column]

    # Tercile cut points.
    q33 = values.quantile(0.33)
    q66 = values.quantile(0.66)

    conditions = [
        values <= q33,
        (values > q33) & (values <= q66),
        values > q66,
    ]
    category_labels = ["small", "medium", "large"]

    # np.select's implicit default is the integer 0, which has no common dtype
    # with string choices: recent NumPy raises TypeError and older releases
    # silently emit the string "0". An explicit string default avoids both.
    data[f'{column}_category'] = np.select(
        conditions, category_labels, default="unknown"
    )
    return data

In [23]:
def load_and_process_data(file_path, base_dir, masks_dir):
    """Load a confidence-score CSV and add image-size and object-coverage columns.

    The CSV must contain ``id`` and ``picture_name`` columns. Every column
    whose name contains ``'confidence'`` is parsed from its string form with
    ``ast.literal_eval``. For each row the class id is the part of ``id``
    before the first underscore, and the function:

    * opens ``<base_dir>/train/<class_id>/<picture_name>`` and stores its
      ``width`` and ``height``;
    * opens ``<masks_dir>/<class_id>/<picture base name>.png`` and stores in
      ``object_percentage`` the share (0-100) of pixels equal to the value
      that ``analyze_masks_and_list_exceptions(masks_dir)`` maps to the class.

    Missing image or mask files are reported with ``print`` and leave the
    affected columns at their defaults (0 / 0.0). Finally an
    ``object_percentage_category`` column is added via ``categorize_data``.

    Args:
        file_path: Path of the CSV file to read.
        base_dir: Directory that contains the ``train`` image tree.
        masks_dir: Directory that contains one sub-directory of masks per class.

    Returns:
        The processed DataFrame.
    """
    data = pd.read_csv(file_path)
    confidence_columns = [col for col in data.columns if 'confidence' in col]
    for column in confidence_columns:
        data[column] = data[column].apply(ast.literal_eval)

    data['width'] = 0
    data['height'] = 0
    data['object_percentage'] = 0.0

    class_to_grayscale_map = analyze_masks_and_list_exceptions(masks_dir)

    for index, row in data.iterrows():
        class_id = row['id'].split('_')[0]
        picture_name = row['picture_name']
        picture_base_name = os.path.splitext(picture_name)[0]

        # Reset for every row so that a missing image can never reuse the
        # dimensions of the previous row (or hit an undefined name on the
        # first row) when the mask is processed below.
        total_pixels = None

        img_path = os.path.join(base_dir, 'train', class_id, picture_name)
        try:
            with Image.open(img_path) as img:
                width, height = img.size
        except FileNotFoundError:
            print(f"Image not found: {img_path}")
        else:
            data.at[index, 'width'] = width
            data.at[index, 'height'] = height
            total_pixels = width * height

        mask_name = picture_base_name + '.png'
        mask_path = os.path.join(masks_dir, class_id, mask_name)
        try:
            with Image.open(mask_path) as mask:
                mask_width, mask_height = mask.size
                mask_array = np.array(mask)
        except FileNotFoundError:
            print(f"Mask not found for image: {mask_path}")
            continue

        if class_id not in class_to_grayscale_map:
            continue

        if total_pixels is None:
            # Image unavailable: measure coverage against the mask's own size.
            total_pixels = mask_width * mask_height
        if total_pixels == 0:
            # Degenerate size; keep the 0.0 default rather than divide by zero.
            continue

        relevant_value = class_to_grayscale_map[class_id]
        object_pixels = int(np.count_nonzero(mask_array == relevant_value))
        data.at[index, 'object_percentage'] = (object_pixels / total_pixels) * 100

    data = categorize_data(data, 'object_percentage')

    return data

In [24]:
def parse_synset_mapping(filepath):
    """Parse a synset mapping file into a dictionary keyed by class id.

    Each non-blank line is split at its first space into a class id and a
    description. Blank lines are skipped, and a line that holds only a class
    id gets an empty description instead of raising ``ValueError``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        dict mapping ``class_id`` to
        ``{"description": <text after the first space>, "value": <index>}``,
        where the index is the zero-based position of the entry among the
        parsed (non-blank) lines.
    """
    class_dict = {}
    class_index = 0
    with open(filepath, 'r') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                # Tolerate empty lines (e.g. a trailing one) instead of crashing.
                continue
            # partition never raises, unlike unpacking split(' ', 1).
            class_id, _, description = entry.partition(' ')
            class_dict[class_id] = {
                "description": description,
                "value": class_index
            }
            class_index += 1
    return class_dict

In [25]:
# Input locations; all are relative paths, so they resolve against the
# notebook's current working directory.
base_dir = '../data'
masks_dir = '../data/masks'
file_path_resnet = '../image_confidence_scores_resnet.csv'
file_path_convnext = '../image_confidence_scores_convnext.csv'
synset_path = '../data/LOC_synset_mapping.txt'

# Build one processed frame per confidence-score CSV; both calls read from the
# same image and mask directories.
data_resnet = load_and_process_data(file_path_resnet, base_dir, masks_dir)
data_convnext = load_and_process_data(file_path_convnext, base_dir, masks_dir)
# Lookup of class id -> {"description", "value"} parsed from the synset file.
class_dict = parse_synset_mapping(synset_path)

# Number of rows per 'object_percentage_category' value, ResNet frame only.
category_distribution_resnet = data_resnet['object_percentage_category'].value_counts()

Class: n02412080, Most Common Nonzero Grayscale Value (by presence): 17, Presence Count: 982
    Images without the most common grayscale value (17): ['n02412080_13145.png', 'n02412080_1976.png', 'n02412080_6399.png', 'n02412080_16811.png', 'n02412080_2188.png', 'n02412080_19324.png', 'n02412080_18733.png', 'n02412080_2270.png', 'n02412080_16830.png', 'n02412080_16254.png', 'n02412080_26458.png', 'n02412080_3944.png', 'n02412080_1040.png', 'n02412080_10804.png', 'n02412080_13818.png', 'n02412080_11852.png', 'n02412080_791.png', 'n02412080_17063.png']
Class: n02107574, Most Common Nonzero Grayscale Value (by presence): 12, Presence Count: 997
    Images without the most common grayscale value (12): ['n02107574_3660.png', 'n02107574_142.png', 'n02107574_690.png']
Class: n01833805, Most Common Nonzero Grayscale Value (by presence): 3, Presence Count: 987
    Images without the most common grayscale value (3): ['n01833805_4117.png', 'n01833805_166.png', 'n01833805_8510.png', 'n01833805_885

In [None]:
import pickle
def save_df(df, path='processed_image_data.pkl'):
    """Serialize ``df`` to a pickle file.

    Args:
        df: Object to persist (a DataFrame in this notebook).
        path: Destination file. Defaults to ``'processed_image_data.pkl'`` in
            the current working directory, the location this function has
            always written to. An existing file is overwritten.
    """
    with open(path, 'wb') as f:
        pickle.dump(df, f)

def load_df(path='processed_image_data.pkl'):
    """Load and return the object stored in a pickle file.

    Args:
        path: File to read. Defaults to ``'processed_image_data.pkl'``, the
            file name ``save_df`` writes to.

    Returns:
        The unpickled object (a DataFrame when the file came from ``save_df``).

    Note:
        Unpickling can execute arbitrary code. Only load files that this
        project produced or that come from a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)
