In [28]:
# Import analysis libraries and open the SQLite database holding the detection results
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

# Open the detection-results SQLite database. Both the connection (`conn`) and
# the cursor (`c`) are reused by later cells.
db_path = '/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db'
conn = sqlite3.connect(db_path)
c = conn.cursor()

## Add additional information to database

## Statistics

In [52]:
# Sum total_frames over every row of VideoStatistics (no age-group filter here)
frame_sum_row = c.execute('SELECT SUM(total_frames) FROM VideoStatistics').fetchone()
total_frames = frame_sum_row[0]
total_frames

19023571

In [49]:
# Save frames_with_detections and video_birthdays to two separate CSV files
# (the row index is not written).
# NOTE(review): neither object is defined in the visible part of this notebook —
# confirm the cell that creates them still exists, otherwise this cell fails on a fresh kernel.
frames_with_detections.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/frames_with_detections.csv', index=False)
video_birthdays.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/video_birthdays.csv', index=False)

## Plotting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Load the per-row detection statistics from VideoStatistics. Every *_count column is
# cast to FLOAT and divided by that row's total_frames, giving detections per frame.
# Rows without an age_group are excluded.
# NOTE(review): rows with total_frames = 0 are not filtered out — check how they should be handled.
# NOTE(review): presumably one row per video_id — confirm against the table schema.
raw_data_query = """
SELECT 
    age_group,
    video_id,
    CAST(child_count AS FLOAT) / total_frames as norm_child_count,
    CAST(adult_count AS FLOAT) / total_frames as norm_adult_count,
    CAST(child_face_count AS FLOAT) / total_frames as norm_child_face_count,
    CAST(adult_face_count AS FLOAT) / total_frames as norm_adult_face_count,
    CAST(toy_count AS FLOAT) / total_frames as norm_toy_count,
    CAST(book_count AS FLOAT) / total_frames as norm_book_count,
    CAST(kitchenware_count AS FLOAT) / total_frames as norm_kitchenware_count,
    CAST(screen_count AS FLOAT) / total_frames as norm_screen_count,
    CAST(other_object_count AS FLOAT) / total_frames as norm_other_object_count
FROM VideoStatistics
WHERE age_group IS NOT NULL;
"""

# Run the query on the open connection and keep the result as a DataFrame
raw_data = pd.read_sql(raw_data_query, conn)

# Human-readable label for each normalised count column. The insertion order of
# this mapping is also the left-to-right order of the categories in the bar plot.
class_display_names = dict([
    ('norm_child_count', 'Child'),
    ('norm_adult_count', 'Adult'),
    ('norm_child_face_count', 'Child Face'),
    ('norm_adult_face_count', 'Adult Face'),
    ('norm_toy_count', 'Toy'),
    ('norm_book_count', 'Book'),
    ('norm_kitchenware_count', 'Kitchenware'),
    ('norm_screen_count', 'Screen'),
    ('norm_other_object_count', 'Other Objects'),
])

# Column names to analyse, in the same order as the mapping above
classes = list(class_display_names)

# Omnibus test per class: Kruskal-Wallis across the age groups on the
# per-frame normalised counts. Classes with p < 0.05 go on to pairwise testing.
rows_by_age_group = raw_data.groupby('age_group')

test_results = []
for class_name in classes:
    # One array of values per age group for the current class
    samples = [frame[class_name].values for _, frame in rows_by_age_group]
    h_stat, p_val = stats.kruskal(*samples)

    test_results.append({
        'class_name': class_name,
        'display_name': class_display_names[class_name],
        'h_statistic': h_stat,
        'p_value': p_val
    })

# Column names (not display names) of the classes that passed the omnibus test
significant_classes = [
    result['class_name'] for result in test_results if result['p_value'] < 0.05
]

significance_results = pd.DataFrame(test_results)

# Pairwise follow-up tests for one class (raw p-values; correction happens later)
def perform_pairwise_tests(data, class_name, display_name):
    """Run two-sided Mann-Whitney U tests between every pair of age groups.

    Args:
        data: DataFrame with an 'age_group' column and a column named `class_name`.
        class_name: Name of the column whose values are compared.
        display_name: Label stored under 'Class' in every result record.

    Returns:
        A list of dicts with the keys 'Class', 'Group 1', 'Group 2',
        'Statistic' and 'p-value', one per unordered pair of age groups.
        Pairs are enumerated over the sorted age groups, so 'Group 1' always
        sorts before 'Group 2'. The p-values are uncorrected.
    """
    # Split the column once per age group, keyed in ascending group order
    samples_by_age = {
        age: data.loc[data['age_group'] == age, class_name]
        for age in sorted(data['age_group'].unique())
    }
    ordered_ages = list(samples_by_age)

    comparisons = []
    for position, first_age in enumerate(ordered_ages):
        for second_age in ordered_ages[position + 1:]:
            u_stat, p_value = stats.mannwhitneyu(
                samples_by_age[first_age],
                samples_by_age[second_age],
                alternative='two-sided',
            )
            comparisons.append({
                'Class': display_name,
                'Group 1': first_age,
                'Group 2': second_age,
                'Statistic': u_stat,
                'p-value': p_value
            })

    return comparisons

# Collect the raw pairwise results for every class that passed the omnibus test
all_pairwise_results = []
for class_name in significant_classes:
    display_name = class_display_names[class_name]
    results = perform_pairwise_tests(raw_data, class_name, display_name)
    all_pairwise_results.extend(results)

# Name the columns explicitly so the frame has the expected schema even when no
# class was significant (an empty record list would otherwise give a frame with
# no columns and the 'p-value' lookup below would raise KeyError).
pairwise_results_df = pd.DataFrame(
    all_pairwise_results,
    columns=['Class', 'Group 1', 'Group 2', 'Statistic', 'p-value']
)

# Apply Bonferroni correction jointly over all pairwise tests of all classes
if pairwise_results_df.empty:
    # Nothing to correct; keep the column so downstream cells still work
    pairwise_results_df['p_adjusted'] = pd.Series(dtype=float)
else:
    pairwise_results_df['p_adjusted'] = multipletests(
        pairwise_results_df['p-value'], method='bonferroni'
    )[1]

# Add significance labels; the first matching threshold wins, everything else is 'ns'
adjusted_p = pairwise_results_df['p_adjusted']
pairwise_results_df['Significance'] = np.select(
    [adjusted_p < 0.001, adjusted_p < 0.01, adjusted_p < 0.05],
    ['***', '**', '*'],
    default='ns'
)

# Long format for seaborn: one row per (input row, class) with the normalised count
normalized_results_melted = raw_data.melt(
    id_vars=['age_group'],
    value_vars=classes,
    var_name='class',
    value_name='normalized_count',
)
normalized_results_melted['display_name'] = (
    normalized_results_melted['class'].map(class_display_names)
)

# Comparisons that are still significant after correction, and a lookup
# (display name, group 1, group 2) -> significance stars used when drawing brackets
sig_pairwise = pairwise_results_df.loc[pairwise_results_df['Significance'] != 'ns']
annotation_dict = {
    (row['Class'], row['Group 1'], row['Group 2']): row['Significance']
    for _, row in sig_pairwise.iterrows()
}

# Bar colour for each age group; the keys are matched against the age_group values
age_group_colors = {
    3: '#8D8E3D',  # leuphana green
    4: '#691633',  # leuphana red
    # NOTE(review): originally labelled "leuphana blue", but R, G and B are almost
    # equal in this hex value (a light grey) — confirm the intended colour.
    5: '#B3B6B0',
}

# Grouped bar chart: one category per class, one bar per age group
fig, ax = plt.subplots(figsize=(18, 10))
sns.barplot(
    data=normalized_results_melted,
    x='display_name',
    y='normalized_count',
    hue='age_group',
    order=list(class_display_names.values()),
    palette=age_group_colors,
    ax=ax,
)

# Fixed y-axis range of 0 to 0.055 so the significance brackets have room above the bars
y_max_fixed = 0.055
ax.set_ylim(bottom=0, top=y_max_fixed)

# Vertical distance between two stacked brackets of the same class (y-axis units)
bracket_step = 0.0015

# Hand-tuned y position of the lowest bracket for each class, keyed by display name
class_base_heights = {
    'Child': 0.045,
    'Adult': 0.04,
    'Child Face': 0.012,
    'Adult Face': 0.04,
    'Toy': 0.013,
    'Book': 0.008,
    'Kitchenware': 0.003,
    'Screen': 0.04,
    'Other Objects': 0.04,
}

# Mutable copy: the next free bracket height per class, raised as brackets are drawn
class_heights = dict(class_base_heights)

# Age groups in ascending order. This is the same ordering perform_pairwise_tests
# uses for its ('Group 1', 'Group 2') pairs, so the annotation_dict keys line up.
# It has to be defined here: the variable of the same name inside
# perform_pairwise_tests is local to that function, so without this line the
# loop below raises NameError on a fresh kernel.
age_groups = sorted(raw_data['age_group'].unique())
n_groups = len(age_groups)

# Get x positions for each category; 0.8 is the combined width assumed for one
# group of hue bars (seaborn's default `width` — adjust if barplot is called with another)
x_positions = np.arange(len(class_display_names))
bar_width = 0.8 / n_groups

# Loop through significant comparisons and draw brackets
for i, class_name in enumerate(class_display_names.values()):
    for j in range(n_groups):
        for k in range(j + 1, n_groups):
            group1 = age_groups[j]
            group2 = age_groups[k]

            significance = annotation_dict.get((class_name, group1, group2))
            if significance is None:
                # Pair not significant for this class: nothing to draw
                continue

            # Centre of the j-th / k-th hue bar within category i
            x_pos1 = x_positions[i] + (j - n_groups / 2 + 0.5) * bar_width
            x_pos2 = x_positions[i] + (k - n_groups / 2 + 0.5) * bar_width

            # Take the next free height for this class and reserve the slot above it
            y_pos = class_heights[class_name]
            class_heights[class_name] += bracket_step

            # Draw the bracket: up, across, down
            ax.plot([x_pos1, x_pos1, x_pos2, x_pos2],
                    [y_pos, y_pos + bracket_step * 0.3, y_pos + bracket_step * 0.3, y_pos],
                    color='black', linewidth=1)

            # Add the significance label centred over the bracket
            ax.text((x_pos1 + x_pos2) / 2, y_pos + bracket_step * 0.2, significance,
                    ha='center', va='bottom')

# Dataset size reported in the title: distinct video ids in the plotted data and
# the summed frame count of all rows that have an age group.
total_videos = len(raw_data['video_id'].unique())
total_frames_query = """
SELECT SUM(total_frames) as total_frames
FROM VideoStatistics
WHERE age_group IS NOT NULL;
"""
total_frames = pd.read_sql(total_frames_query, conn)['total_frames'].iloc[0]

plot_title = (
    f'Comparison of YOLO11x Object Detection Classes Across Age Groups\n(Normalized Counts per Frame)\n'
    f'Applied on Quantex Dataset: {total_videos:,} Videos, {total_frames:,} Frames'
)
plt.title(plot_title)
plt.xlabel('')  # the category names on the ticks are self-explanatory
plt.ylabel('Normalized Counts (Objects per Frame)')
plt.xticks(rotation=45)
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

# Save the figure to disk, then render it in the notebook
output_png = '/home/nele_pauline_suffo/outputs/detection_pipeline_results/quantex_normalized_counts.png'
plt.savefig(output_png)

plt.show()

# Omnibus results, restricted to classes with a Kruskal-Wallis p-value below 0.05
print("## Significant Kruskal-Wallis Results (p < 0.05) ##")
kruskal_columns = ['display_name', 'h_statistic', 'p_value']
kruskal_significant = significance_results.loc[
    significance_results['p_value'] < 0.05, kruskal_columns
]
display(kruskal_significant.round(4))

print("\nRaw Pairwise Comparisons:")
# From here on pandas renders floats with four decimals (this is a global option);
# p-values smaller than 0.00005 therefore appear as 0.0000.
pd.set_option('display.float_format', '{:.4f}'.format)
display(pairwise_results_df[['Class', 'Group 1', 'Group 2', 'p-value']])

print("\n## Significant Pairwise Comparisons (Bonferroni-corrected) ##")
significant_columns = ['Class', 'Group 1', 'Group 2',
                       'Statistic', 'p_adjusted', 'Significance']
display(sig_pairwise[significant_columns].round(4))


In [None]:
# Close the SQLite connection opened in the first cell; cells that query
# through `conn` must be run before this one.
conn.close()