In [1]:
import sqlite3
from pathlib import Path
import pandas as pd
import logging
import shutil
import cv2
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def get_balanced_face_samples():
    conn = sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')
    df = pd.read_sql_query("""
    WITH RankedFaces AS (
        SELECT 
            v.video_path,
            d.frame_number,
            d.confidence_score,
            d.proximity,
            d.object_class,
            d.x_min,
            d.y_min, 
            d.x_max,
            d.y_max,
            CAST(d.proximity * 10 AS INTEGER) as proximity_bin
        FROM Detections d
        JOIN Videos v ON d.video_id = v.video_id
        WHERE d.object_class IN (2, 3)
            AND d.proximity BETWEEN 0 AND 1
        ),
        SampledFaces AS (
        SELECT *,
            ROW_NUMBER() OVER (
            PARTITION BY 
                CASE WHEN object_class = 3 THEN 'adult' 
                     WHEN object_class = 2 THEN 'child' 
                END,
                proximity_bin
            ORDER BY RANDOM()
            ) as rn
        FROM RankedFaces
        )
        SELECT 
        '/home/nele_pauline_suffo/ProcessedData/quantex_videos_processed/' || video_path || '/' || 
        video_path || '_' || printf('%06d', frame_number) || '.jpg' as frame_file_name,
        confidence_score,
        proximity,
        proximity_bin,
        x_min,
        y_min,
        x_max,
        y_max,
        CASE WHEN object_class = 3 THEN 'adult' 
             WHEN object_class = 2 THEN 'child' 
        END as age_group,
        CAST(proximity_bin/10.0 AS TEXT) || '-' || CAST((proximity_bin + 1)/10.0 AS TEXT) as proximity_range
        FROM SampledFaces 
        WHERE rn <= 10
        ORDER BY age_group, proximity_bin, proximity;
    """, conn)
    
    # Save to CSV with bounding box information
    output_path = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames/proximity_samples.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
        
    conn.close()
    return df

def copy_face_samples(df):
    # First verify we have data
    if df.empty:
        logging.error("DataFrame is empty - no samples to copy")
        return
        
    logging.info(f"Processing {len(df)} samples")
    logging.info(f"Distribution:\n{df.groupby('age_group').size()}")
    
    # Define output directories
    base_dir = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames')
    adult_dir = base_dir / 'adult_faces'
    child_dir = base_dir / 'child_faces'
    
    # Create directories if they don't exist
    adult_dir.mkdir(parents=True, exist_ok=True)
    child_dir.mkdir(parents=True, exist_ok=True)
    
    # Copy files for each age group
    copied_count = {'adult': 0, 'child': 0}
    error_count = {'adult': 0, 'child': 0}
    
    # Define colors for bounding boxes (BGR format)
    colors = {
        'adult': (0, 255, 0),  # Green for adult
        'child': (0, 0, 255)   # Red for child
    }
    
    # Create random orders for each age group
    for age_group in ['adult', 'child']:
        age_df = df[df['age_group'] == age_group].copy()
        # Create random order indices
        random_indices = np.random.permutation(len(age_df))
        age_df['random_prefix'] = [f"{i:03d}" for i in random_indices]
        
        target_dir = adult_dir if age_group == 'adult' else child_dir
        
        for _, row in age_df.iterrows():
            src_path = Path(row['frame_file_name'])
            
            if not src_path.exists():
                logging.warning(f"Source file not found: {src_path}")
                error_count[age_group] += 1
                continue
                
            try:
                # Read the image
                img = cv2.imread(str(src_path))
                if img is None:
                    logging.error(f"Could not read image: {src_path}")
                    error_count[age_group] += 1
                    continue
                    
                # Draw bounding box
                x1, y1, x2, y2 = int(row['x_min']), int(row['y_min']), int(row['x_max']), int(row['y_max'])
                cv2.rectangle(img, (x1, y1), (x2, y2), colors[age_group], 2)
                
                # Create new filename with random prefix and proximity
                new_filename = f"{row['random_prefix']}_{src_path.name}"
                dst_path = target_dir / new_filename
                
                # Save the image with bounding box
                cv2.imwrite(str(dst_path), img)
                
                copied_count[age_group] += 1
                if copied_count[age_group] % 10 == 0:
                    logging.info(f"Copied {copied_count[age_group]} {age_group} face images")
                    
            except Exception as e:
                logging.error(f"Error processing {src_path}: {e}")
                error_count[age_group] += 1

    # Log final statistics
    for age_group in ['adult', 'child']:
        logging.info(f"{age_group.title()} faces - Copied: {copied_count[age_group]}, "
                    f"Errors: {error_count[age_group]}")
    
def create_empty_proximity_csv(df):
    base_dir = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames')
    
    # Create separate CSVs for adults and children based on actual files in folders
    for age_group in ['adult', 'child']:
        folder_path = base_dir / f'{age_group}_faces'
        
        if not folder_path.exists():
            logging.error(f"Folder not found: {folder_path}")
            continue
            
        # Get all jpg files from the folder
        frame_files = sorted(folder_path.glob('*.jpg'))
        
        # Create DataFrame with filenames and empty proximity column
        output_df = pd.DataFrame({
            'frame_file_name': [f.name for f in frame_files],
            'proximity': ''  # empty column for manual proximity values
        })
        
        # Save to CSV with age-specific filename
        output_path = base_dir / f'proximity_samples_fill_in_{age_group}.csv'
        output_df.to_csv(output_path, index=False)
        
        logging.info(f"Created {age_group} CSV with {len(output_df)} frames from {folder_path}")

In [2]:
df = get_balanced_face_samples()
df.head()

Unnamed: 0,frame_file_name,confidence_score,proximity,proximity_bin,x_min,y_min,x_max,y_max,age_group,proximity_range
0,/home/nele_pauline_suffo/ProcessedData/quantex...,0.497245,0.056211,0,59,351,75,371,adult,0.0-0.1
1,/home/nele_pauline_suffo/ProcessedData/quantex...,0.387124,0.057428,0,378,305,395,324,adult,0.0-0.1
2,/home/nele_pauline_suffo/ProcessedData/quantex...,0.515524,0.076558,0,628,276,645,298,adult,0.0-0.1
3,/home/nele_pauline_suffo/ProcessedData/quantex...,0.411968,0.084016,0,46,314,64,336,adult,0.0-0.1
4,/home/nele_pauline_suffo/ProcessedData/quantex...,0.631498,0.084016,0,545,418,563,440,adult,0.0-0.1


In [3]:
# copy face samples to output directory
copy_face_samples(df)

2025-03-26 23:36:34,798 - INFO - Processing 210 samples
2025-03-26 23:36:34,802 - INFO - Distribution:
age_group
adult    110
child    100
dtype: int64
2025-03-26 23:36:35,758 - INFO - Copied 10 adult face images
2025-03-26 23:36:40,207 - INFO - Copied 20 adult face images
2025-03-26 23:36:42,240 - INFO - Copied 30 adult face images
2025-03-26 23:36:43,809 - INFO - Copied 40 adult face images
2025-03-26 23:36:48,123 - INFO - Copied 50 adult face images
2025-03-26 23:36:51,329 - INFO - Copied 60 adult face images
2025-03-26 23:36:53,497 - INFO - Copied 70 adult face images
2025-03-26 23:36:54,555 - INFO - Copied 80 adult face images
2025-03-26 23:36:57,375 - INFO - Copied 90 adult face images
2025-03-26 23:36:58,438 - INFO - Copied 100 adult face images
2025-03-26 23:36:59,226 - INFO - Copied 110 adult face images
2025-03-26 23:37:04,540 - INFO - Copied 10 child face images
2025-03-26 23:37:08,130 - INFO - Copied 20 child face images
2025-03-26 23:37:10,470 - INFO - Copied 30 child face

In [5]:
# create empty proximity CSV for manual input
create_empty_proximity_csv(df)

2025-03-26 23:37:48,432 - INFO - Created adult CSV with 110 frames from /home/nele_pauline_suffo/outputs/proximity_sampled_frames/adult_faces
2025-03-26 23:37:48,436 - INFO - Created child CSV with 100 frames from /home/nele_pauline_suffo/outputs/proximity_sampled_frames/child_faces
