In [70]:
import sqlite3
from pathlib import Path
import pandas as pd
import logging
import shutil
import cv2
import numpy as np
pd.set_option('display.max_rows', None)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [None]:
def get_balanced_face_samples():
    conn = sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')
    df = pd.read_sql_query("""
    WITH RankedFaces AS (
        SELECT 
            v.video_path,
            d.frame_number,
            d.confidence_score,
            d.proximity,
            d.object_class,
            d.x_min,
            d.y_min, 
            d.x_max,
            d.y_max,
            CAST(d.proximity * 10 AS INTEGER) as proximity_bin
        FROM Detections d
        JOIN Videos v ON d.video_id = v.video_id
        WHERE d.object_class IN (2, 3)
            AND d.proximity BETWEEN 0 AND 1
        ),
        SampledFaces AS (
        SELECT *,
            ROW_NUMBER() OVER (
            PARTITION BY 
                CASE WHEN object_class = 3 THEN 'adult' 
                     WHEN object_class = 2 THEN 'child' 
                END,
                proximity_bin
            ORDER BY RANDOM()
            ) as rn
        FROM RankedFaces
        )
        SELECT 
        '/home/nele_pauline_suffo/ProcessedData/quantex_videos_processed/' || video_path || '/' || 
        video_path || '_' || printf('%06d', frame_number) || '.jpg' as frame_file_name,
        confidence_score,
        proximity,
        proximity_bin,
        x_min,
        y_min,
        x_max,
        y_max,
        CASE WHEN object_class = 3 THEN 'adult' 
             WHEN object_class = 2 THEN 'child' 
        END as age_group,
        CAST(proximity_bin/10.0 AS TEXT) || '-' || CAST((proximity_bin + 1)/10.0 AS TEXT) as proximity_range
        FROM SampledFaces 
        WHERE rn <= 10
        ORDER BY age_group, proximity_bin, proximity;
    """, conn)
    
    # Save to CSV with bounding box information
    output_path = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames/proximity_samples.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
        
    conn.close()
    return df

def copy_face_samples(df):
    # First verify we have data
    if df.empty:
        logging.error("DataFrame is empty - no samples to copy")
        return
        
    logging.info(f"Processing {len(df)} samples")
    logging.info(f"Distribution:\n{df.groupby('age_group').size()}")
    
    # Define output directories
    base_dir = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames')
    adult_dir = base_dir / 'adult_faces'
    child_dir = base_dir / 'child_faces'
    
    # Create directories if they don't exist
    adult_dir.mkdir(parents=True, exist_ok=True)
    child_dir.mkdir(parents=True, exist_ok=True)
    
    # Copy files for each age group
    copied_count = {'adult': 0, 'child': 0}
    error_count = {'adult': 0, 'child': 0}
    
    # Define colors for bounding boxes (BGR format)
    colors = {
        'adult': (0, 255, 0),  # Green for adult
        'child': (0, 0, 255)   # Red for child
    }
    
    # Create random orders for each age group
    for age_group in ['adult', 'child']:
        age_df = df[df['age_group'] == age_group].copy()
        # Create random order indices
        random_indices = np.random.permutation(len(age_df))
        age_df['random_prefix'] = [f"{i:03d}" for i in random_indices]
        
        target_dir = adult_dir if age_group == 'adult' else child_dir
        
        for _, row in age_df.iterrows():
            src_path = Path(row['frame_file_name'])
            
            if not src_path.exists():
                logging.warning(f"Source file not found: {src_path}")
                error_count[age_group] += 1
                continue
                
            try:
                # Read the image
                img = cv2.imread(str(src_path))
                if img is None:
                    logging.error(f"Could not read image: {src_path}")
                    error_count[age_group] += 1
                    continue
                    
                # Draw bounding box
                x1, y1, x2, y2 = int(row['x_min']), int(row['y_min']), int(row['x_max']), int(row['y_max'])
                cv2.rectangle(img, (x1, y1), (x2, y2), colors[age_group], 2)
                
                # Create new filename with random prefix and proximity
                new_filename = f"{row['random_prefix']}_{src_path.name}"
                dst_path = target_dir / new_filename
                
                # Save the image with bounding box
                cv2.imwrite(str(dst_path), img)
                
                copied_count[age_group] += 1
                if copied_count[age_group] % 10 == 0:
                    logging.info(f"Copied {copied_count[age_group]} {age_group} face images")
                    
            except Exception as e:
                logging.error(f"Error processing {src_path}: {e}")
                error_count[age_group] += 1

    # Log final statistics
    for age_group in ['adult', 'child']:
        logging.info(f"{age_group.title()} faces - Copied: {copied_count[age_group]}, "
                    f"Errors: {error_count[age_group]}")
    
def create_empty_proximity_xlsx(df):
    base_dir = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames')
    
    # Create separate CSVs for adults and children based on actual files in folders
    for age_group in ['adult', 'child']:
        folder_path = base_dir / f'{age_group}_faces'
        
        if not folder_path.exists():
            logging.error(f"Folder not found: {folder_path}")
            continue
            
        # Get all jpg files from the folder
        frame_files = sorted(folder_path.glob('*.jpg'))
        
        # Create DataFrame with filenames and empty proximity column
        output_df = pd.DataFrame({
            'frame_file_name': [f.name for f in frame_files],
            'proximity': ''  # empty column for manual proximity values
        })
        
        # Save to CSV with age-specific filename
        output_path = base_dir / f'proximity_samples_fill_in_{age_group}.xlsx'
        output_df.to_excel(output_path, index=False)
        
        logging.info(f"Created {age_group} Excel file with {len(output_df)} frames from {folder_path}")

     video_id  frame_number object_class  proximity
0           1         12630            2   0.604128
1           1         15260            3   0.609826
2           1         15260            3   0.614815
3           1         15430            3   0.599250
4           1         15430            3   0.655555
..        ...           ...          ...        ...
261       112         18600            2   0.632843
262       117           130            3   0.941956
263       117          6320            2   0.271860
264       117         41000            2   1.000000
265       118         12490            3   0.446781

[266 rows x 4 columns]


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [32]:
heuristic_df = get_balanced_face_samples()

# remove everything before "quantex" in frame_file_name
heuristic_df['frame_file_name'] = heuristic_df['frame_file_name'].apply(lambda x: x.split('/')[-1])

heuristic_df.head()

Unnamed: 0,frame_file_name,confidence_score,proximity,proximity_bin,x_min,y_min,x_max,y_max,age_group,proximity_range
0,quantex_at_home_id263284_2023_06_25_06_043040.jpg,0.298487,0.056211,0,1185,239,1201,259,adult,0.0-0.1
1,quantex_at_home_id263284_2023_06_25_06_052590.jpg,0.499622,0.064886,0,576,355,595,373,adult,0.0-0.1
2,quantex_at_home_id263284_2023_06_25_06_029420.jpg,0.422904,0.076558,0,126,296,143,318,adult,0.0-0.1
3,quantex_at_home_id263284_2023_06_25_04_028350.jpg,0.342625,0.076558,0,1025,264,1042,286,adult,0.0-0.1
4,quantex_at_home_id263284_2023_06_25_06_029230.jpg,0.515524,0.076558,0,628,276,645,298,adult,0.0-0.1


In [3]:
# copy face samples to output directory
copy_face_samples(df)

2025-03-26 23:36:34,798 - INFO - Processing 210 samples
2025-03-26 23:36:34,802 - INFO - Distribution:
age_group
adult    110
child    100
dtype: int64
2025-03-26 23:36:35,758 - INFO - Copied 10 adult face images
2025-03-26 23:36:40,207 - INFO - Copied 20 adult face images
2025-03-26 23:36:42,240 - INFO - Copied 30 adult face images
2025-03-26 23:36:43,809 - INFO - Copied 40 adult face images
2025-03-26 23:36:48,123 - INFO - Copied 50 adult face images
2025-03-26 23:36:51,329 - INFO - Copied 60 adult face images
2025-03-26 23:36:53,497 - INFO - Copied 70 adult face images
2025-03-26 23:36:54,555 - INFO - Copied 80 adult face images
2025-03-26 23:36:57,375 - INFO - Copied 90 adult face images
2025-03-26 23:36:58,438 - INFO - Copied 100 adult face images
2025-03-26 23:36:59,226 - INFO - Copied 110 adult face images
2025-03-26 23:37:04,540 - INFO - Copied 10 child face images
2025-03-26 23:37:08,130 - INFO - Copied 20 child face images
2025-03-26 23:37:10,470 - INFO - Copied 30 child face

In [5]:
# create empty proximity CSV for manual input
create_empty_proximity_xlsx(df)

2025-03-27 19:56:29,317 - INFO - Created adult Excel file with 110 frames from /home/nele_pauline_suffo/outputs/proximity_sampled_frames/adult_faces
2025-03-27 19:56:29,333 - INFO - Created child Excel file with 100 frames from /home/nele_pauline_suffo/outputs/proximity_sampled_frames/child_faces


## Correlate the heuristic and estimation

In [103]:
# read xlsx file with proximity values
proximity_df_adult = pd.read_excel('/home/nele_pauline_suffo/outputs/proximity_sampled_frames/proximity_samples_fill_in.xlsx', sheet_name="Adult")
# add column type "adult"
proximity_df_child = pd.read_excel('/home/nele_pauline_suffo/outputs/proximity_sampled_frames/proximity_samples_fill_in.xlsx', sheet_name='Child')
proximity_df_adult['age_group'] = 'adult_faces'
proximity_df_child['age_group'] = 'child_faces'

# combine both dataframes
proximity_df = pd.concat([proximity_df_adult, proximity_df_child], ignore_index=True)

# remove the first four values of the frame_file_name column
proximity_df['frame_file_name'] = proximity_df['frame_file_name'].str[4:]
proximity_df.head()

Unnamed: 0,frame_file_name,proximity,age_group
0,quantex_at_home_id260123_2023_09_06_01_013290.jpg,0.89,adult_faces
1,quantex_at_home_id268898_2022_11_30_01_043160.jpg,0.85,adult_faces
2,quantex_at_home_id262726_2023_03_26_01_021500.jpg,0.52,adult_faces
3,quantex_at_home_id260275_2022_04_16_01_011640.jpg,0.23,adult_faces
4,quantex_at_home_id262726_2023_04_20_01_037740.jpg,0.65,adult_faces


In [106]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')

# Extract video name and frame number from frame_file_name
proximity_df['video_name'] = proximity_df['frame_file_name'].apply(lambda x: '_'.join(x.split('_')[:-1]))
proximity_df['frame_number'] = proximity_df['frame_file_name'].apply(lambda x: int(x.split('_')[-1].replace('.jpg','').lstrip('0')))
proximity_df['object_class'] = proximity_df['age_group'].map({'child_faces': 2, 'adult_faces': 3})

# Get video IDs
video_query = """
SELECT DISTINCT video_id, video_path FROM Videos WHERE video_path IN ({})
""".format(','.join(['?']*len(proximity_df['video_name'].unique())))

video_ids = pd.read_sql_query(
    video_query,
    conn,
    params=tuple(proximity_df['video_name'].unique())
)

# Merge video IDs
proximity_check_df = proximity_df.merge(
    video_ids,
    left_on='video_name',
    right_on='video_path',
    how='left'
)

# Get proximities from detections
detection_query = """
SELECT 
    d.video_id,
    d.frame_number,
    d.object_class,
    d.proximity,
    ROW_NUMBER() OVER (PARTITION BY d.video_id, d.frame_number, d.object_class ORDER BY d.proximity) as proximity_num
FROM Detections d
WHERE d.video_id = ? 
AND d.frame_number = ? 
AND d.object_class = ?
"""

# Get all predicted proximities
all_proximities = []
for _, row in proximity_check_df.iterrows():
    result = pd.read_sql_query(
        detection_query, 
        conn, 
        params=(row['video_id'], row['frame_number'], row['object_class'])
    )
    
    # Create a dict with base columns
    row_dict = {
        'frame_file_name': row['frame_file_name'],
        'proximity': row['proximity'],
        'age_group': row['age_group']
    }
    
    # Add all found proximities with numbered columns
    for idx, prox in enumerate(result['proximity'], 1):
        row_dict[f'proximity_predicted_{idx}'] = prox
    
    all_proximities.append(row_dict)

final_df = pd.DataFrame(all_proximities)

# Close connection
conn.close()

# Keep only essential columns and remove duplicates
cols = ['frame_file_name', 'proximity', 'age_group'] + sorted([col for col in final_df.columns if col.startswith('proximity_predicted_')])
final_df = final_df[cols].drop_duplicates()

## Quality Check and Visualization

In [None]:
import sqlite3
import pandas as pd
from pathlib import Path
import cv2
import logging

def find_actual_file(base_path, frame_file_name):
    """Find the actual file with random prefix in the directory"""
    if not base_path.exists():
        return None
    
    # Get all files in directory
    for file in base_path.glob(f'*{frame_file_name}'):
        # Return first matching file (should be unique)
        return file
    return None

def draw_bounding_boxes_on_images(df):
    base_dir = Path('/home/nele_pauline_suffo/outputs/proximity_sampled_frames')
    output_dir = base_dir / 'proximity_multiple_faces_with_bboxes'
    output_dir.mkdir(parents=True, exist_ok=True)
    
    colors = {
        'adult': (0, 255, 0),
        'child': (0, 0, 255)
    }
    
    # Connect to database
    conn = sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')
    
    # Extract video name from frame_file_name (everything before the last underscore)
    df['video_name'] = df['frame_file_name'].apply(lambda x: '_'.join(x.split('_')[:-1]))
    
    # Get video_ids from Videos table
    video_query = """
    SELECT video_id, video_path FROM Videos WHERE video_path IN ({})
    """.format(','.join(['?'] * len(df['video_name'].unique())))
    
    video_ids = pd.read_sql_query(
        video_query,
        conn,
        params=tuple(df['video_name'].unique())
    )
    
    # Merge video_ids with the original dataframe
    df = df.merge(video_ids, left_on='video_name', right_on='video_path', how='left')
    
    # Extract frame number (last part after underscore, remove .jpg and leading zeros)
    df['frame_number'] = df['frame_file_name'].apply(lambda x: int(x.split('_')[-1].replace('.jpg','').lstrip('0')))
    
    # Map age_group to object_class
    df['object_class'] = df['age_group'].map({'child': 2, 'adult': 3})
    
    # Query to get detections with bboxes
    detection_query = """
    SELECT 
        d.video_id,
        d.frame_number,
        d.object_class,
        d.proximity,
        d.x_min,
        d.y_min,
        d.x_max,
        d.y_max,
        ROW_NUMBER() OVER (PARTITION BY d.video_id, d.frame_number, d.object_class ORDER BY d.proximity) as detection_num
    FROM Detections d
    WHERE d.video_id = ? 
    AND d.frame_number = ? 
    AND d.object_class = ?
    """
    
    # Process each frame
    all_detections = []
    for _, row in df.iterrows():
        result = pd.read_sql_query(
            detection_query,
            conn,
            params=(row['video_id'], row['frame_number'], row['object_class'])
        )
        
        frame_dict = {
            'frame_file_name': row['frame_file_name'],
            'age_group': row['age_group'],
            'video_id': row['video_id'],
            'frame_number': row['frame_number'],
            'object_class': row['object_class']
        }
        
        for idx, detection in result.iterrows():
            num = idx + 1
            frame_dict.update({
                f'proximity_predicted_{num}': detection['proximity'],
                f'x_min_{num}': detection['x_min'],
                f'y_min_{num}': detection['y_min'],
                f'x_max_{num}': detection['x_max'],
                f'y_max_{num}': detection['y_max']
            })
        
        all_detections.append(frame_dict)
    
    for _, row in df.iterrows():
        # Get all detections for this frame
        result = pd.read_sql_query(
            detection_query,
            conn,
            params=(row['video_id'], row['frame_number'], row['object_class'])
        )
        
        # Create base dictionary
        frame_dict = {
            'frame_file_name': row['frame_file_name'],
            'age_group': row['age_group']
        }
        
        # Add all detections with numbered columns
        for idx, detection in result.iterrows():
            num = idx + 1
            frame_dict.update({
                f'proximity_predicted_{num}': detection['proximity'],
                f'x_min_{num}': detection['x_min'],
                f'y_min_{num}': detection['y_min'],
                f'x_max_{num}': detection['x_max'],
                f'y_max_{num}': detection['y_max']
            })
        
        all_detections.append(frame_dict)
        
    # Convert to DataFrame
    detections_df = pd.DataFrame(all_detections)

    # Draw bounding boxes using the updated information
    for _, row in detections_df.iterrows():
        base_folder = base_dir / row['age_group']
        src_path = find_actual_file(base_folder, row['frame_file_name'])
        
        if src_path is None:
            logging.warning(f"Source file not found for: {row['frame_file_name']}")
            continue
            
        try:
            img = cv2.imread(str(src_path))
            if img is None:
                logging.error(f"Could not read image: {src_path}")
                continue
            
            # Draw all detected bounding boxes
            for i in range(1, len([col for col in row.index if col.startswith('proximity_predicted')]) + 1):
                if f'proximity_predicted_{i}' in row and not pd.isna(row[f'proximity_predicted_{i}']):
                    x1 = int(row[f'x_min_{i}'])
                    y1 = int(row[f'y_min_{i}'])
                    x2 = int(row[f'x_max_{i}'])
                    y2 = int(row[f'y_max_{i}'])
                    proximity = row[f'proximity_predicted_{i}']
                    
                    cv2.rectangle(img, (x1, y1), (x2, y2), colors[row['age_group']], 2)
                    cv2.putText(img, f'{proximity:.2f}', (x1, y1 - 10), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[row['age_group']], 2)
            
            dst_path = output_dir / row['frame_file_name']
            cv2.imwrite(str(dst_path), img)
            
        except Exception as e:
            logging.error(f"Error processing {src_path}: {e}")

    conn.close()

    return detections_df

In [108]:
final_df = final_df.sort_values(by='proximity_predicted_2', ascending=False)

# collect all frame_file_names that have a proximity_predicted_2 value in a list (exclude NaN values)+
check_multiple_faces_df = final_df[~final_df['proximity_predicted_2'].isna()]
mutiple_faces_list = check_multiple_faces_df['frame_file_name'].tolist()
len(mutiple_faces_list)

40

In [109]:
draw_bounding_boxes_on_images(check_multiple_faces_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['video_name'] = df['frame_file_name'].apply(lambda x: '_'.join(x.split('_')[:-1]))


Unnamed: 0,frame_file_name,age_group,video_id,frame_number,object_class
0,quantex_at_home_id262356_2023_09_10_01_023030.jpg,child_faces,66.0,23030.0,
1,quantex_at_home_id261609_2022_04_15_02_011580.jpg,adult_faces,91.0,11580.0,
2,quantex_at_home_id271611_2024_09_08_03_035360.jpg,adult_faces,108.0,35360.0,
3,quantex_at_home_id261609_2022_04_15_02_026780.jpg,adult_faces,91.0,26780.0,
4,quantex_at_home_id263293_2022_08_29_01_049840.jpg,adult_faces,1.0,49840.0,
5,quantex_at_home_id263293_2022_08_29_01_045820.jpg,adult_faces,1.0,45820.0,
6,quantex_at_home_id262356_2023_09_10_01_004850.jpg,adult_faces,66.0,4850.0,
7,quantex_at_home_id263986_2022_12_05_01_017730.jpg,adult_faces,19.0,17730.0,
8,quantex_at_home_id263293_2022_08_29_01_035870.jpg,adult_faces,1.0,35870.0,
9,quantex_at_home_id263293_2022_08_29_01_032150.jpg,adult_faces,1.0,32150.0,


In [None]:
# add tge absolute difference between proximity and proximity_predicted in a new column
proximity_estimation_check_df['absolute_difference'] = (proximity_estimation_check_df['proximity'] - proximity_estimation_check_df['proximity_predicted']).abs()
# sort by absolute difference descending
proximity_estimation_check_df = proximity_estimation_check_df.sort_values(by='absolute_difference', ascending=False)
aproximity_estimation_check_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  proximity_estimation_check_df['absolute_difference'] = (proximity_estimation_check_df['proximity'] - proximity_estimation_check_df['proximity_predicted']).abs()


Unnamed: 0,frame_file_name,age_group,proximity,proximity_predicted,absolute_difference
77,quantex_at_home_id260275_2022_04_16_01_031820.jpg,adult,0.06,0.760735,0.700735
68,quantex_at_home_id271700_2023_03_27_01_026440.jpg,adult,0.32,1.000000,0.680000
15,quantex_at_home_id271611_2024_09_08_03_035360.jpg,adult,0.95,0.343727,0.606273
109,quantex_at_home_id257609_2022_11_09_01_011320.jpg,adult,0.41,1.000000,0.590000
52,quantex_at_home_id264351_2024_11_23_03_010000.jpg,adult,0.42,1.000000,0.580000
...,...,...,...,...,...
159,quantex_at_home_id268898_2022_11_30_01_015450.jpg,child,0.98,0.975912,0.004088
48,quantex_at_home_id260178_2023_08_12_03_042040.jpg,adult,0.84,0.843580,0.003580
178,quantex_at_home_id264089_2023_05_14_01_042700.jpg,child,0.91,0.907182,0.002818
83,quantex_at_home_id263986_2022_12_05_01_000190.jpg,adult,0.99,0.989944,0.000056


In [None]:
# can you correlate the proximity values with the proximity_predicted values?
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set the style of seaborn
sns.set(style="whitegrid")
# Create a scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=proximity_df, x='proximity_x', y='proximity_predicted', hue='age_group', alpha=0.7)
plt.title('Correlation between Proximity Values')
plt.xlabel('Proximity (manual)')
plt.ylabel('Proximity (predicted)')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.axhline(y=0.5, color='r', linestyle='--')
plt.axvline(x=0.5, color='r', linestyle='--')
plt.grid()
plt.legend(title='Age Group')
plt.show()
# Calculate correlation
correlation = proximity_estimation_check_df[['proximity_x', 'proximity_predicted']].corr().iloc[0, 1]
logging.info(f"Correlation between manual and predicted proximity values: {correlation:.2f}")