### Add audio results to the detection results database (detection_results.db)

In [None]:
# Purpose: mark, for every detection row, which speaker types (KCHI, FEM, MAL, CHI)
# are talking in that frame, and store the result as table 'Detections_with_speaker'.

# Load the utterance table (one row per utterance) from the pickled DataFrame.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load files you trust.
quantex_results = pd.read_pickle("/home/nele_pauline_suffo/outputs/vtc/quantex_df.pkl")
# Strip the '_16kHz' suffix so audio_file_name can be matched against Videos.video_path
quantex_results['audio_file_name'] = quantex_results['audio_file_name'].str.replace('_16kHz', '', regex=False)

# Load frame-wise detection results and video info from the database.
# sqlite3's own context manager only commits/rolls back; closing() also closes the connection.
db_path = '/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db'
with closing(sqlite3.connect(db_path)) as conn:
    frame_df = pd.read_sql_query("SELECT * FROM Detections", conn)
    videos_info_df = pd.read_sql_query("SELECT video_id, video_path FROM Videos", conn)

# Merge video_id into quantex_results (utterances without a matching video get NaN video_id)
quantex_results = pd.merge(quantex_results, videos_info_df[['video_id', 'video_path']], left_on='audio_file_name', right_on='video_path', how='left')

# Map utterance times to frame numbers
fps = 30  # Assuming a frame rate of 30 FPS

# Initialize one 0/1 column per speaker type in frame_df
speaker_types = ['KCHI', 'FEM', 'MAL', 'CHI']  # Define target speaker type columns
for speaker_col in speaker_types:
    frame_df[speaker_col] = 0

# Only utterances that were matched to a video and carry a known speaker label can
# ever mark a frame, so drop everything else before looping.
usable_utterances = quantex_results[
    quantex_results['video_id'].notna()
    & quantex_results['Voice_type'].isin(speaker_types)
]

# Split the frame numbers per video once, so each utterance is compared only against
# the frames of its own video instead of re-scanning the whole detections table.
frame_numbers_by_video = {
    video_id: video_frames['frame_number']
    for video_id, video_frames in frame_df.groupby('video_id')
}

for _, rttm_row in usable_utterances.iterrows():
    video_frame_numbers = frame_numbers_by_video.get(rttm_row['video_id'])
    if video_frame_numbers is None:
        continue  # The video has no detection rows at all

    # Convert time to frame numbers (inclusive on both ends)
    start_frame = int(rttm_row['Utterance_Start'] * fps)
    end_frame = int(rttm_row['Utterance_End'] * fps)

    # Set the speaker column to 1 for all detection rows whose frame lies in the range
    in_utterance = (video_frame_numbers >= start_frame) & (video_frame_numbers <= end_frame)
    frame_df.loc[video_frame_numbers.index[in_utterance], rttm_row['Voice_type']] = 1

# To inspect frames with any speaker activity:
# print(frame_df[frame_df[speaker_types].any(axis=1)].head())

# Save the updated frame-wise detection results back to the database.
# The inner `conn` context commits on success (rolls back on error); closing() closes it.
with closing(sqlite3.connect(db_path)) as conn, conn:
    # Save the new table
    frame_df.to_sql('Detections_with_speaker', conn, if_exists='replace', index=False)

    # List all tables to confirm it's there
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables)

## 1. How many utterances does the key child produce?


In [2]:
import os
import sqlite3
from contextlib import closing

import pandas as pd

# Do not limit the number of columns shown when a pandas DataFrame is displayed
pd.set_option('display.max_columns', None)

In [30]:

# SQL for research question 1 (key-child utterances).
# Steps, following the CTEs below:
#   1. RandomSubjects: up to 40 randomly ordered (child_id, video) rows from Subjects joined
#      to Videos on video_name = video_path.
#   2. AllFrames: every distinct (video_id, frame_number) found in Detections_with_speaker for
#      those videos (German SQL comments: "draw all possible frames from the videos of the
#      RandomSubjects" / "determine all possible frames per video here").
#   3. RandomFrames: up to 5000 randomly ordered frames ("randomly draw 5000 frames").
#   4. Final SELECT: joins the sampled frames back to Detections_with_speaker, so a frame
#      yields one row per detection row; merge_duplicates_01 collapses them afterwards.
# Derived columns: object_class is kept only for class codes 5-10 (else 'none'); cds_present is
# 1 when FEM, CHI or MAL is 1; face_present = class 2 or 3; child_present = class 0;
# adult_present = class 1; play_context is 'social' when other-speaker speech is present,
# proximity > 0.5, or a class 0/1 detection exists, otherwise 'alone'.
# ORDER BY RANDOM() means every execution draws a different sample.
query_01 = """
WITH RandomSubjects AS (
    SELECT DISTINCT s.child_id, s.video_name, v.video_id, s.age_at_recording
    FROM Subjects s
    JOIN Videos v ON s.video_name = v.video_path
    ORDER BY RANDOM()
    LIMIT 40
),
-- Ziehe alle möglichen Frames aus den Videos der RandomSubjects
AllFrames AS (
    SELECT 
        rs.child_id, 
        rs.video_id, 
        rs.age_at_recording,
        f.frame_number
    FROM RandomSubjects rs
    JOIN (
        -- Hier alle möglichen Frames pro Video bestimmen
        SELECT video_id, frame_number
        FROM Detections_with_speaker
        GROUP BY video_id, frame_number
    ) f ON rs.video_id = f.video_id
),
-- Ziehe zufällig 5000 Frames
RandomFrames AS (
    SELECT *
    FROM AllFrames
    ORDER BY RANDOM()
    LIMIT 5000
)
SELECT 
    rf.video_id,
    rf.frame_number,
    rf.child_id,
    rf.age_at_recording AS age,
    d.proximity,
    CASE
        WHEN CAST(d.object_class AS INTEGER) >= 5 AND CAST(d.object_class AS INTEGER) <= 10 THEN d.object_class
        ELSE 'none'
    END AS object_class,
    COALESCE(d.gaze_direction, 'none') AS gaze_direction,
    COALESCE(d.KCHI, 0) AS kchi_present,
    COALESCE(CASE WHEN d.FEM = 1 OR d.CHI = 1 OR d.MAL = 1 THEN 1 ELSE 0 END, 0) AS cds_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 2 OR CAST(d.object_class AS INTEGER) = 3 THEN 1 ELSE 0 END, 0) AS face_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 0 THEN 1 ELSE 0 END, 0) AS child_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 1 THEN 1 ELSE 0 END, 0) AS adult_present,
    CASE 
        WHEN 
            COALESCE(d.FEM, 0) = 1 OR COALESCE(d.CHI, 0) = 1 OR COALESCE(d.MAL, 0) = 1
            OR COALESCE(d.proximity, 0) > 0.5
            OR COALESCE(CAST(d.object_class AS INTEGER), -1) = 0
            OR COALESCE(CAST(d.object_class AS INTEGER), -1) = 1
            THEN 'social'
        ELSE 'alone'
    END AS play_context,
    CASE 
        WHEN COALESCE(CAST(d.object_class AS INTEGER), -1) = 1 THEN 'adult'
        WHEN COALESCE(CAST(d.object_class AS INTEGER), -1) = 0 THEN 'child'
        ELSE 'none'
    END AS person_age_class
FROM RandomFrames rf
LEFT JOIN Detections_with_speaker d
    ON rf.video_id = d.video_id
   AND rf.frame_number = d.frame_number
ORDER BY rf.child_id, rf.video_id, rf.frame_number;
"""

def merge_duplicates_01(df):
    """Collapse the per-detection rows of query_01 into one row per sampled frame.

    Rows are grouped by (video_id, frame_number, child_id, age). Per group:
      * the *_present flags become 1 if any row has a truthy value;
      * play_context is 'social' if any row is social, 'alone' if all rows are alone;
      * object_class is the readable name of the single object class present,
        'multiple' if more than one is present, 'none' otherwise;
      * person_age_class is 'child', 'adult', 'both' or 'none';
      * proximity is the largest numeric proximity in the group and gaze_direction
        is the gaze value of that same row (None / 'none' when no proximity exists).

    Returns a new DataFrame with one row per group.
    """
    key_columns = ['video_id', 'frame_number', 'child_id', 'age']
    presence_flags = ('kchi_present', 'cds_present', 'face_present', 'child_present', 'adult_present')
    ignored_labels = ('none', 'NaN')

    # Readable names for the object class codes
    class_names = {
        '5': 'book',
        '6': 'toy',
        '7': 'kitchenware',
        '8': 'screen',
        '9': 'food',
        '10': 'other object'
    }
    # Which set of detected person classes maps to which summary label
    age_class_summary = {
        frozenset(['child']): 'child',
        frozenset(['adult']): 'adult',
        frozenset(['child', 'adult']): 'both',
    }

    records = []
    for key, rows in df.groupby(key_columns):
        record = dict(zip(key_columns, key))

        for flag in presence_flags:
            record[flag] = int(rows[flag].any())

        contexts = rows['play_context'].values
        if 'social' in contexts:
            record['play_context'] = 'social'
        elif all(context == 'alone' for context in contexts):
            record['play_context'] = 'alone'
        else:
            record['play_context'] = 'none'

        # Object class: translate codes, then reduce to a single label
        object_labels = []
        for raw_class in rows['object_class'].unique():
            if raw_class in ignored_labels:
                continue
            code = str(raw_class)
            label = class_names.get(code, code)
            if label not in ignored_labels:
                object_labels.append(label)
        if not object_labels:
            record['object_class'] = 'none'
        elif len(object_labels) == 1:
            record['object_class'] = object_labels[0]
        else:
            record['object_class'] = 'multiple'

        # Person age class: summarise which person classes occur in the frame
        age_classes = frozenset(
            label for label in rows['person_age_class'].unique() if label not in ignored_labels
        )
        record['person_age_class'] = age_class_summary.get(age_classes, 'none')

        # Proximity / gaze: use the row with the largest proximity (missing values count as -1)
        closeness = pd.to_numeric(rows['proximity'], errors='coerce').fillna(-1)
        if closeness.empty or not closeness.max() >= 0:
            record['proximity'] = None
            record['gaze_direction'] = 'none'
        else:
            nearest = closeness.idxmax()
            record['proximity'] = closeness.loc[nearest]
            gaze = rows.loc[nearest, 'gaze_direction']
            record['gaze_direction'] = 'none' if gaze in ('none', 'NaN', None) else gaze

        records.append(record)

    return pd.DataFrame(records)


In [None]:
# Load data for question 1.
# sqlite3's own context manager only commits/rolls back a transaction; closing()
# guarantees the connection is closed once the query has been read.
with closing(sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')) as conn:
    utterance_01_df = pd.read_sql_query(query_01, conn)

# Collapse the per-detection rows to one row per sampled frame and export for modelling
utterance_01_df_merged = merge_duplicates_01(utterance_01_df)
utterance_01_df_merged.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/utterance_01_df.csv', index=False)

utterance_01_df_merged.head(10)

Unnamed: 0,video_id,frame_number,child_id,age,kchi_present,cds_present,face_present,child_present,adult_present,play_context,object_class,person_age_class,proximity,gaze_direction
0,11,100,263284,3.8,0,0,0,0,0,alone,multiple,none,,none
1,11,490,263284,3.8,0,0,0,0,0,alone,none,none,,none
2,11,1190,263284,3.8,0,0,1,0,1,social,none,adult,0.290025,1.0
3,11,1200,263284,3.8,0,0,1,0,1,social,other object,adult,0.301284,1.0
4,11,1370,263284,3.8,0,0,0,0,0,alone,other object,none,,none
5,11,1460,263284,3.8,0,0,0,0,0,alone,none,none,,none
6,11,1550,263284,3.8,0,0,0,0,0,alone,none,none,,none
7,11,2140,263284,3.8,1,0,1,0,1,social,none,adult,1.0,1.0
8,11,2150,263284,3.8,1,0,0,0,1,social,none,adult,,none
9,11,2270,263284,3.8,0,0,0,0,1,social,none,adult,,none


#### a) alone vs. social: kchi_present ~ age * context + (context|child_id) with context being a factor with levels „social“ and „alone“

In [32]:
# Model 1a input: key-child speech by age and play context, one row per sampled frame
columns_01a = ['video_id', 'frame_number', 'child_id', 'kchi_present', 'age', 'play_context']
utterance_01a_df = utterance_01_df_merged.loc[:, columns_01a]
utterance_01a_df.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/utterance_01a_df.csv', index=False)
len(utterance_01a_df), utterance_01a_df.head(10)

(5000,
    video_id  frame_number  child_id  kchi_present  age play_context
 0        11           100    263284             0  3.8        alone
 1        11           490    263284             0  3.8        alone
 2        11          1190    263284             0  3.8       social
 3        11          1200    263284             0  3.8       social
 4        11          1370    263284             0  3.8        alone
 5        11          1460    263284             0  3.8        alone
 6        11          1550    263284             0  3.8        alone
 7        11          2140    263284             1  3.8       social
 8        11          2150    263284             1  3.8       social
 9        11          2270    263284             0  3.8       social)

#### b) only alone data: kchi_present ~ age*object + (object | child_id) see if predictor object makes a difference, with object being a factor with levels „toy“, „book“, ...

In [34]:
# Model 1b input: only frames classified as 'alone', with the object class as predictor
is_alone = utterance_01_df_merged['play_context'] == 'alone'
columns_01b = ['video_id', 'frame_number', 'child_id', 'kchi_present', 'age', 'object_class']
utterance_01b_df = utterance_01_df_merged.loc[is_alone, columns_01b]
utterance_01b_df.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/utterance_01b_df.csv', index=False)
len(utterance_01b_df), utterance_01b_df.head(10)

(2197,
     video_id  frame_number  child_id  kchi_present  age  object_class
 0         11           100    263284             0  3.8      multiple
 1         11           490    263284             0  3.8          none
 4         11          1370    263284             0  3.8  other object
 5         11          1460    263284             0  3.8          none
 6         11          1550    263284             0  3.8          none
 11        11          3030    263284             0  3.8      multiple
 13        11          3080    263284             0  3.8           toy
 14        11          4110    263284             0  3.8          none
 16        11          4500    263284             0  3.8          none
 17        11          5120    263284             0  3.8           toy)

#### c) only social data: kchi_present ~ age * age_class * face * gaze + (age_class * face * gaze|child_id) see if the type of social interaction makes a difference, that is, the age class of the other person, whether there is a face and whether there is gaze

In [35]:
# Model 1c input: only frames classified as 'social', with the social predictors
is_social = utterance_01_df_merged['play_context'] == 'social'
columns_01c = ['video_id', 'frame_number', 'child_id', 'kchi_present', 'age', 'face_present', 'person_age_class', 'gaze_direction']
utterance_01c_df = utterance_01_df_merged.loc[is_social, columns_01c]
utterance_01c_df.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/utterance_01c_df.csv', index=False)
len(utterance_01c_df), utterance_01c_df.head(10)

(2803,
     video_id  frame_number  child_id  kchi_present  age  face_present  \
 2         11          1190    263284             0  3.8             1   
 3         11          1200    263284             0  3.8             1   
 7         11          2140    263284             1  3.8             1   
 8         11          2150    263284             1  3.8             0   
 9         11          2270    263284             0  3.8             0   
 10        11          2640    263284             1  3.8             0   
 12        11          3070    263284             0  3.8             1   
 15        11          4370    263284             0  3.8             0   
 18        11          5450    263284             1  3.8             1   
 20        11          7030    263284             1  3.8             0   
 
    person_age_class gaze_direction  
 2             adult            1.0  
 3             adult            1.0  
 7             adult            1.0  
 8             adult     

## 2. How much speech is directed at the key child?

In [7]:

# SQL for research question 2 (speech directed at the key child).
# Sampling is the same three-step scheme as the CTEs show: up to 40 randomly ordered
# (child_id, video) rows, all distinct frames of those videos in Detections_with_speaker
# (German SQL comments: "draw all possible frames from the videos of the RandomSubjects" /
# "determine all possible frames per video here"), then up to 5000 randomly ordered frames
# ("randomly draw 5000 frames").
# The final SELECT returns one row per detection row of each sampled frame with:
# cds_present (1 when FEM, CHI or MAL is 1), face_present (class 2 or 3),
# child_present (class 0), adult_present (class 1), person_age_class, proximity and gaze.
# ORDER BY RANDOM() means every execution draws a different sample.
query_02 = """
WITH RandomSubjects AS (
    SELECT DISTINCT s.child_id, s.video_name, v.video_id, s.age_at_recording
    FROM Subjects s
    JOIN Videos v ON s.video_name = v.video_path
    ORDER BY RANDOM()
    LIMIT 40
),
-- Ziehe alle möglichen Frames aus den Videos der RandomSubjects
AllFrames AS (
    SELECT 
        rs.child_id, 
        rs.video_id, 
        rs.age_at_recording,
        f.frame_number
    FROM RandomSubjects rs
    JOIN (
        -- Hier alle möglichen Frames pro Video bestimmen
        SELECT video_id, frame_number
        FROM Detections_with_speaker
        GROUP BY video_id, frame_number
    ) f ON rs.video_id = f.video_id
),
-- Ziehe zufällig 5000 Frames
RandomFrames AS (
    SELECT *
    FROM AllFrames
    ORDER BY RANDOM()
    LIMIT 5000
)
SELECT 
    rf.video_id,
    rf.frame_number,
    rf.child_id,
    rf.age_at_recording AS age,
    d.proximity,
    COALESCE(d.gaze_direction, 'none') AS gaze_direction,
    COALESCE(CASE WHEN d.FEM = 1 OR d.CHI = 1 OR d.MAL = 1 THEN 1 ELSE 0 END, 0) AS cds_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 2 OR CAST(d.object_class AS INTEGER) = 3 THEN 1 ELSE 0 END, 0) AS face_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 0 THEN 1 ELSE 0 END, 0) AS child_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 1 THEN 1 ELSE 0 END, 0) AS adult_present,
    CASE 
        WHEN COALESCE(CAST(d.object_class AS INTEGER), -1) = 1 THEN 'adult'
        WHEN COALESCE(CAST(d.object_class AS INTEGER), -1) = 0 THEN 'child'
        ELSE 'none'
    END AS person_age_class
FROM RandomFrames rf
LEFT JOIN Detections_with_speaker d
    ON rf.video_id = d.video_id
   AND rf.frame_number = d.frame_number
ORDER BY rf.child_id, rf.video_id, rf.frame_number;
"""

def merge_duplicates_02(df):
    """Collapse the per-detection rows of query_02 into one row per sampled frame.

    Rows are grouped by (video_id, frame_number, child_id, age). Per group:
      * the *_present flags become 1 if any row has a truthy value;
      * person_age_class is 'child', 'adult', 'both' or 'none';
      * proximity is the largest numeric proximity in the group and gaze_direction
        is the gaze value of that same row (None / 'none' when no proximity exists).

    Returns a new DataFrame with one row per group.
    """
    key_columns = ['video_id', 'frame_number', 'child_id', 'age']
    presence_flags = ('cds_present', 'face_present', 'child_present', 'adult_present')
    ignored_labels = ('none', 'NaN')
    # Which set of detected person classes maps to which summary label
    age_class_summary = {
        frozenset(['child']): 'child',
        frozenset(['adult']): 'adult',
        frozenset(['child', 'adult']): 'both',
    }

    records = []
    for key, rows in df.groupby(key_columns):
        record = dict(zip(key_columns, key))

        for flag in presence_flags:
            record[flag] = int(rows[flag].any())

        # Person age class: summarise which person classes occur in the frame
        age_classes = frozenset(
            label for label in rows['person_age_class'].unique() if label not in ignored_labels
        )
        record['person_age_class'] = age_class_summary.get(age_classes, 'none')

        # Proximity / gaze: use the row with the largest proximity (missing values count as -1)
        closeness = pd.to_numeric(rows['proximity'], errors='coerce').fillna(-1)
        if closeness.empty or not closeness.max() >= 0:
            record['proximity'] = None
            record['gaze_direction'] = 'none'
        else:
            nearest = closeness.idxmax()
            record['proximity'] = closeness.loc[nearest]
            gaze = rows.loc[nearest, 'gaze_direction']
            record['gaze_direction'] = 'none' if gaze in ('none', 'NaN', None) else gaze

        records.append(record)

    return pd.DataFrame(records)


#### cds_present ~ age * age_class * face * gaze * proximity + (age_class * face * gaze|child_id)


In [8]:
# Load data for question 2.
# sqlite3's own context manager only commits/rolls back a transaction; closing()
# guarantees the connection is closed once the query has been read.
with closing(sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')) as conn:
    utterance_02_df = pd.read_sql_query(query_02, conn)

# Collapse the per-detection rows to one row per sampled frame and export for modelling
utterance_02_df_merged = merge_duplicates_02(utterance_02_df)
utterance_02_df_merged.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/utterance_02_df.csv', index=False)

utterance_02_df_merged.head(10)

Unnamed: 0,video_id,frame_number,child_id,age,cds_present,face_present,child_present,adult_present,person_age_class,proximity,gaze_direction
0,3,490,263190,3.73,1,0,0,0,none,,none
1,3,840,263190,3.73,0,0,0,0,none,,none
2,3,850,263190,3.73,0,0,0,0,none,,none
3,3,1060,263190,3.73,0,0,0,0,none,,none
4,3,1440,263190,3.73,0,0,0,0,none,,none
5,3,1710,263190,3.73,0,0,0,0,none,,none
6,3,1930,263190,3.73,0,0,0,0,none,,none
7,3,2000,263190,3.73,0,0,0,0,none,,none
8,3,2610,263190,3.73,0,0,0,0,none,,none
9,3,4110,263190,3.73,0,0,0,0,none,,none


## 3. Are children more frequently in the presence of adults compared to other children?


In [15]:

# SQL for research question 3 (presence of adults vs. other children).
# Sampling follows the CTEs: up to 40 randomly ordered (child_id, video) rows, all distinct
# frames of those videos in Detections_with_speaker (German SQL comments: "draw all possible
# frames from the videos of the RandomSubjects" / "determine all possible frames per video
# here"), then up to 5000 randomly ordered frames ("randomly draw 5000 frames").
# Here child_present counts class 0 or 2 and adult_present counts class 1 or 3, i.e. the
# face classes are included, unlike in query_01/query_02.
# NOTE(review): the WHERE clause filters on d.object_class, so sampled frames without any
# class 0-3 detection are dropped and the LEFT JOIN effectively behaves like an inner join;
# the result therefore has fewer than 5000 frames. Confirm this is the intended denominator.
# ORDER BY RANDOM() means every execution draws a different sample.
query_03 = """
WITH RandomSubjects AS (
    SELECT DISTINCT s.child_id, s.video_name, v.video_id, s.age_at_recording
    FROM Subjects s
    JOIN Videos v ON s.video_name = v.video_path
    ORDER BY RANDOM()
    LIMIT 40
),
-- Ziehe alle möglichen Frames aus den Videos der RandomSubjects
AllFrames AS (
    SELECT 
        rs.child_id, 
        rs.video_id, 
        rs.age_at_recording,
        f.frame_number
    FROM RandomSubjects rs
    JOIN (
        -- Hier alle möglichen Frames pro Video bestimmen
        SELECT video_id, frame_number
        FROM Detections_with_speaker
        GROUP BY video_id, frame_number
    ) f ON rs.video_id = f.video_id
),
-- Ziehe zufällig 5000 Frames
RandomFrames AS (
    SELECT *
    FROM AllFrames
    ORDER BY RANDOM()
    LIMIT 5000
)
SELECT 
    rf.video_id,
    rf.frame_number,
    rf.child_id,
    rf.age_at_recording AS age,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 0 OR CAST(d.object_class AS INTEGER) = 2 THEN 1 ELSE 0 END, 0) AS child_present,
    COALESCE(CASE WHEN CAST(d.object_class AS INTEGER) = 1 OR CAST(d.object_class AS INTEGER) = 3 THEN 1 ELSE 0 END, 0) AS adult_present
FROM RandomFrames rf
LEFT JOIN Detections_with_speaker d
    ON rf.video_id = d.video_id
   AND rf.frame_number = d.frame_number
WHERE d.object_class IN ('0', '1', '2', '3')  -- Only include child and adult classes
ORDER BY rf.child_id, rf.video_id, rf.frame_number;
"""

def merge_duplicates_03(df):
    """Collapse the per-detection rows of query_03 into one row per sampled frame.

    Rows are grouped by (video_id, frame_number, child_id, age). For each group,
    child_present / adult_present are the maximum of the respective column within
    that group, and person_age_class is derived from them:
    'both', 'child', 'adult' or 'none'.

    Args:
        df: DataFrame with columns video_id, frame_number, child_id, age,
            child_present and adult_present.

    Returns:
        A new DataFrame with one row per group.
    """
    grouped = df.groupby(['video_id', 'frame_number', 'child_id', 'age'])
    merged_rows = []

    for name, group in grouped:
        # The flags must be computed per group, inside the loop that builds the row.
        # Computing them in a separate loop left every row with the values of the
        # last group only.
        child_present = group['child_present'].max()
        adult_present = group['adult_present'].max()

        merged_row = {
            'video_id': name[0],
            'frame_number': name[1],
            'child_id': name[2],
            'age': name[3],
            'child_present': child_present,
            'adult_present': adult_present
        }

        # generate person_age_class from child_present and adult_present
        if child_present == 1 and adult_present == 1:
            merged_row['person_age_class'] = 'both'
        elif child_present == 1:
            merged_row['person_age_class'] = 'child'
        elif adult_present == 1:
            merged_row['person_age_class'] = 'adult'
        else:
            merged_row['person_age_class'] = 'none'

        merged_rows.append(merged_row)

    return pd.DataFrame(merged_rows)


In [16]:
# Load data for question 3.
# sqlite3's own context manager only commits/rolls back a transaction; closing()
# guarantees the connection is closed once the query has been read.
with closing(sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')) as conn:
    person_presence_03_df = pd.read_sql_query(query_03, conn)

# Collapse the per-detection rows to one row per sampled frame and export for modelling
person_presence_03_df_merged = merge_duplicates_03(person_presence_03_df)
person_presence_03_df_merged.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/person_presence_03_df.csv', index=False)

len(person_presence_03_df_merged), person_presence_03_df_merged.head(10)

(2882,
    video_id  frame_number  child_id   age  child_present  adult_present  \
 0        15           270    264041  3.52              0              1   
 1        15           550    264041  3.52              0              1   
 2        15           990    264041  3.52              0              1   
 3        15          1200    264041  3.52              0              1   
 4        15          2070    264041  3.52              0              1   
 5        15          2650    264041  3.52              0              1   
 6        15          3600    264041  3.52              0              1   
 7        15          3900    264041  3.52              0              1   
 8        15          3980    264041  3.52              0              1   
 9        15          4150    264041  3.52              0              1   
 
   person_age_class  
 0            adult  
 1            adult  
 2            adult  
 3            adult  
 4            adult  
 5            adult  
 

## 4. How does the composition of social interactions change with age?


#### model <- interaction ~ age + (1|child_id)

In [None]:
# SQL for research question 4 (composition of social interactions by age).
# Sampling follows the CTEs: up to 40 randomly ordered (child_id, video) rows, all distinct
# frames of those videos in Detections_with_speaker (German SQL comments: "draw all possible
# frames from the videos of the RandomSubjects" / "determine all possible frames per video
# here"), then up to 5000 randomly ordered frames ("randomly draw 5000 frames").
# Each detection row of a sampled frame is labelled with interaction_type:
#   'multimodal'  - class 0-3 (person/face) detection and any of KCHI/FEM/MAL/CHI is 1
#   'speech-only' - no class 0-3 detection in this row and any speech flag is 1
#   'person-only' - class 0-3 detection and all four speech flags are 0
#   'none'        - everything else; these rows are removed by the WHERE clause
# merge_duplicates_04 then reduces the rows of a frame to a single label.
# ORDER BY RANDOM() means every execution draws a different sample.
query_04 = """
WITH RandomSubjects AS (
    SELECT DISTINCT s.child_id, s.video_name, v.video_id, s.age_at_recording
    FROM Subjects s
    JOIN Videos v ON s.video_name = v.video_path
    ORDER BY RANDOM()
    LIMIT 40
),
-- Ziehe alle möglichen Frames aus den Videos der RandomSubjects
AllFrames AS (
    SELECT 
        rs.child_id, 
        rs.video_id, 
        rs.age_at_recording,
        f.frame_number
    FROM RandomSubjects rs
    JOIN (
        -- Hier alle möglichen Frames pro Video bestimmen
        SELECT video_id, frame_number
        FROM Detections_with_speaker
        GROUP BY video_id, frame_number
    ) f ON rs.video_id = f.video_id
),
-- Ziehe zufällig 5000 Frames
RandomFrames AS (
    SELECT *
    FROM AllFrames
    ORDER BY RANDOM()
    LIMIT 5000
)
SELECT 
    rf.video_id,
    rf.frame_number,
    rf.child_id,
    rf.age_at_recording AS age,
    CASE 
        WHEN 
            COALESCE(CAST(d.object_class AS INTEGER), -1) IN (0, 1, 2, 3) -- person/face
            AND ( -- check for any speech type
                COALESCE(CAST(d.KCHI AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.FEM AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.MAL AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.CHI AS INTEGER), -1) = 1
            )
            THEN 'multimodal'
        WHEN
            COALESCE(CAST(d.object_class AS INTEGER), -1) NOT IN (0, 1, 2, 3) -- no person/face
            AND ( -- check for any speech type
                COALESCE(CAST(d.KCHI AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.FEM AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.MAL AS INTEGER), -1) = 1
                OR COALESCE(CAST(d.CHI AS INTEGER), -1) = 1
            )
            THEN 'speech-only'
        WHEN
            COALESCE(CAST(d.object_class AS INTEGER), -1) IN (0, 1, 2, 3) -- person/face
            AND COALESCE(CAST(d.KCHI AS INTEGER), -1) = 0 -- no KCHI
            AND COALESCE(CAST(d.FEM AS INTEGER), -1) = 0 -- no FEM
            AND COALESCE(CAST(d.MAL AS INTEGER), -1) = 0 -- no MAL
            AND COALESCE(CAST(d.CHI AS INTEGER), -1) = 0 -- no CHI
            THEN 'person-only'
        ELSE 'none'
    END AS interaction_type
FROM RandomFrames rf
LEFT JOIN Detections_with_speaker d
    ON rf.video_id = d.video_id
   AND rf.frame_number = d.frame_number -- Removed comma from here
WHERE interaction_type IS NOT 'none'
ORDER BY rf.child_id, rf.video_id, rf.frame_number;
"""

def merge_duplicates_04(df):
    """Collapse the per-detection rows of query_04 into one label per sampled frame.

    Rows are grouped by (video_id, frame_number, child_id, age). A frame is
    'multimodal' when any row is multimodal or when both a speech-only and a
    person-only row occur; otherwise it is 'speech-only', 'person-only' or 'none'
    depending on which label is present.

    Returns a new DataFrame with one row per group.
    """
    key_columns = ['video_id', 'frame_number', 'child_id', 'age']

    records = []
    for key, rows in df.groupby(key_columns):
        labels_seen = set(rows['interaction_type'].unique())
        has_speech = 'speech-only' in labels_seen
        has_person = 'person-only' in labels_seen

        # Speech and a person in the same frame count as multimodal even when
        # they come from different detection rows.
        if 'multimodal' in labels_seen or (has_speech and has_person):
            frame_label = 'multimodal'
        elif has_speech:
            frame_label = 'speech-only'
        elif has_person:
            frame_label = 'person-only'
        else:
            frame_label = 'none'

        record = dict(zip(key_columns, key))
        record['interaction_type'] = frame_label
        records.append(record)

    return pd.DataFrame(records)


In [20]:
# Load data for question 4.
# sqlite3's own context manager only commits/rolls back a transaction; closing()
# guarantees the connection is closed once the query has been read.
with closing(sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')) as conn:
    interactions_04_df = pd.read_sql_query(query_04, conn)

# Reduce the per-detection labels to one interaction type per frame and export for modelling
interactions_04_df_merged = merge_duplicates_04(interactions_04_df)
interactions_04_df_merged.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/interactions_04_df.csv', index=False)

len(interactions_04_df_merged), interactions_04_df_merged.head(10)

(3428,
    video_id  frame_number  child_id   age interaction_type
 0         2            30    264089  3.69      person-only
 1         2            40    264089  3.69       multimodal
 2         2           290    264089  3.69       multimodal
 3         2          1310    264089  3.69      person-only
 4         2          1600    264089  3.69      person-only
 5         2          1670    264089  3.69      person-only
 6         2          1940    264089  3.69      person-only
 7         2          2020    264089  3.69      person-only
 8         2          2530    264089  3.69      person-only
 9         2          2780    264089  3.69      person-only)

## 5. How does the frequency of toy use differ between solo and social play contexts?

#### Model: toy_present ~ age * toy_class * social_context + (toy_class * social_context | child_id)

In [27]:
# SQL for research question 5 (toy use in solo vs. social contexts).
# Unlike queries 1-4, all aggregation to one row per frame happens inside SQL:
#   RandomSubjects        - up to 10 randomly ordered (child_id, video) rows
#   RandomFrames          - up to 5000 distinct random (frame_number, video_id) pairs taken
#                           from the Detections table for those videos
#   CombinedSocialPresence- per frame: adult present if FEM/MAL speech or a class 1/3
#                           detection exists; child present if CHI speech or a class 0/2
#                           detection exists
#   FinalSocialContext    - turns the two flags into 'with adult and child', 'with adult',
#                           'with child' or 'alone'
#   FrameToyInfo          - per frame: the single class code among '5'-'10' if exactly one
#                           distinct code occurs, 'multiple' if several, 'none' otherwise,
#                           plus a 0/1 toy_present flag
# toy_class is returned as the raw class code, not as a readable name.
# ORDER BY RANDOM() means every execution draws a different sample.
query_05 = """
WITH RandomSubjects AS (
    -- Select 10 random subjects
    SELECT DISTINCT s.child_id, s.video_name, v.video_id, s.age_at_recording
    FROM Subjects s
    JOIN Videos v ON s.video_name = v.video_path
    ORDER BY RANDOM()
    LIMIT 10
),
RandomFrames AS (
    -- Select 5000 unique random frames TOTAL from the videos of the selected subjects.
    SELECT DISTINCT d.frame_number, d.video_id -- DISTINCT here ensures we get unique frames before LIMIT
    FROM Detections d -- This CTE still uses Detections for initial frame selection.
    JOIN RandomSubjects rs ON d.video_id = rs.video_id
    ORDER BY RANDOM()
    LIMIT 5000
),
CombinedSocialPresence AS (
    -- Determine if adult or child are present based on EITHER speech OR object detection
    SELECT 
        rf.frame_number,
        rf.video_id,
        -- Adult presence: adult speech OR adult object/face
        MAX(
            CASE 
                WHEN COALESCE(d_spk.FEM, 0) = 1 THEN 1
                WHEN COALESCE(d_spk.MAL, 0) = 1 THEN 1
                WHEN d_obj_ctx.object_class = '1' THEN 1 -- Adult person object
                WHEN d_obj_ctx.object_class = '3' THEN 1 -- Adult face object
                ELSE 0 
            END
        ) as is_adult_present_any_mode,
        -- Child presence: child speech OR child object/face
        MAX(
            CASE 
                WHEN COALESCE(d_spk.CHI, 0) = 1 THEN 1
                WHEN d_obj_ctx.object_class = '0' THEN 1 -- Child person object
                WHEN d_obj_ctx.object_class = '2' THEN 1 -- Child face object
                ELSE 0 
            END
        ) as is_child_present_any_mode
    FROM RandomFrames rf
    LEFT JOIN Detections_with_speaker d_spk 
        ON rf.frame_number = d_spk.frame_number AND rf.video_id = d_spk.video_id
    LEFT JOIN Detections d_obj_ctx -- For object-based context detection
        ON rf.frame_number = d_obj_ctx.frame_number AND rf.video_id = d_obj_ctx.video_id
           AND d_obj_ctx.object_class IN ('0', '1', '2', '3') -- Filter for person/face objects for context
    GROUP BY rf.frame_number, rf.video_id
),
FinalSocialContext AS (
    -- Assign social_context_val based on combined presence flags
    SELECT
        csp.frame_number,
        csp.video_id,
        CASE
            WHEN csp.is_adult_present_any_mode = 1 AND csp.is_child_present_any_mode = 1 THEN 'with adult and child'
            WHEN csp.is_adult_present_any_mode = 1 THEN 'with adult'
            WHEN csp.is_child_present_any_mode = 1 THEN 'with child'
            ELSE 'alone'
        END as social_context_val
    FROM CombinedSocialPresence csp
),
FrameToyInfo AS (
    -- Determine the toy_class and toy_present for each frame
    SELECT
        rf.frame_number,
        rf.video_id,
        CASE
            WHEN COUNT(DISTINCT d_obj.object_class) = 1 THEN MAX(d_obj.object_class) -- If only one distinct toy type (5-10) is present
            WHEN COUNT(DISTINCT d_obj.object_class) > 1 THEN 'multiple'       -- If multiple distinct toy types (5-10) are present
            ELSE 'none'                                                  -- If no toy types (5-10) are present
        END as determined_toy_class,
        CASE
            WHEN COUNT(DISTINCT d_obj.object_class) > 0 THEN 1 -- If any toy object ('5'-'10') is present
            ELSE 0
        END as is_toy_present
    FROM RandomFrames rf
    LEFT JOIN Detections d_obj -- Using Detections table for object classes (toys)
        ON rf.frame_number = d_obj.frame_number
        AND rf.video_id = d_obj.video_id
        AND d_obj.object_class IN ('5', '6', '7', '8', '9', '10') -- Only consider these specific object classes for toys
    GROUP BY rf.frame_number, rf.video_id
)
-- Final selection of columns for the model
SELECT
    rf.video_id,
    rf.frame_number AS frame_id,
    rs.child_id,
    rs.age_at_recording AS age,
    fti.determined_toy_class AS toy_class, 
    fsc.social_context_val AS social_context,
    fti.is_toy_present AS toy_present -- Added toy_present column
FROM 
    RandomFrames rf
    JOIN RandomSubjects rs ON rf.video_id = rs.video_id
    LEFT JOIN FinalSocialContext fsc ON rf.frame_number = fsc.frame_number AND rf.video_id = fsc.video_id
    LEFT JOIN FrameToyInfo fti ON rf.frame_number = fti.frame_number AND rf.video_id = fti.video_id
ORDER BY 
    rs.child_id, rf.video_id, rf.frame_number;
"""

In [None]:
# Load data for question 5.
# sqlite3's own context manager only commits/rolls back a transaction; closing()
# guarantees the connection is closed once the query has been read.
with closing(sqlite3.connect('/home/nele_pauline_suffo/outputs/detection_pipeline_results/detection_results.db')) as conn:
    toys_05_df = pd.read_sql_query(query_05, conn)

# query_05 already returns one row per frame, so no merge step is needed before export
toys_05_df.to_csv('/home/nele_pauline_suffo/outputs/detection_pipeline_results/toys_05_df.csv', index=False)

len(toys_05_df), toys_05_df.head(10)