# Pooling Touch Annotations
#### from https://github.com/psychoinformatics-de/studyforrest-paper-bodycontactannotation/tree/master/data



Align annotations into 2-second timesteps. If multiple annotations fall into the same timestep, keep all of them so they can be processed later.


In [1]:
import pandas as pd
import glob
import numpy as np
import math 

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the raw annotation files are always processed in the same order.
file_paths = sorted(glob.glob('/Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/raw/*.csv'))

# Every actor/recipient value seen while expanding (filled in by expand_to_timesteps).
all_characters = set()


def expand_to_timesteps(df, max_time):
    '''Expand an annotation table into 2-second timesteps.

    For every timestep ts in 0, 2, 4, ... (generated with
    np.arange(0, max_time + 2.0, 2.0)) one output row is produced per
    annotation whose floored start is <= ts and whose floored end is >= ts,
    so overlapping annotations yield several rows for the same timestep.
    Timesteps without any matching annotation get a single placeholder row
    whose label is 'NONE' and whose other annotation fields are None.

    Side effect: the actor and recipient of every matching annotation are
    added to the module-level set all_characters.

    Parameters:
        df: DataFrame with 'start', 'end' and the annotation columns listed
            in annotation_columns below.
        max_time: upper bound used to generate the timesteps.

    Returns:
        A list of dicts (one per output row), each with a 'timestep' key
        followed by the annotation columns.
    '''
    annotation_columns = (
        'actor',
        'recipient',
        'bodypart_actor',
        'bodypart_recipient',
        'label',
        'intensity_of_body_contact',
        'valence_actor',
        'valence_recipient',
        'intention',
        'audio_information',
    )

    # The floored bounds do not depend on the timestep, so compute them once
    # instead of re-applying math.floor to both columns on every iteration.
    start_floor = df['start'].apply(math.floor)
    end_floor = df['end'].apply(math.floor)

    # NOTE(review): timesteps are even multiples of 2.0, so an annotation whose
    # floored start and end are the same odd second (e.g. 3.2-3.8) matches no
    # timestep and never appears in the output -- confirm this is intended.
    timesteps = np.arange(0, max_time + 2.0, 2.0)
    expanded_rows = []

    for ts in timesteps:
        active_rows = df[(start_floor <= ts) & (end_floor >= ts)]

        if active_rows.empty:
            # No active row for this timestep, fill with NONE
            placeholder = {'timestep': ts}
            placeholder.update({column: None for column in annotation_columns})
            placeholder['label'] = 'NONE'
            expanded_rows.append(placeholder)
            continue

        for _, row in active_rows.iterrows():
            all_characters.add(row['actor'])
            all_characters.add(row['recipient'])

            expanded = {'timestep': ts}
            expanded.update({column: row[column] for column in annotation_columns})
            expanded_rows.append(expanded)

    return expanded_rows

# Expand every raw annotation file into 2-second timesteps and write the result
# as '<name>_expanded.tsv' one directory above the 'raw' folder.
from pathlib import Path

expanded_file_paths = []
for file_path in file_paths:
    df = pd.read_csv(file_path)

    # The latest annotation end time bounds the timesteps generated for this file.
    max_time = df['end'].max()

    expanded_data = expand_to_timesteps(df, max_time)

    expanded_df = pd.DataFrame(expanded_data)

    # Build the output path from path components rather than str.replace, which
    # would rewrite every '/raw' or '.csv' substring anywhere in the path.
    source = Path(file_path)
    output_dir = source.parent.parent if source.parent.name == 'raw' else source.parent
    output_path = str(output_dir / f'{source.stem}_expanded.tsv')
    expanded_file_paths.append(output_path)
    expanded_df.to_csv(output_path, index=False, sep='\t')

    print(f'Processed and saved: {output_path}')

print(all_characters)




Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs5_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs4_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs6_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs7_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs3_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs2_expanded.tsv
Processed and saved: /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs1_expanded.tsv
{'DAN', 'WOMAN', 'CROWD', 'BUBBA', 'MRS_GUMP', nan, 'OLDMAN', 'OLDWOMAN', 'FORREST', 'MEN', 'BOY', 'CHILDREN', 'JENNY'}


In [3]:
import pandas as pd
import glob

# Load the expanded per-file tables from disk. Reading the (sorted) glob result
# keeps this cell independent of variables left over from the expansion cell
# and makes the row order of combined_df reproducible.
file_paths = sorted(glob.glob('/Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/*_expanded.tsv'))

dataframes = [pd.read_csv(file, sep='\t') for file in file_paths]
combined_df = pd.concat(dataframes, ignore_index=True)

max_ts = combined_df['timestep'].max()
# np.arange excludes its stop value, so extend the range by one step to keep
# the final timestep (max_ts) in the vote.
timesteps = np.arange(0, max_ts + 2.0, 2.0)
count = 0  # not read by the code in this cell

# Ordered by priority: earlier entries win ties in the popular vote.
main_characters = ['FORREST', 'JENNY', 'DAN', 'BUBBA', 'MRS_GUMP']
popular_rows = []

def character_priority(character):
    '''Rank a character by its position in main_characters.

    Returns the character's index in main_characters (0 is the highest
    priority), or float('inf') when the character is not in that list.
    '''
    try:
        return main_characters.index(character)
    except ValueError:
        # list.index raises ValueError for values that are not in the list;
        # such characters get the lowest possible priority.
        return float('inf')

def has_main_character(group):
    '''Return the best (numerically smallest) main-character priority of a group.

    Only the first row of the group is inspected: its actor and recipient are
    ranked with character_priority and the smaller of the two ranks is returned.
    '''
    return min(
        character_priority(group[role].iloc[0])
        for role in ('actor', 'recipient')
    )
    

def bodypart_occurrences(df, col, num=15):
    '''Return the num most frequent body parts in column col as one string.

    Each non-missing cell of df[col] is split on whitespace and every token is
    counted. The tokens are then ordered by descending count (tokens with the
    same count keep the order in which they were first seen) and the first num
    of them are joined with single spaces. An empty string is returned when
    nothing was counted.
    '''
    tally = {}
    for cell in df[col]:
        if pd.isna(cell):
            continue
        for token in cell.split():
            tally[token] = tally.get(token, 0) + 1

    # sorted() is stable, so equally frequent tokens stay in first-seen order.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ' '.join(ranked[:num])


def valence_avg(df, col):
    '''Average the valence labels in column col and map the mean back to a label.

    Labels are scored 0 (STRONG_NEGATIVE) to 3 (STRONG_POSITIVE); missing
    values are skipped. The mean score is rounded with the built-in round()
    (which sends exact halves to the nearest even integer) and converted back
    to the label with that score. Returns None when the column holds no
    non-missing value. A label outside the scale raises KeyError.
    '''
    scale = {
        'STRONG_NEGATIVE': 0,
        'NEGATIVE': 1,
        'POSITIVE': 2,
        'STRONG_POSITIVE': 3,
    }

    scores = [scale[value] for value in df[col] if not pd.isna(value)]
    if not scores:
        return None

    labels = list(scale)
    return labels[round(sum(scores) / len(scores))]


def get_max_df(duplicates, ts_rows):
    '''Pick the winning actor-recipient group for one timestep.

    Parameters:
        duplicates: list of (key, DataFrame) pairs, one per actor-recipient
            group of the timestep.
        ts_rows: all rows of the timestep.

    Returns:
        The DataFrame of the largest group, or None when duplicates is empty.
        Groups of equal size are resolved in favour of the pair with the best
        character_priority, because the groups are sorted by priority first
        and max() keeps the first of several maximal elements. If the winning
        group's first label is 'NONE' and some rows involve a main character,
        the vote is repeated on those rows only.
    '''
    def _pair_priority(entry):
        # Best main-character rank among the group's first actor and recipient.
        frame = entry[1]
        return min(character_priority(frame['actor'].iloc[0]), character_priority(frame['recipient'].iloc[0]))

    def _largest_group(entries):
        # Sort by priority first so that size ties go to the higher-priority pair.
        ranked = sorted(entries, key=_pair_priority)
        return max(ranked, key=lambda entry: len(entry[1]))[1]

    if not duplicates:
        return None

    max_df = _largest_group(duplicates)
    if max_df['label'].iloc[0] != 'NONE':
        return max_df

    # 'NONE' won the vote: prefer annotations that involve a main character, if any.
    involves_main = ts_rows['actor'].isin(main_characters) | ts_rows['recipient'].isin(main_characters)
    main_character_rows = ts_rows[involves_main]
    if main_character_rows.empty:
        return max_df

    regrouped = list(main_character_rows.groupby(['actor', 'recipient'], dropna=False))
    return _largest_group(regrouped)


def _first_mode(series):
    '''Return the first mode of series, or None when mode() yields nothing.'''
    modes = series.mode()
    return modes.iloc[0] if not modes.empty else None


# Take a popular vote per timestep; if there is a tie or NONE wins, give
# priority to main characters (see get_max_df).
for ts in timesteps:
    ts_rows = combined_df[combined_df['timestep'] == ts]
    duplicates = list(ts_rows.groupby(['actor', 'recipient'], dropna=False))

    # Get the max_df based on actor-recipient pair, with main character priority
    max_df = get_max_df(duplicates, ts_rows)

    if max_df is None:
        # No rows at all for this timestep: emit a placeholder row.
        popular_rows.append({
            'timestep': ts,
            'actor': None,
            'recipient': None,
            'bodypart_actor': None,
            'bodypart_recipient': None,
            'label': 'NONE',
            'intensity_of_body_contact': None,
            'valence_actor': None,
            'valence_recipient': None,
            'intention': None,
            'audio_information': None
        })
        continue

    # get top 3 body parts for actor and recipient
    bodypart_actor = bodypart_occurrences(max_df, 'bodypart_actor', 3)
    bodypart_recipient = bodypart_occurrences(max_df, 'bodypart_recipient', 3)

    # take average of valences
    valence_actor = valence_avg(max_df, 'valence_actor')
    valence_recipient = valence_avg(max_df, 'valence_recipient')

    # all other columns take mode
    popular_rows.append({
        'timestep': ts,
        'actor': max_df['actor'].iloc[0],
        'recipient': max_df['recipient'].iloc[0],
        'bodypart_actor': bodypart_actor,
        'bodypart_recipient': bodypart_recipient,
        'label': _first_mode(max_df['label']),
        'intensity_of_body_contact': _first_mode(max_df['intensity_of_body_contact']),
        'valence_actor': valence_actor,
        'valence_recipient': valence_recipient,
        'intention': _first_mode(max_df['intention']),
        'audio_information': _first_mode(max_df['audio_information'])
    })


# Convert the list of dictionaries to a DataFrame
popular_df = pd.DataFrame(popular_rows)

# Save to TSV file
output_path = '/Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs_pop_vote.tsv'
popular_df.to_csv(output_path, index=False, sep='\t')

print(f'Saved popular vote results to {output_path}')


Saved popular vote results to /Users/lucaschoi/Documents/GitHub/DeepEmotion/data/touch/obs_pop_vote.tsv
