### This notebook calculates statistical values (mean and standard deviation) for the features extracted from the participant responses of SuperBAD

In [1]:
import re
import os
import time
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

This block reads the feature dataset of one participant's individual response and keeps the relevant columns (features) on which the statistical values are to be calculated.

In [2]:
# Load one response file of one participant; its columns are used below to name the statistics
participant_df = pd.read_csv('./csv_features_clean/1499/ch1_1.csv')

# Columns of the feature dataset for which no statistic is computed
cols_to_exclude = ['frame', 'face_id', 'timestamp']

# Identifying columns come first in the statistical dataset, followed by an
# average (_avg) / standard deviation (_std) pair for every retained feature,
# in the same order as the features appear in the feature dataset
feature_columns = ['participant_id', 'class', 'video']
feature_columns += [
    stat_name
    for feature in participant_df.columns
    if feature not in cols_to_exclude
    for stat_name in (f'{feature}_avg', f'{feature}_std')
]

feature_columns

['participant_id',
 'class',
 'video',
 'confidence_avg',
 'confidence_std',
 'success_avg',
 'success_std',
 'gaze_0_x_avg',
 'gaze_0_x_std',
 'gaze_0_y_avg',
 'gaze_0_y_std',
 'gaze_0_z_avg',
 'gaze_0_z_std',
 'gaze_1_x_avg',
 'gaze_1_x_std',
 'gaze_1_y_avg',
 'gaze_1_y_std',
 'gaze_1_z_avg',
 'gaze_1_z_std',
 'gaze_angle_x_avg',
 'gaze_angle_x_std',
 'gaze_angle_y_avg',
 'gaze_angle_y_std',
 'eye_lmk_x_0_avg',
 'eye_lmk_x_0_std',
 'eye_lmk_x_1_avg',
 'eye_lmk_x_1_std',
 'eye_lmk_x_2_avg',
 'eye_lmk_x_2_std',
 'eye_lmk_x_3_avg',
 'eye_lmk_x_3_std',
 'eye_lmk_x_4_avg',
 'eye_lmk_x_4_std',
 'eye_lmk_x_5_avg',
 'eye_lmk_x_5_std',
 'eye_lmk_x_6_avg',
 'eye_lmk_x_6_std',
 'eye_lmk_x_7_avg',
 'eye_lmk_x_7_std',
 'eye_lmk_x_8_avg',
 'eye_lmk_x_8_std',
 'eye_lmk_x_9_avg',
 'eye_lmk_x_9_std',
 'eye_lmk_x_10_avg',
 'eye_lmk_x_10_std',
 'eye_lmk_x_11_avg',
 'eye_lmk_x_11_std',
 'eye_lmk_x_12_avg',
 'eye_lmk_x_12_std',
 'eye_lmk_x_13_avg',
 'eye_lmk_x_13_std',
 'eye_lmk_x_14_avg',
 'eye_lmk_x_14

In [3]:
# Create the (initially empty) statistical dataset, one column per entry of
# feature_columns, and display it to inspect the resulting schema
stats_df = pd.DataFrame(data=None, columns=feature_columns)
stats_df

Unnamed: 0,participant_id,class,video,confidence_avg,confidence_std,success_avg,success_std,gaze_0_x_avg,gaze_0_x_std,gaze_0_y_avg,...,AU23_c_avg,AU23_c_std,AU25_c_avg,AU25_c_std,AU26_c_avg,AU26_c_std,AU28_c_avg,AU28_c_std,AU45_c_avg,AU45_c_std


The code below ignores redundant files and builds the list of participant directories and, for each participant, the list of feature response files.
<br>
It iterates over each *participant* and all of their extracted feature datasets (e.g.: ch1.csv, ch2.csv, .., cr1.csv, cr2.csv, .., fr1.csv, fr2.csv, .., fh1.csv, fh2.csv, ..) and, for each dataset, calculates the mean and standard deviation (statistics) of each feature column and stores them as one row in *stats_df*.

In [4]:
def _response_stats(response_df, participant, response):
    """Build one row of statistics for a single response file.

    Parameters
    ----------
    response_df : pd.DataFrame
        Feature dataset of one response. The first three columns are skipped
        by position; every remaining column is summarised.
    participant : str
        Name of the participant directory, stored as 'participant_id'.
    response : str
        File name of the response. Its first two characters are stored as
        'class' and its name without extension as 'video'.

    Returns
    -------
    dict
        The identifying keys followed by '<col>_avg' / '<col>_std' for every
        summarised column.
    """
    # Identifying fields are set once up front, so they are present even when
    # the file has no feature columns beyond the first three
    row = {
        'participant_id': participant,
        'class': response[:2],
        'video': os.path.splitext(response)[0],
    }
    for col in response_df.columns[3:]:
        col_values = response_df[col].values
        # NumPy reductions: std is the population standard deviation (ddof=0)
        # and NaN values propagate into the result
        row[f'{col}_avg'] = col_values.mean()
        row[f'{col}_std'] = col_values.std()
    return row


# perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
start_time = time.perf_counter()

files_to_exclude = ['.DS_Store']

featurePath = './csv_features_clean'
participants = sorted(file for file in os.listdir(featurePath) if file not in files_to_exclude)

# Collect one dict per response and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration
stats_records = []

with tqdm(total=len(participants)) as pbar:
    for participant in participants:
        # Label the bar with the participant currently being processed
        pbar.set_description(f'Participant ID = {participant}')
        participant_directory = f'{featurePath}/{participant}'

        # Sort by the prefix before '_' and break ties on the full file name:
        # os.listdir returns entries in arbitrary order, so the prefix alone
        # would leave e.g. ch1_1.csv / ch1_2.csv in a non-reproducible order
        responses = sorted(
            (file for file in os.listdir(participant_directory) if file not in files_to_exclude),
            key=lambda x: (x.split('_')[0], x),
        )

        for response in responses:
            response_df = pd.read_csv(f'{participant_directory}/{response}')
            stats_records.append(_response_stats(response_df, participant, response))

        pbar.update(1)

# Rebuild stats_df from scratch (re-running this cell no longer appends
# duplicate rows). Columns keep the order already defined on stats_df; any
# statistic not declared there is appended at the end rather than dropped.
stats_rows = pd.DataFrame(stats_records)
extra_columns = [col for col in stats_rows.columns if col not in stats_df.columns]
stats_df = stats_rows.reindex(columns=[*stats_df.columns, *extra_columns])

end_time = time.perf_counter()
total_time = end_time - start_time
print(f'Total Time = {total_time} seconds')

Participant ID = 9214: 100%|██████████████████████| 29/29 [00:37<00:00,  1.30s/it]

Total Time = 37.663419008255005 seconds





In [5]:
# Write the statistics of all participants to disk; the row index is not saved
stats_output_path = './allParticipant_feature_stats.csv'
stats_df.to_csv(stats_output_path, index=False)