### Preprocessing  
For each {foldername} in ..\\data\\{train, test, val}, this notebook preprocesses \\{foldername}\\openface_asd_pairs.csv  
and saves the results into \\{foldername}\\preprocessed\\

In [1]:
import pickle, os
import pandas as pd
import numpy as np
from collections import Counter, OrderedDict
from ast import literal_eval
import json
from pathlib import Path
import shutil

In [2]:
# Root folders of the three dataset splits. The paths are built from components
# so the notebook also runs on non-Windows systems (a literal "..\\data\\train"
# would be a single file name on POSIX).
data_root = Path("..") / "data"
folder = [data_root / "train", data_root / "test", data_root / "val"]

# Collect every non-hidden sub-directory of each split. Sorting makes the
# processing order deterministic (directory listing order is unspecified).
subfolder = []
for split_dir in folder:
    subfolder += sorted(
        entry for entry in split_dir.iterdir()
        if not entry.name.startswith('.') and entry.is_dir()
    )

# # if test on a single file
# subfolder=subfolder[0:1]

print(subfolder)

[WindowsPath('../data/train/train_vimeo_562725349'), WindowsPath('../data/train/train_vimeo_611083383'), WindowsPath('../data/test/test_vimeo_414944373'), WindowsPath('../data/val/val_vimeo_429364395')]


In [3]:
# Hyper-parameters for preprocessing

# step 1 (smoothing): remove short silence intervals that are < PP1_th frames
PP1 = True
PP1_th = 25

# step 2 (smoothing): remove short utterance intervals that are < PP2_th frames
PP2 = True
PP2_th = 25

# step 3 (data sample determination)
# for a speaker who starts speaking at time t, the data sample x is determined by
# [t-LEN_th_max, t-1-LEN_exclude], and a sample must contain at least LEN_th_min frames
PP3 = True
LEN_th_max = 10*25
LEN_th_min = 25
# int or 'silence': skip a span just before the next speaker's utterance so that
# the mouth-opening frames do not affect the classifier
LEN_exclude = 'silence'

# step 4 (feature selection)
PP4 = True
# interpolate NaNs in the data samples; if False, rows containing NaNs are removed
InterpolateNaN = True
#### feature definition start ####
gaze_direction_3d = ['gaze_0_x', 'gaze_0_y', 'gaze_0_z', 'gaze_1_x', 'gaze_1_y', 'gaze_1_z']
gaze_direction_2d = ['gaze_angle_x','gaze_angle_y']
eye_landmarks_2d = [f'eye_lmk_x_{i}' for i in range(0, 56)] + [f'eye_lmk_y_{j}' for j in range(0, 56)]
# 3D eye landmarks have X, Y and Z coordinates (the Z columns were missing before)
eye_landmarks_3d = ([f'eye_lmk_X_{i}' for i in range(0, 56)] + [f'eye_lmk_Y_{j}' for j in range(0, 56)]
                    + [f'eye_lmk_Z_{k}' for k in range(0, 56)])
eye_landmards_3d = eye_landmarks_3d  # original (misspelled) name, kept for backward compatibility
head_position = ['pose_Tx', 'pose_Ty', 'pose_Tz', 'pose_Rx', 'pose_Ry', 'pose_Rz']
face_landmark_2d = [f'x_{i}' for i in range(0, 68)] + [f'y_{j}' for j in range(0, 68)]
face_landmark_3d = [f'X_{i}' for i in range(0, 68)] + [f'Y_{j}' for j in range(0, 68)] + [f'Z_{k}' for k in range(0, 68)]
PDM_rigid = ['p_scale', 'p_rx', 'p_ry', 'p_rz', 'p_tx', 'p_ty']
PDM_nonrigid = [f'p_{i}' for i in range(0, 34)]
FAU_intensity = ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
                 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r']
# presence columns all carry the '_c' suffix; the last entry used to be the
# intensity column 'AU45_r' by mistake
FAU_presence = ['AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c',
                'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']
info_col = ['frame','bbox_xmin','bbox_ymin','bbox_xmax','bbox_ymax'] # frame_id and bbox_ASD's position
#### feature definition end ####
output_col = gaze_direction_2d + FAU_intensity # this line selects features


Load openface_asd_pairs.csv and create turn.csv with the columns [frame, [speaker_id], [listener_id]]

In [4]:
input_file = 'openface_asd_pairs.csv'
output_file = "turn.csv"
output_folder = "preprocessed"

# For every video folder, turn the per-(frame, track) rows of the input CSV into
# one row per frame: [frame, speaker, listener], where
#   speaker  = track ids whose ASDscore is > 0 in that frame,
#   listener = all other (non-NaN) track ids in that frame.
for i in subfolder:
    df = pd.read_csv(i / input_file)

    frame_id, speaker, listener = [], [], []
    row_of_frame = {}  # frame number -> row position in the three lists above
    for frame, track, score in zip(df['frame'], df['track_id'], df['ASDscore']):
        frame = int(frame)
        if frame not in row_of_frame:  # first row seen for this frame
            row_of_frame[frame] = len(frame_id)
            frame_id.append(frame)
            speaker.append([])
            listener.append([])
        if np.isnan(track):  # row without a track id: the frame gets empty lists
            continue
        target = speaker if score > 0 else listener
        # Store plain Python ints: the lists are written to CSV as text and parsed
        # back with literal_eval, which cannot read numpy scalar reprs such as
        # "np.int64(3)" (the repr used by NumPy >= 2).
        target[row_of_frame[frame]].append(int(track))

    folder_path = i / output_folder
    Path(folder_path).mkdir(exist_ok=True)
    pd.DataFrame({'frame':frame_id, 'speaker':speaker, 'listener':listener}).to_csv(folder_path / output_file, index=False)

Remove short silence intervals  

In [5]:
def add2dict(dictionary, key, value):
    """Append `value` to the list stored under `key` in `dictionary` (in place).

    A missing key is created as a one-element list. An existing entry that is
    not a list is replaced by the two-element list [old_entry, value].
    """
    if key in dictionary:
        current = dictionary[key]
        if type(current) == list:
            current.append(value)
        else:
            dictionary[key] = [current, value]
    else:
        dictionary[key] = [value]

def find_utternce_or_silence(speaker, frame_id, mode): #mode='utterance or silence'
    """Find per-speaker utterance or silence intervals, expressed as row indices.

    speaker  : list with one entry per row, each a list of the ids speaking in that row
    frame_id : list of the same length as `speaker`; only its length is used here
    mode     : 'utterance' -> {speaker_id: [(left_endpoint, right_endpoint), ...]}
               'silence'   -> {speaker_id: [(right_endpoint, next_left_endpoint), ...]}

    A left endpoint is the first row in which an id appears after being absent; a
    right endpoint is the last row before it disappears. Only ids that have at
    least one endpoint of each kind are returned. Any other `mode` prints a
    message and returns an empty dict.
    """
    starts, ends = {}, {}  # {speaker_id: [left endpoints]}, {speaker_id: [right endpoints]}
    for idx in range(1, len(frame_id)):
        before, now = set(speaker[idx-1]), set(speaker[idx])
        # ids present in the previous row but not in the current one stop at idx-1
        for who in list(before - now):
            add2dict(ends, key=who, value=idx-1)
        # ids present in the current row but not in the previous one start at idx
        for who in list(now - before):
            add2dict(starts, key=who, value=idx)

    if mode not in ('utterance', 'silence'):
        print("mode is not correct, please check!")
        return {}

    intervals = {}
    for who in ends:
        if who not in starts:
            continue
        begin, finish = starts[who], ends[who]
        if mode == 'utterance':
            if begin[0] <= finish[0]:
                intervals[who] = list(zip(begin, finish))
            elif len(finish) > 1 and begin[0] <= finish[1]:
                # the first right endpoint precedes the first left endpoint: skip it
                intervals[who] = list(zip(begin, finish[1:]))
        else:
            if finish[0] < begin[0]:
                intervals[who] = list(zip(finish, begin))
            elif len(begin) > 1 and finish[0] < begin[1]:
                # the first left endpoint precedes the first right endpoint: skip it
                intervals[who] = list(zip(finish, begin[1:]))
    return intervals

In [6]:
input_file = "turn.csv"
output_file = "turn_PP1.csv"

# Step 1 (smoothing): a silence of fewer than PP1_th frames between two utterances
# of the same speaker is filled, i.e. the speaker is treated as speaking throughout.
for i in subfolder:
    if not PP1:
        # Step disabled: pass the table through unchanged so the next step still
        # finds its input file and cannot pick up a stale one from an earlier run.
        shutil.copyfile(i / output_folder / input_file, i / output_folder / output_file)
        continue

    df = pd.read_csv(i / output_folder / input_file)
    # the list columns are stored as text in the CSV; parse them back into lists
    speaker = df['speaker'].apply(lambda x: literal_eval(str(x))).tolist()
    listener = df['listener'].apply(lambda x: literal_eval(str(x))).tolist()
    frame_id = df['frame'].tolist()

    d = find_utternce_or_silence(speaker, frame_id, mode='silence')
    # now d is {speaker_id:[(right-endpoint, left-endpoint),(),...]}
    # check the length of each silence interval and fill the short ones
    for key in d:
        for right, left in d[key]:
            assert right < left
            if frame_id[left]-frame_id[right]-1 < PP1_th:
                for j in range(right+1, left):
                    if key not in speaker[j]:
                        speaker[j].append(key)
                    if key in listener[j]:
                        listener[j].remove(key)

    df = pd.DataFrame({'frame':frame_id, 'speaker':speaker, 'listener':listener})
    df.to_csv(i / output_folder / output_file, index=False)

Remove short utterance intervals

In [7]:
input_file = "turn_PP1.csv"
output_file = "turn_PP2.csv"

# Step 2 (smoothing): an utterance of fewer than PP2_th frames is removed, i.e.
# the speaker is moved to the listener list for those rows.
for i in subfolder:
    if not PP2:
        # Step disabled: pass the table through unchanged so the next step still
        # finds its input file and cannot pick up a stale one from an earlier run.
        shutil.copyfile(i / output_folder / input_file, i / output_folder / output_file)
        continue

    df = pd.read_csv(i / output_folder / input_file)
    # the list columns are stored as text in the CSV; parse them back into lists
    speaker = df['speaker'].apply(lambda x: literal_eval(str(x))).tolist()
    listener = df['listener'].apply(lambda x: literal_eval(str(x))).tolist()
    frame_id = df['frame'].tolist()

    d = find_utternce_or_silence(speaker, frame_id, mode='utterance')
    # now d is {speaker_id:[(left-endpoint, right-endpoint),(),...]}
    # check the length of each utterance interval and remove the short ones
    for key in d:
        for left, right in d[key]:
            assert left <= right
            if frame_id[right]+1-frame_id[left] < PP2_th:
                for j in range(left, right+1):
                    if key in speaker[j]:
                        speaker[j].remove(key)
                    if key not in listener[j]:  # never list the same id twice
                        listener[j].append(key)

    df = pd.DataFrame({'frame':frame_id, 'speaker':speaker, 'listener':listener})
    df.to_csv(i / output_folder / output_file, index=False)

Get the training sample information (x_id, x_frame, y_id)

In [8]:
input_file = "turn_PP2.csv"
output_file = "turn.json"
# output a list of {"x_id", "x_frame", "y_id"} records, one per training sample

if PP3:
    for i in subfolder:   # find the position of x,y
        df = pd.read_csv(i / output_folder / input_file)
        # the list columns are stored as text in the CSV; parse them back into lists
        speaker = df['speaker'].apply(lambda x: literal_eval(str(x))).tolist()
        listener = df['listener'].apply(lambda x: literal_eval(str(x))).tolist()
        frame_id = df['frame'].tolist()
        frame_numbers = np.array(frame_id)  # hoisted: used for every sample below

        d = find_utternce_or_silence(speaker, frame_id, mode='utterance')
        # now d is {speaker_id:[(left-endpoint, right-endpoint),(),...]}
        # for each utterance: y_id = speaker_id, the observation window is
        # [left_endpoint-LEN_th_max, left_endpoint-1] (shortened by LEN_exclude),
        # and x_id ranges over every listener seen inside that window
        training_sample = []
        for key in d:
            for left, right in d[key]:
                y_id = key
                # these are row indices, not frame numbers
                x_start = max(left-LEN_th_max, 0)
                x_end = max(left-1, 0)

                if LEN_exclude == 'silence':
                    # walk back over the rows in which nobody speaks
                    while x_end > x_start and speaker[x_end]==[]:
                        x_end -= 1
                else:
                    x_end = max(x_start, x_end - LEN_exclude)

                window = listener[x_start:x_end+1]
                window_idx = np.arange(x_start, x_end+1)
                x_id = list(set().union(*window))
                for x in x_id:
                    # Boolean mask of the rows in which x is listening. The previous
                    # "mask * index, then != 0" trick always discarded row index 0.
                    present = np.array([x in members for members in window], dtype=bool)
                    x_frame = frame_numbers[window_idx[present]] # convert to frame number
                    if len(x_frame) >= LEN_th_min:
                        training_sample.append({"x_id":x, "x_frame":list(map(int, x_frame)), "y_id":y_id})

        with open(i / output_folder / output_file,'w') as fp:
            json.dump(training_sample, fp)

Select rows (frames) and columns (features), and write out the data samples

In [9]:
input_file = "turn.json"
output_file = "training_sample_detail.json"
asd_file = "openface_asd_pairs.csv"
data_folder = "data_sample"
# For each sample listed in turn.json, write a pickle (x_id, y_id, feature array)
# into data_sample/ and record its frames and bounding boxes in
# training_sample_detail.json.

if PP4:
    for i in subfolder:
        print("start processing" + str(i))
        stat=[]

        asd = pd.read_csv(i / asd_file)

        # fail early, with a readable message, if a selected feature is not in the CSV
        missing = [col for col in output_col if col not in asd.columns]
        if missing:
            raise ValueError(f"columns {missing} not found in {i / asd_file}")

        with open(i / output_folder / input_file, "r") as fp:
            samples = json.load(fp)  # [x_id, x_frame, y_id], id is related to ASD

        # start from an empty sample folder; on the first run it does not exist yet,
        # and an unconditional rmtree would raise FileNotFoundError
        sample_dir = i / output_folder / data_folder
        if sample_dir.exists():
            shutil.rmtree(sample_dir)
        sample_dir.mkdir(exist_ok=True)

        for k in range(len(samples)):
            x_id, x_frame, y_id = samples[k]["x_id"], samples[k]["x_frame"], samples[k]["y_id"]
            result = asd.loc[((asd['track_id']==x_id) & (asd['frame'].isin(x_frame))), info_col+output_col]

            # make sure the result is in ascending order of 'frame'
            result = result.sort_values('frame')

            # NaNs in the result mean that openface failed to extract features;
            # either interpolate them or discard the affected rows
            if InterpolateNaN:
                # limit_direction='both' also fills NaNs at the start and end of the sample
                result = result.interpolate(limit_direction='both')
            else:
                result = result.dropna()

            stat.append({"filename":"data_"+str(k), "x_id":x_id, "y_id":y_id, "x_frame":result['frame'].values.tolist(),
                         "x_pos":result[['bbox_xmin','bbox_ymin','bbox_xmax','bbox_ymax']].values.tolist()})
            # keep only the feature columns (everything after the info columns)
            data = np.float32(result.iloc[:,len(info_col):])
            output=(x_id, y_id, data)
            with open(sample_dir / ("data_"+str(k)), "wb") as fp:
                pickle.dump(output, fp)
        with open(i / output_folder / output_file, 'w') as fp:
            json.dump(stat, fp)
        print("finish "+str(i))
        

start processing..\data\train\train_vimeo_562725349
finish ..\data\train\train_vimeo_562725349
start processing..\data\train\train_vimeo_611083383
finish ..\data\train\train_vimeo_611083383
start processing..\data\test\test_vimeo_414944373
finish ..\data\test\test_vimeo_414944373
start processing..\data\val\val_vimeo_429364395
finish ..\data\val\val_vimeo_429364395
