-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_extraction.py
255 lines (213 loc) · 12.7 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
## The main function, preprocess, prepares the dataset from the EyeT4Empathy paper.
## That dataset contains raw eye-tracker data; from it, this function
## extracts several important features per recording.
## The function has two parameters:
## 1. path --> directory that contains all the recordings, either test or control
## 2. fname --> file name of one recording; recommended to come from an 'os.listdir' loop
## The output is a dataframe of size (n_recordings, n_features)
# The libraries needed are:
import pandas as pd
import numpy as np
import os
# ---------------------------------------------------------------------------
def get_avg_sacc_speed(df, group):
    '''
    Return the number of saccades and the average saccade speed of one recording
    param df: Dataframe with the columns 'Recording name', 'Saccade' (0/1 flag per row) and 'Speed'
    param group: value of 'Recording name' that selects the recording session
    return: tuple (number of saccades, mean of the per-saccade NaN-ignoring mean speeds);
            (0, nan) when no saccade with both a detected start and a detected end exists
    '''
    temp = df[df['Recording name'] == group]
    diff = temp['Saccade'].diff().values
    # changes from 0->1 (i.e., diff = 1) mean start of saccade; changes from 1->0 (i.e. diff = -1) mean end of saccade
    start_idx = np.where(diff == 1)[0]
    end_idx = np.where(diff == -1)[0]
    if start_idx.size == 0:
        # No saccade onset in this recording (or the recording is empty)
        return 0, np.nan
    # Discard ends that come before the first start: they close a saccade
    # that was already running on the first row, so its start is unknown.
    end_idx = end_idx[end_idx > start_idx[0]]
    speed_values = temp['Speed'].values
    speeds = []
    # zip() stops at the shorter array, so a saccade still open at the end of
    # the recording (start without end) is ignored.
    for start, end in zip(start_idx, end_idx):
        assert end > start
        # NOTE(review): the slice runs through row `end`, i.e. it includes the
        # first row after the saccade - kept as in the original, confirm intent.
        speeds.append(np.nanmean(speed_values[start:end + 1]))  # average speed/saccade
    if not speeds:
        return 0, np.nan
    return len(speeds), np.nanmean(np.asarray(speeds))  # number of saccades and average speed across saccades
def get_avg_fix_duration(df, group, srate=120):
    '''
    Return the number of fixations and the average fixation duration of one recording
    param df: Dataframe with original recording (needs the columns 'Recording name' and 'Fixation', a 0/1 flag per row)
    param group: name of recording session
    param srate: sampling rate of data (default in dataset is 120 Hz)
    return: tuple (number of fixations, mean duration in rows/srate);
            (0, nan) when no fixation with both a detected start and a detected end exists
    '''
    temp = df[df['Recording name'] == group]
    diff = temp['Fixation'].diff().values
    # changes from 0->1 (i.e., diff = 1) indicate start of fixation; changes from 1->0 (i.e. diff = -1) indicate end of fixation
    start_idx = np.where(diff == 1)[0]
    end_idx = np.where(diff == -1)[0]
    if start_idx.size == 0:
        # No fixation onset in this recording (or the recording is empty)
        return 0, np.nan
    # Discard ends that come before the first start: they close a fixation
    # that was already running on the first row, so its start is unknown.
    end_idx = end_idx[end_idx > start_idx[0]]
    durations = []
    # zip() stops at the shorter array, so a fixation still open at the end of
    # the recording (start without end) is ignored.
    for start, end in zip(start_idx, end_idx):
        assert end > start
        # NOTE(review): `end` is the first row after the fixation, so this
        # counts one row more than the flagged rows - kept as in the original.
        durations.append((end - start + 1) / srate)  # duration of fixation (number of rows/sampling rate)
    if not durations:
        return 0, np.nan
    return len(durations), np.asarray(durations).mean()  # number of fixations and average duration across fixations
def get_unclassified_count(df, group, srate=120):
    '''
    Return the number of unclassified episodes and their average duration for one recording
    param df: Dataframe with original recording (needs the columns 'Recording name' and 'Unclassified', a 0/1 flag per row)
    param group: name of recording session
    param srate: sampling rate of data, used to turn row counts into durations
    return: tuple (number of unclassified episodes, mean duration in rows/srate);
            (0, nan) when no episode with both a detected start and a detected end exists
    '''
    temp = df[df['Recording name'] == group]
    diff = temp['Unclassified'].diff().values
    # changes from 0->1 (i.e., diff = 1) indicate start of unclassified; changes from 1->0 (i.e. diff = -1) indicate end of unclassified
    start_idx = np.where(diff == 1)[0]
    end_idx = np.where(diff == -1)[0]
    if start_idx.size == 0:
        # No unclassified onset in this recording (or the recording is empty)
        return 0, np.nan
    # Discard ends that come before the first start: they close an episode
    # that was already running on the first row, so its start is unknown.
    end_idx = end_idx[end_idx > start_idx[0]]
    durations = []
    # zip() stops at the shorter array, so an episode still open at the end of
    # the recording (start without end) is ignored.
    for start, end in zip(start_idx, end_idx):
        assert end > start
        # NOTE(review): `end` is the first row after the episode, so this
        # counts one row more than the flagged rows - kept as in the original.
        durations.append((end - start + 1) / srate)  # duration of episode (number of rows/sampling rate)
    if not durations:
        return 0, np.nan
    return len(durations), np.asarray(durations).mean()  # number of episodes and average duration across episodes
# ---------------------------------------------------------------------------
def preprocess(path, fname):
    '''
    Read one raw recording file and return one row of features per recording
    param path: directory that holds the raw .tsv files
    param fname: file name inside path; characters [13:-4] of it must be the participant number
    return: Dataframe indexed by 'Recording name' (integer code in order of first appearance)
            with one column per extracted feature; 'Empathy Score' is filled with 0 as a placeholder
    raises AssertionError: when the participant number in the file name differs from the one
            parsed from the 'Participant name' column
    '''
    # ------------------------------------------
    # READING FILE
    # ------------------------------------------
    # Read the .tsv file that contains the raw data of the participant
    df_table = pd.read_table(os.path.join(path, fname), sep='\t', low_memory=False)

    # Remove calibration points in recording: keep only the rows between each
    # 'ImageStimulusStart' event and its matching 'ImageStimulusEnd' event.
    # np.where on the mask yields row positions, which is what .iloc expects.
    startPoints = np.where((df_table['Event'] == 'ImageStimulusStart').values)[0]
    endPoints = np.where((df_table['Event'] == 'ImageStimulusEnd').values)[0]

    # Store only image stimulus (collect the slices, concatenate once)
    trials = []
    for i, start in enumerate(startPoints):
        end = endPoints[i]
        trials.append(df_table.iloc[start:end + 1])
    df = pd.concat(trials) if trials else pd.DataFrame()

    # Select correctly the participant in loop
    partiName = int(fname[13:-4])
    print('Participant #', partiName)

    # Features we are keeping
    df_col = ['Recording timestamp', 'Participant name', 'Recording name', 'Recording duration',
              'Pupil diameter left', 'Pupil diameter right', 'Gaze point X (MCSnorm)', 'Gaze point Y (MCSnorm)',
              'Eye movement type', 'Gaze event duration', 'Fixation point X (MCSnorm)', 'Fixation point Y (MCSnorm)']

    # Remove unnecessary columns; copy so the column assignments below act on
    # an independent frame instead of a slice of df
    df_features = df[df_col].copy()

    # ------------------------------------------
    # Feature processing and correction
    # ------------------------------------------
    # Change Recording name to integer (code = order of first appearance).
    # Only this column is touched, other columns can never be rewritten by accident.
    record_name = df_features['Recording name'].unique()
    name_to_code = {rec: code for code, rec in enumerate(record_name)}
    df_features['Recording name'] = df_features['Recording name'].map(name_to_code)

    # Change Participant name to integer
    prev = df_features['Participant name'].unique().tolist()
    part_name = int(prev[-1][13:15])
    # Check that we're saving the right participant name (one is from the filename, the other is from the file)
    assert part_name == partiName, "Participant numbers don't match! %d != %d" % (partiName, part_name)
    # Every value of the column is one of `prev`, so replacing them all by
    # part_name is the same as assigning the constant
    df_features['Participant name'] = part_name

    # Rows without a usable eye movement type ("EyesNotFound" or missing) count as "Unclassified"
    df_features['Eye movement type'] = (df_features['Eye movement type']
                                        .fillna("Unclassified")
                                        .replace("EyesNotFound", "Unclassified"))

    # Columns that need to be changed from object to float
    objColumns = ['Pupil diameter left', 'Pupil diameter right', 'Gaze point X (MCSnorm)',
                  'Gaze point Y (MCSnorm)', 'Fixation point X (MCSnorm)', 'Fixation point Y (MCSnorm)']
    # Change decimal commas to decimal points and convert to float64.
    # A column that was already parsed as numeric has no .str accessor, so it is only cast.
    for feature in objColumns:
        column = df_features[feature]
        if not pd.api.types.is_numeric_dtype(column):
            column = column.str.replace(',', '.', regex=False)
        df_features[feature] = column.astype(float)

    # Create distance and time columns for SPEED and ACC
    # The original author labels the result as seconds - assumes the raw
    # timestamps are in microseconds; TODO confirm against the export settings.
    df_features['Time'] = pd.to_datetime(df_features['Recording timestamp']).astype(np.int64) / int(1e6)
    df_features['Delta Time'] = df_features['Time'].diff()
    df_features['Position'] = np.sqrt(df_features['Gaze point X (MCSnorm)']**2 + df_features['Gaze point Y (MCSnorm)']**2)
    df_features['Speed'] = df_features['Position'].diff() / df_features['Delta Time']
    df_features['Acceleration'] = df_features['Speed'].diff() / df_features['Delta Time']

    # One 0/1 flag column per eye movement type, consumed by the event helpers below
    eye_type = df_features['Eye movement type']
    df_features['Fixation'] = (eye_type == 'Fixation').astype(int)
    df_features['Saccade'] = (eye_type == 'Saccade').astype(int)
    df_features['Unclassified'] = (eye_type == 'Unclassified').astype(int)

    # ------------------------------------------------
    # Group by recording and extracting new features
    # -----------------------------------------------
    rows = []
    for name, group in df_features.groupby('Recording name'):
        # Creation of features from big dataset
        recDur = group['Recording duration'].iloc[0]  # first value of the recording
        gazeAvg = group['Gaze event duration'].mean()
        # The helpers filter on 'Recording name' == name; handing them the group
        # selects the same rows as handing them the whole frame, on less data.
        num_fixations, avg_fix_duration = get_avg_fix_duration(group, name, srate=120)
        num_saccades, avg_sacc_speed = get_avg_sacc_speed(group, name)
        num_unclassified, avg_unclassified_duration = get_unclassified_count(group, name, srate=120)

        # Dictionary with features extracted (keys are kept exactly as they were,
        # including their spelling, because they become the output column names)
        feature_dict = {'Recording name' : name,
                        'Participant name' : partiName,
                        'Mean Pupil diameter left' : group['Pupil diameter left'].mean(),
                        'Std Pupil diameter left' : group['Pupil diameter left'].std(),
                        'Min Pupil diamater left' : group['Pupil diameter left'].min(),
                        'Max Pupil diamater left' : group['Pupil diameter left'].max(),
                        'Mean Pupil diameter right': group['Pupil diameter right'].mean(),
                        'Std Pupil diameter right' : group['Pupil diameter right'].std(),
                        'Min Pupil diamater right' : group['Pupil diameter right'].min(),
                        'Max Pupil diamater right' : group['Pupil diameter right'].max(),
                        'Num. of Fixations' : num_fixations,
                        'Num. of Saccades' : num_saccades,
                        'Num. of Unclassified' : num_unclassified,
                        'Recording duration (s)' : (recDur/1000),
                        'Mean Gaze event duration (s)': (gazeAvg/1000),
                        'Mean Fixation point X' : group['Fixation point X (MCSnorm)'].mean(),
                        'Std Fixation point X' : group['Fixation point X (MCSnorm)'].std(),
                        'Mean Fixation point Y' : group['Fixation point Y (MCSnorm)'].mean(),
                        'Std Fixation point Y' : group['Fixation point Y (MCSnorm)'].std(),
                        'Mean Gaze point X' : group['Gaze point X (MCSnorm)'].mean(),
                        'Std Gaze point X' : group['Gaze point X (MCSnorm)'].std(),
                        'Mean Gaze point Y' : group['Gaze point Y (MCSnorm)'].mean(),
                        'Std Gaze point Y' : group['Gaze point Y (MCSnorm)'].std(),
                        'Speed' : group['Speed'].mean(),
                        'Acceleration' : group['Acceleration'].mean(),
                        'Avg Saccade Speed' : avg_sacc_speed,
                        'Avg Fix Duration' : avg_fix_duration,
                        'Avg Unclassif Duration' : avg_unclassified_duration,
                        'Empathy Score' : 0
                        }
        rows.append(feature_dict)

    # Build the feature dataframe in one go (DataFrame.append no longer exists
    # in pandas >= 2.0 and was quadratic when called in a loop)
    df_recordings = pd.DataFrame(rows)
    # Set the recording name as the index
    df_recordings.set_index('Recording name', inplace=True)
    # Rows that contain nan values are kept on purpose; the caller decides
    # whether an unsuccessful recording should be dropped.
    return df_recordings
#---------------------------------------------------------------------------------------------------
## Function that becomes helpful for the appending of the target in the df
## Transforms a list of list into a single list, the parameter is the list of lists
def flatten(l):
    """Return one flat list with the items of every inner iterable of ``l``, in order."""
    flat = []
    for inner in l:
        # extend() appends the items of `inner` one by one, in iteration order
        flat.extend(inner)
    return flat
# ------------------------------------------------------------------------
# This function automates the selection of the participant group
def select_group(grp, rawdata_dir=None):
    '''
    Return the raw-data directory and the numeric code of a participant group
    param grp: 'test' or 'control'
    param rawdata_dir: optional directory that contains the 'test' and 'control' sub-directories;
                       when omitted, the original author's hard-coded Windows location is used
    return: tuple (path ending with a path separator, group code: 1 for 'test', 2 for 'control')
    raises ValueError: when grp is neither 'test' nor 'control'
    '''
    group_codes = {'test': 1, 'control': 2}
    if grp not in group_codes:
        # Previously this fell through to the return statement and failed
        # there with an UnboundLocalError.
        raise ValueError("Unknown group %r: expected 'test' or 'control'" % (grp,))
    if rawdata_dir is None:
        # Legacy default, kept byte-for-byte so existing callers see the same string
        path = ('C:\\Users\\mverd\\Desktop\\IMD\\ESSEX\\TERM2\\Modules\\Data Science and Decision Making'
                '\\Assignment2_Final\\rawdata\\' + grp + '\\')
    else:
        # The trailing '' makes join() end the path with a separator, so
        # callers can keep doing `path + fname`
        path = os.path.join(rawdata_dir, grp, '')
    return path, group_codes[grp]
#---------------------------------------------------------------------------------
## This function reads the questionnaire data (currently only the first
## questionnaire file) and creates the label for the model to predict.
## The parameters are the path where the questionnaires are stored and the
## currently selected group (test or control)
def label(pathQ, groupSelect):
    '''
    Return the scores to predict for one participant group
    param pathQ: directory that holds the questionnaire .csv files
    param groupSelect: 1 selects rows 0, 2, ..., 58 (test group); 2 selects rows 1, 3, ..., 59 (control group)
    return: Series with the selected scores, re-indexed from 0
    raises ValueError: when groupSelect is neither 1 nor 2
    '''
    if groupSelect == 1:
        first_row = 0
    elif groupSelect == 2:
        first_row = 1
    else:
        # Previously this failed later with an UnboundLocalError
        raise ValueError("Unknown groupSelect %r: expected 1 (test) or 2 (control)" % (groupSelect,))

    # os.listdir() returns names in arbitrary order; sort so the same file
    # (the alphabetically first one) is read on every platform and run.
    questionnaire_files = sorted(os.listdir(pathQ))
    # Only the first questionnaire is read
    quest1 = pd.read_csv(os.path.join(pathQ, questionnaire_files[0]), encoding='unicode_escape', low_memory=False)

    # Labels to predict are in the second-to-last column (described by the
    # original author as the original score, before the experiment)
    score = quest1.iloc[:, -2]

    # Participants of the two groups alternate row by row (even rows / odd rows)
    list_par = list(range(first_row, 60, 2))
    # Assign only the desired values
    labels = score.loc[list_par]
    # Drop index so that we always have from 0-->29
    return labels.reset_index(drop=True)