In [None]:
"""
  Dataset construction
  author: MP
  date: 4/7/2021
  
  This study uses data from the GECO (Ghent Eye Tracking Corpus) which is available freely for use
  at: https://expsy.ugent.be/downloads/geco/.
  
  The following code extracts the features that are used in this study, and averages over word types.
  
"""

In [27]:
import pandas as pd
import os
import re
import numpy as np
import pickle

In [2]:
# read in the dataset with eye tracking data
# NOTE(review): absolute, machine-specific path — point it at the local copy of the GECO download
PTH_DATA = 'C:/6120-NLP/project/GECO/MonolingualReadingData.csv'
dataset = pd.read_csv(PTH_DATA)

# ids are stored with underscores instead of hyphens to avoid excel data reformatting
def fix_id(w):
    """Return ``w`` converted to a string, with every "-" replaced by "_"."""
    return str(w).replace("-", "_")

# store every WORD_ID as a string with underscores instead of hyphens
dataset['WORD_ID'] = dataset['WORD_ID'].apply(fix_id)

# null values are represented by a dot in the raw data; they are replaced with 0
# every other value is converted to float (some arrive as strings)
def fix_nulls(c):
    """Convert one raw cell to a float, mapping the "." placeholder to 0.0."""
    return 0.0 if c == "." else float(c)

# eye movement columns that go through fix_nulls ("." -> 0, everything else -> float)
columns_to_fix = [
    'WORD_FIXATION_%',
    'WORD_FIXATION_COUNT',
    'WORD_GAZE_DURATION',
    'WORD_RUN_COUNT',
    'WORD_SKIP',
    'WORD_FIRST_FIXATION_DURATION',
    'WORD_SECOND_FIXATION_DURATION',
    'WORD_THIRD_FIXATION_DURATION',
    'WORD_TOTAL_READING_TIME',
    'WORD_TOTAL_READING_TIME_%',
    'WORD_SPILLOVER',
]
for column_name in columns_to_fix:
    dataset[column_name] = dataset[column_name].apply(fix_nulls)

dataset.head()

Unnamed: 0,PP_NR,GROUP,LANGUAGE_RANK,LANGUAGE,PART,TRIAL,TRIAL_FIXATION_COUNT,TRIAL_TOTAL_READING_TIME,WORD_ID_WITHIN_TRIAL,WORD_ID,...,WORD_LAST_FIXATION_RUN,WORD_LAST_FIXATION_TIME,WORD_LAST_FIXATION_X,WORD_LAST_FIXATION_Y,WORD_GO_PAST_TIME,WORD_SELECTIVE_GO_PAST_TIME,WORD_TOTAL_READING_TIME,WORD_TOTAL_READING_TIME_%,WORD_SPILLOVER,WORD_SKIP
0,pp21,monolingual,L1,English,1,5,115,25429,1,1_5_1,...,2,357,115.6,104.6,95,95,381.0,0.015,0.0,0.0
1,pp21,monolingual,L1,English,1,5,115,25429,2,1_5_2,...,3,1392,163.8,107.0,582,296,828.0,0.0326,0.0,1.0
2,pp21,monolingual,L1,English,1,5,115,25429,3,1_5_3,...,2,1957,229.4,96.6,1097,565,565.0,0.0222,0.0,1.0
3,pp21,monolingual,L1,English,1,5,115,25429,4,1_5_4,...,2,2474,356.0,103.8,2107,428,428.0,0.0168,0.0,0.0
4,pp21,monolingual,L1,English,1,5,115,25429,5,1_5_5,...,1,2808,403.2,114.5,154,154,154.0,0.0061,0.0,1.0


In [3]:
# load the key that holds the word and sentence information
PTH_KEY = 'C:/6120-NLP/project/GECO/EnglishMaterial.csv'
keyset = pd.read_csv(PTH_KEY)

# both id columns get the same hyphen -> underscore treatment via fix_id
for id_column in ('WORD_ID', 'SENTENCE_ID'):
    keyset[id_column] = keyset[id_column].apply(fix_id)

keyset.head()

Unnamed: 0,WORD_ID,SENTENCE_ID,CHRON_ID,WORD,PART_OF_SPEECH,CONTENT_WORD,WORD_LENGTH,IA_AREA,IA_TOP,IA_BOTTOM,IA_LEFT,IA_RIGHT
0,1_5_1,1_1,1,The,Article,0,3,3087,93,142,66,129
1,1_5_2,1_1,2,intense,Adjective,1,7,3920,93,142,129,209
2,1_5_3,1_1,3,interest,Noun,1,8,4410,93,142,209,299
3,1_5_4,1_1,4,aroused,Verb,1,7,3969,93,142,299,380
4,1_5_5,1_1,5,in,Preposition,0,2,1421,93,142,380,409


In [4]:
"""
1. Map PART_OF_SPEECH to POS from universal tag set (Petrov et al, 2011) available at:
https://github.com/slavpetrov/universal-pos-tags

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns 
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation

"""

print("Before mapping: ")
print("*******************************************")
print(keyset['PART_OF_SPEECH'].value_counts())

# PART_OF_SPEECH label found in the key -> universal tag.
# Several labels collapse onto the same tag (e.g. 'Article' and 'Determiner' -> 'DET').
universal_tags = {'Verb' : 'VERB',
                  'Noun' : 'NOUN',
                  'Pronoun' : 'PRON',
                  'Preposition' : 'ADP',
                  'Adverb' : 'ADV',
                  'Article' : 'DET',
                  'Conjunction' : 'CONJ',
                  'Adjective' : 'ADJ',
                  'Determiner' : 'DET',
                  'To' : 'ADP',
                  'Name' : 'NOUN',
                  'Interjection' : 'X',
                  'Number' : 'NUM',
                  'Not' : 'PRT',
                  'Ex' : 'ADV',
                  'Letter' : 'X',
                  '.' : 'X',
                  'Unclassified' : 'X'
                 }

print("\nAfter mapping: ")
print("*******************************************")
# Assign the mapped column back rather than calling replace(..., inplace=True) on
# keyset["PART_OF_SPEECH"]: the in-place call mutates an intermediate Series and
# is not guaranteed to update keyset (it does not under pandas Copy-on-Write).
keyset["PART_OF_SPEECH"] = keyset["PART_OF_SPEECH"].replace(universal_tags)
print(keyset['PART_OF_SPEECH'].value_counts())

Before mapping: 
*******************************************
Verb            11963
Noun             9356
Pronoun          8117
Preposition      4553
Adverb           4207
Article          4032
Conjunction      2592
Adjective        2561
Determiner       2394
To               1337
Name             1219
Interjection      654
Number            558
Not               530
Ex                243
Letter             17
.                  16
Unclassified       12
Name: PART_OF_SPEECH, dtype: int64

After mapping: 
*******************************************
VERB    11963
NOUN    10575
PRON     8117
DET      6426
ADP      5890
ADV      4450
CONJ     2592
ADJ      2561
X         699
NUM       558
PRT       530
Name: PART_OF_SPEECH, dtype: int64


In [5]:
# look at the first rows of the key again; PART_OF_SPEECH now holds the mapped tags
keyset.head()

Unnamed: 0,WORD_ID,SENTENCE_ID,CHRON_ID,WORD,PART_OF_SPEECH,CONTENT_WORD,WORD_LENGTH,IA_AREA,IA_TOP,IA_BOTTOM,IA_LEFT,IA_RIGHT
0,1_5_1,1_1,1,The,DET,0,3,3087,93,142,66,129
1,1_5_2,1_1,2,intense,ADJ,1,7,3920,93,142,129,209
2,1_5_3,1_1,3,interest,NOUN,1,8,4410,93,142,209,299
3,1_5_4,1_1,4,aroused,VERB,1,7,3969,93,142,299,380
4,1_5_5,1_1,5,in,ADP,0,2,1421,93,142,380,409


In [6]:
"""
2. Create a file that we will need for clustering of the form:
 - each sentence as a line
 - each word separated by white space
 
"""

# how many distinct SENTENCE_ID values the key contains,
# i.e. the number of sentences the file should end up with
n = keyset['SENTENCE_ID'].nunique(dropna=False)
print(n)

5285


In [7]:
# write file: one sentence per line, every word followed by a single space

file_name = "C:/6120-NLP/project/input.txt"

# Mode "w" truncates an existing file, so it does not have to be removed first.
# The with-block closes the handle even if the loop raises part-way through.
with open(file_name, "w") as sentences:
    previous_id = None
    line = ''
    for sentence_id, word in zip(keyset['SENTENCE_ID'], keyset['WORD']):
        # a change of SENTENCE_ID between consecutive rows closes the current line;
        # `line` is only empty before the very first word, which must not emit a line
        if line and sentence_id != previous_id:
            sentences.write(line + '\n')
            line = ''
        line = line + str(word) + ' '
        previous_id = sentence_id

    # flush the last sentence
    sentences.write(line + '\n')


In [None]:
"""
3. Get eye movement features by token and by type. The features used will be:

-WORD_FIXATION_%
-WORD_FIXATION_COUNT
-WORD_GAZE_DURATION
-WORD_RUN_COUNT
-WORD_TOTAL_READING_TIME
-WORD_TOTAL_READING_TIME_%
-WORD_SKIP
-WORD_SPILLOVER
-WORD_FIRST_FIXATION_DURATION
-WORD_FIRST_FIXATION_DURATION + SECOND + THIRD + LAST / 4 (MEAN)
 
"""

In [8]:
# keep only the identifier columns and the eye movement measures; drop everything else
kept_columns = [
    'PP_NR', 'PART', 'TRIAL', 'WORD_ID', 'WORD',
    'WORD_FIXATION_COUNT', 'WORD_FIXATION_%',
    'WORD_GAZE_DURATION',
    'WORD_FIRST_FIXATION_DURATION', 'WORD_SECOND_FIXATION_DURATION', 'WORD_THIRD_FIXATION_DURATION',
    'WORD_TOTAL_READING_TIME', 'WORD_TOTAL_READING_TIME_%',
    'WORD_SKIP', 'WORD_SPILLOVER', 'WORD_RUN_COUNT',
]
dataset_filtered = dataset[kept_columns]

dataset_filtered.head()

Unnamed: 0,PP_NR,PART,TRIAL,WORD_ID,WORD,WORD_FIXATION_COUNT,WORD_FIXATION_%,WORD_GAZE_DURATION,WORD_FIRST_FIXATION_DURATION,WORD_SECOND_FIXATION_DURATION,WORD_THIRD_FIXATION_DURATION,WORD_TOTAL_READING_TIME,WORD_TOTAL_READING_TIME_%,WORD_SKIP,WORD_SPILLOVER,WORD_RUN_COUNT
0,pp21,1,5,1_5_1,The,2.0,0.0174,95.0,95.0,286.0,0.0,381.0,0.015,0.0,0.0,2.0
1,pp21,1,5,1_5_2,intense,3.0,0.0261,54.0,54.0,242.0,532.0,828.0,0.0326,1.0,0.0,3.0
2,pp21,1,5,1_5_3,interest,2.0,0.0174,333.0,333.0,232.0,0.0,565.0,0.0222,1.0,0.0,2.0
3,pp21,1,5,1_5_4,aroused,3.0,0.0261,78.0,78.0,215.0,135.0,428.0,0.0168,0.0,0.0,2.0
4,pp21,1,5,1_5_5,in,1.0,0.0087,154.0,154.0,0.0,0.0,154.0,0.0061,1.0,0.0,1.0


In [9]:
# drop rows with words that were removed, i.e. rows whose WORD_ID is not in the key
# .copy() makes the result an independent frame, so adding a column to it
# afterwards does not raise pandas' SettingWithCopyWarning
word_ids = keyset['WORD_ID'].tolist()
dataset_filtered = dataset_filtered[dataset_filtered['WORD_ID'].isin(word_ids)].copy()

In [10]:
# clean each word so that it does not have punctuation
def clean_word(w):
    """Strip punctuation from a word.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed.

    Parameters
    ----------
    w : str or any
        The raw word. Values that are not strings (e.g. NaN for a missing
        word) cannot be cleaned and are returned unchanged.

    Returns
    -------
    The cleaned string, or ``w`` itself when it is not a string.
    """
    # explicit type guard instead of a bare ``except: pass`` so that only the
    # expected non-string case is passed through and real errors stay visible
    if not isinstance(w, str):
        return w
    return re.sub(r'[^\w\s]', '', w)
# add the cleaned form of every word as a new WORD_CLEAN column
dataset_filtered['WORD_CLEAN'] = dataset_filtered['WORD'].apply(clean_word)

In [11]:
# look at the first rows, including the new WORD_CLEAN column
dataset_filtered.head()

Unnamed: 0,PP_NR,PART,TRIAL,WORD_ID,WORD,WORD_FIXATION_COUNT,WORD_FIXATION_%,WORD_GAZE_DURATION,WORD_FIRST_FIXATION_DURATION,WORD_SECOND_FIXATION_DURATION,WORD_THIRD_FIXATION_DURATION,WORD_TOTAL_READING_TIME,WORD_TOTAL_READING_TIME_%,WORD_SKIP,WORD_SPILLOVER,WORD_RUN_COUNT,WORD_CLEAN
0,pp21,1,5,1_5_1,The,2.0,0.0174,95.0,95.0,286.0,0.0,381.0,0.015,0.0,0.0,2.0,The
1,pp21,1,5,1_5_2,intense,3.0,0.0261,54.0,54.0,242.0,532.0,828.0,0.0326,1.0,0.0,3.0,intense
2,pp21,1,5,1_5_3,interest,2.0,0.0174,333.0,333.0,232.0,0.0,565.0,0.0222,1.0,0.0,2.0,interest
3,pp21,1,5,1_5_4,aroused,3.0,0.0261,78.0,78.0,215.0,135.0,428.0,0.0168,0.0,0.0,2.0,aroused
4,pp21,1,5,1_5_5,in,1.0,0.0087,154.0,154.0,0.0,0.0,154.0,0.0061,1.0,0.0,1.0,in


In [12]:
# save the filtered features for each participant
# (written before any averaging; the row index is left out of the file)
dataset_filtered.to_csv("C:/6120-NLP/project/gaze_features.csv", index = False, header=True)

In [None]:
# average over participant, so we have a feature for each token
# numeric_only=True keeps the remaining text columns (PP_NR, WORD) out of the mean;
# without it pandas >= 2.0 raises a TypeError when it reaches them
dataset_tokens = dataset_filtered.groupby(['WORD_ID','WORD_CLEAN']).mean(numeric_only=True).reset_index()
dataset_tokens.to_csv("C:/6120-NLP/project/token_features.csv", index = False, header=True)

In [None]:
# average over each token, so we have type features
# numeric_only=True keeps the text column WORD_ID out of the mean;
# without it pandas >= 2.0 raises a TypeError when it reaches it
dataset_types = dataset_tokens.groupby(['WORD_CLEAN']).mean(numeric_only=True).reset_index()
dataset_types.to_csv("C:/6120-NLP/project/type_features.csv", index = False, header=True)

In [19]:
# create a type dictionary and dump to pickle to use in the model
# each word type maps to a np array of its eye movement features;
# the position of a feature in the array follows the column list below
type_feature_columns = [
    'WORD_FIXATION_COUNT',
    'WORD_FIXATION_%',
    'WORD_GAZE_DURATION',
    'WORD_FIRST_FIXATION_DURATION',
    'WORD_SECOND_FIXATION_DURATION',
    'WORD_THIRD_FIXATION_DURATION',
    'WORD_TOTAL_READING_TIME',
    'WORD_TOTAL_READING_TIME_%',
    'WORD_SKIP',
    'WORD_SPILLOVER',
    'WORD_RUN_COUNT',
]

# one row of feature values per word type, paired with the type itself;
# np.array(...) gives every type its own array rather than a view into the matrix
type_dict = {
    word_type: np.array(feature_row)
    for word_type, feature_row in zip(dataset_types['WORD_CLEAN'],
                                      dataset_types[type_feature_columns].to_numpy())
}

In [28]:
# save type dictionary to pickle
# NOTE(review): relative path — the file is written to the current working directory
with open('type_dict.pickle', 'wb') as handle:
    # binary mode is required by pickle; HIGHEST_PROTOCOL selects the newest format this Python supports
    pickle.dump(type_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)