In [None]:
"""
  Feature generation
  author: MP
  date: 4/29/2021
  
  The following code generates features from the dataset. 
  
  1. Create files to save the eye gaze features
      - csv files for tokens and types
      - pickle tokens mapped to features
      - pickle tokens mapped to POS tag
  2. Create feature templates in the format that we need for MINITAGGER (Stratos and Collins, 2015)
  3. Create new feature templates where features are normalized. This will give better performance
  with the model.
  
"""

In [1]:
import pandas as pd
import os
import re
import numpy as np
import pickle
from sklearn import preprocessing

In [None]:
"""
1. Get eye movement features from the GECO data, aggregated by token and by type. The features used will be:

-WORD_FIXATION_%
-WORD_FIXATION_COUNT
-WORD_GAZE_DURATION
-WORD_RUN_COUNT
-WORD_TOTAL_READING_TIME
-WORD_TOTAL_READING_TIME_%
-WORD_SKIP
-WORD_SPILLOVER
-WORD_FIRST_FIXATION_DURATION
-WORD_FIRST_FIXATION_DURATION + SECOND + THIRD + LAST / 4 (MEAN)
 
"""

In [2]:
# Location of the filtered GECO reading data (CSV).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PTH_DATA = 'C:/6120-NLP/project/data/geco-raw/dataset_filtered_unk.csv'
# Load the CSV into a DataFrame; later cells group it by its WORD_ID and
# WORD_CLEAN columns and count WORD_CLEAN occurrences.
dataset_filtered = pd.read_csv(PTH_DATA)

In [3]:
# Location of the key set (CSV) that accompanies the gaze data.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PTH_KEY = 'C:/6120-NLP/project/data/geco-raw/keyset_unk.csv'
# Load the CSV into a DataFrame; a later cell reads its WORD and
# PART_OF_SPEECH columns to build the POS lookup.
keyset = pd.read_csv(PTH_KEY)

In [4]:
# Average over participants so that each token (one WORD_ID / WORD_CLEAN pair)
# ends up with a single row of features.
# numeric_only=True is explicit on purpose: since pandas 2.0 a bare .mean()
# raises TypeError if any remaining column is non-numeric, whereas older
# versions silently dropped such columns. Passing the flag gives the old
# result on every pandas version.
dataset_tokens = (
    dataset_filtered
    .groupby(['WORD_ID', 'WORD_CLEAN'])
    .mean(numeric_only=True)
    .reset_index()
)
#dataset_tokens.to_csv("C:/6120-NLP/project/token_features.csv", index = False, header=True)

In [5]:
# Average over all tokens of the same word, so we have one feature row per type.
# dataset_tokens still carries the WORD_ID column (restored by reset_index in
# the previous step). If that column is non-numeric, a bare .mean() raises
# TypeError on pandas >= 2.0, so numeric_only=True is passed explicitly; older
# pandas versions dropped such columns implicitly, giving the same result.
dataset_types = (
    dataset_tokens
    .groupby(['WORD_CLEAN'])
    .mean(numeric_only=True)
    .reset_index()
)
#dataset_types.to_csv("C:/6120-NLP/project/type_features.csv", index = False, header=True)

In [6]:
# Create a type dictionary (dumped to pickle in the next cell as a back-up to
# the csv). It maps each word type (WORD_CLEAN) to a NumPy vector of its
# eye-movement features.
#
# The position of every feature inside the vector is fixed by the list below;
# anything that consumes type_dict relies on this order.
GAZE_FEATURE_COLUMNS = [
    'WORD_FIXATION_COUNT',
    'WORD_FIXATION_%',
    'WORD_GAZE_DURATION',
    'WORD_FIRST_FIXATION_DURATION',
    'WORD_SECOND_FIXATION_DURATION',
    'WORD_THIRD_FIXATION_DURATION',
    'WORD_TOTAL_READING_TIME',
    'WORD_TOTAL_READING_TIME_%',
    'WORD_SKIP',
    'WORD_SPILLOVER',
    'WORD_RUN_COUNT',
]

# Select the feature columns once and convert them to a 2-D array, instead of
# doing eleven scalar .loc lookups per row inside a Python loop. Iterating the
# array yields one 1-D feature vector per row, in the same row order as the
# WORD_CLEAN column it is zipped with.
type_dict = dict(zip(dataset_types['WORD_CLEAN'],
                     dataset_types[GAZE_FEATURE_COLUMNS].to_numpy()))

In [7]:
# Persist the type -> feature-vector mapping. The path is relative, so the
# file is created in the current working directory.
with open('type_dict.pickle', 'wb') as pickle_file:
    pickle.dump(type_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Build a word -> part-of-speech lookup in case we want to check tags later.
# A word can carry several POS tags in the key set, but a dict keeps only one
# value per key: when a word appears on several rows, the tag from its last
# row wins.
pos_dict = dict(zip(keyset['WORD'], keyset['PART_OF_SPEECH']))

In [11]:
# Persist the word -> POS lookup. The path is relative, so the file is
# created in the current working directory.
with open('pos_dict.pickle', 'wb') as pickle_file:
    pickle.dump(pos_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
"""
2. Create feature files for eye gaze in the format that we need for MINITAGGER (Stratos and Collins, 2015).

Input feature template format:

[freq] [word type] [each value in feature vector]

"""

In [8]:
# Frequency lookup: word type -> number of rows of dataset_filtered whose
# WORD_CLEAN equals that type.
# NOTE(review): dataset_filtered appears to hold one row per participant per
# token, so these counts would be summed over participants -- confirm that this
# is the frequency the tagger input is meant to carry.
word_counts = dataset_filtered['WORD_CLEAN'].value_counts()
values = word_counts.keys().tolist()
counts = word_counts.tolist()

freq_dict = dict(zip(values, counts))

In [25]:
# Persist the word-type frequency lookup. The path is relative, so the file
# is created in the current working directory.
with open('freq_dict.pickle', 'wb') as pickle_file:
    pickle.dump(freq_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# write an input file for the gaze features
def write_gaze_feature_input(file_name, features=None, frequencies=None):
    """Write one text line per word type: ``[freq] [word type] [feature values]``.

    Each feature vector is rendered as space-separated values; floats are
    printed with 8 decimal places.

    Parameters
    ----------
    file_name : str
        Destination path. An existing file is overwritten (mode ``"w"``
        truncates, so no separate delete is needed).
    features : mapping of word type -> 1-D numpy array, optional
        Feature vectors to write. Defaults to the notebook-level ``type_dict``.
    frequencies : mapping of word type -> count, optional
        Frequency written at the start of each line. Defaults to the
        notebook-level ``freq_dict``.

    Raises
    ------
    KeyError
        If a word type in ``features`` has no entry in ``frequencies``.
        Earlier versions swallowed every error with a bare ``except`` and
        silently left a truncated file; errors now propagate, and the
        ``with`` block still closes the file.
    """
    if features is None:
        features = type_dict
    if frequencies is None:
        frequencies = freq_dict

    with open(file_name, "w") as out_file:
        for typ, feats in features.items():
            feats_str = np.array2string(
                feats,
                separator=' ',
                max_line_width=10000,  # keep the whole vector on a single line
                formatter={'float_kind': lambda value: "%.8f" % value},
            )
            # array2string wraps the values in "[...]"; drop the brackets.
            feats_str = feats_str[1:-1]
            out_file.write(str(frequencies[typ]) + " " + str(typ) + " " + feats_str + "\n")
    
# Write the un-normalized per-type gaze features.
# NOTE(review): absolute, machine-specific output path.
write_gaze_feature_input('C:/6120-NLP/project/data/gaze_feature_input.txt')

In [9]:
"""
3. Normalize the eye gaze features and then write a new file with normalized features. 

"""

def get_normalized_features():
    """Return the word types, their raw feature vectors and the normalized vectors.

    Reads the notebook-level ``type_dict`` (word type -> feature vector).

    Returns
    -------
    tuple
        ``(types, raw_features, normalized_features)`` where the first two are
        lists in ``type_dict`` iteration order and the third is whatever
        ``preprocessing.normalize`` returns for ``raw_features``; entry ``i``
        of each refers to the same word type.
    """
    words = list(type_dict.keys())
    raw_vectors = list(type_dict.values())

    # NOTE(review): with its default arguments sklearn's normalize rescales
    # each sample (row, i.e. each word type's vector) to unit L2 norm; it does
    # not scale each feature column. Confirm this is the intended
    # normalization, since the features are on very different scales.
    scaled_vectors = preprocessing.normalize(raw_vectors)
    return words, raw_vectors, scaled_vectors


type_list, feature_list, normalized_feature_list = get_normalized_features()
        

In [11]:
# write an input file for the gaze features once normalized
def write_normalized_gaze_feature_input(file_name, types=None, features=None, frequencies=None):
    """Write one text line per word type: ``[freq] [word type] [normalized feature values]``.

    Each feature vector is rendered as space-separated values; floats are
    printed with 8 decimal places.

    Parameters
    ----------
    file_name : str
        Destination path. An existing file is overwritten (mode ``"w"``
        truncates, so no separate delete is needed).
    types : sequence of word types, optional
        Defaults to the notebook-level ``type_list``.
    features : sequence of 1-D numpy arrays (or a 2-D array), optional
        Entry ``i`` is the vector for ``types[i]``. Defaults to the
        notebook-level ``normalized_feature_list``.
    frequencies : mapping of word type -> count, optional
        Defaults to the notebook-level ``freq_dict``.

    Raises
    ------
    KeyError
        If a word type has no entry in ``frequencies``. The ``with`` block
        guarantees the file is closed even when this happens.
    """
    if types is None:
        types = type_list
    if features is None:
        features = normalized_feature_list
    if frequencies is None:
        frequencies = freq_dict

    with open(file_name, "w") as out_file:
        for typ, feats in zip(types, features):
            feats_str = np.array2string(
                feats,
                separator=' ',
                max_line_width=10000,  # keep the whole vector on a single line
                formatter={'float_kind': lambda value: "%.8f" % value},
            )
            # array2string wraps the values in "[...]"; drop the brackets.
            feats_str = feats_str[1:-1]
            out_file.write(str(frequencies[typ]) + " " + str(typ) + " " + feats_str + "\n")
    
# Write the normalized per-type gaze features.
# NOTE(review): absolute, machine-specific output path.
write_normalized_gaze_feature_input('C:/6120-NLP/project/data/norm_gaze_feature_input.txt')

In [12]:
# Pair every word type with its normalized feature vector; type_list and
# normalized_feature_list are index-aligned (entry i of each belongs together).
normalized_type_dict = dict(zip(type_list, normalized_feature_list))


# Persist the mapping next to the other pickles (relative path -> current
# working directory).
# NOTE(review): 'dictt' in the file name looks like a typo; it is kept
# unchanged because other code may load the file under this exact name.
with open('normalized_type_dictt.pickle', 'wb') as pickle_file:
    pickle.dump(normalized_type_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)