In [None]:
"""
  Feature generation
  author: MP
  date: 4/29/2021
  
  The following code generates features from the dataset. 
  
  1. Create files to save the eye gaze features
      - csv files for tokens and types
      - pickle tokens mapped to features
      - pickle tokens mapped to POS tag
  2. Create feature templates in the format that we need for MINITAGGER (Stratos and Collins, 2015)
      - Brown clusters
      - eye gaze features
      - word embeddings
  
"""

In [None]:
import pandas as pd
import os
import re
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [None]:
"""
1. Get eye movement features from GECO data, both by token and by type. The features used will be:

-WORD_FIXATION_%
-WORD_FIXATION_COUNT
-WORD_GAZE_DURATION
-WORD_RUN_COUNT
-WORD_TOTAL_READING_TIME
-WORD_TOTAL_READING_TIME_%
-WORD_SKIP
-WORD_SPILLOVER
-WORD_FIRST_FIXATION_DURATION
-(WORD_FIRST_FIXATION_DURATION + SECOND + THIRD + LAST) / 4 (MEAN)
 
"""

In [None]:
# Path to the combined GECO gaze-feature csv. The GECO_GAZE_FEATURES_CSV
# environment variable, when set, overrides the original hard-coded location so
# the notebook can run on machines with a different directory layout; when it is
# unset the original path is used unchanged.
PTH_DATA = os.environ.get(
    'GECO_GAZE_FEATURES_CSV',
    'C:/6120-NLP/project/data/geco-raw/gaze_features_all.csv',
)
dataset_filtered = pd.read_csv(PTH_DATA)

In [None]:
# Average over participants so that every token (a unique WORD_ID / WORD_CLEAN
# pair) ends up with a single row of gaze features.
# numeric_only=True is passed explicitly: without it, pandas >= 2.0 raises a
# TypeError as soon as any non-key column is non-numeric, while older versions
# silently dropped such columns. Being explicit yields the same numeric result
# on both.
dataset_tokens = (
    dataset_filtered
    .groupby(['WORD_ID', 'WORD_CLEAN'])
    .mean(numeric_only=True)
    .reset_index()
)
dataset_tokens.to_csv("C:/6120-NLP/project/token_features.csv", index=False, header=True)

In [None]:
# Average over all tokens of the same word so that we get type-level features.
# dataset_tokens still carries WORD_ID as an ordinary column after reset_index();
# if that column is non-numeric, a plain .mean() fails on pandas >= 2.0, so the
# aggregation is restricted to numeric columns explicitly.
dataset_types = (
    dataset_tokens
    .groupby(['WORD_CLEAN'])
    .mean(numeric_only=True)
    .reset_index()
)
dataset_types.to_csv("C:/6120-NLP/project/type_features.csv", index=False, header=True)

In [None]:
# Build a lookup from word type to its eye-movement feature vector, to be pickled
# later as a backup to the csv. Each vector is an np array whose entries follow
# the order of gaze_feature_columns below.
gaze_feature_columns = [
    'WORD_FIXATION_COUNT',
    'WORD_FIXATION_%',
    'WORD_GAZE_DURATION',
    'WORD_FIRST_FIXATION_DURATION',
    'WORD_SECOND_FIXATION_DURATION',
    'WORD_THIRD_FIXATION_DURATION',
    'WORD_TOTAL_READING_TIME',
    'WORD_TOTAL_READING_TIME_%',
    'WORD_SKIP',
    'WORD_SPILLOVER',
    'WORD_RUN_COUNT',
]

type_dict = {}
for row_label in range(len(dataset_types)):
    # rows are addressed by label, which relies on the 0..n-1 index of dataset_types
    word_type = dataset_types.loc[row_label, 'WORD_CLEAN']
    type_dict[word_type] = np.array(
        [dataset_types.loc[row_label, column] for column in gaze_feature_columns]
    )

In [None]:
# persist the word-type -> gaze-feature mapping as a pickle in the current working directory
with open('type_dict.pickle', 'wb') as type_dict_file:
    pickle.dump(type_dict, type_dict_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Map each word to a part-of-speech tag so the tags can be checked later.
# This is an approximation: a word can occur with several POS tags, but only one
# is kept per word -- the tag from the last row in which the word appears,
# because later rows overwrite earlier ones.
# NOTE(review): `keyset` is not defined in any of the preceding cells shown here;
# it needs to be loaded before this cell can run on a fresh kernel -- confirm its
# source (it must provide 'WORD' and 'PART_OF_SPEECH' columns and a 0..n-1 index).
pos_dict = {
    keyset.loc[row_label, 'WORD']: keyset.loc[row_label, 'PART_OF_SPEECH']
    for row_label in range(len(keyset))
}

In [None]:
# persist the word -> POS-tag mapping as a pickle in the current working directory
with open('pos_dict.pickle', 'wb') as pos_dict_file:
    pickle.dump(pos_dict, pos_dict_file, pickle.HIGHEST_PROTOCOL)

In [None]:
"""
5. Create feature files in the format that we need for MINITAGGER (Stratos and Collins, 2015).

Input feature template format:

Bitstrings (from Brown clusters)
"""