In [None]:
"""
  Dataset construction
  author: MP
  date: 4/7/2021
  
  This study uses data from the GECO (Ghent Eye Tracking Corpus) which is available freely for use
  at: https://expsy.ugent.be/downloads/geco/.
  
  The following code extracts the features that are used in this study, and averages over word types.
  
"""

In [10]:
import pandas as pd
import os

In [2]:
# Load the GECO monolingual reading data (the eye tracking measurements)
# into a DataFrame and preview its first rows.
PTH_DATA = 'C:/6120-NLP/project/GECO/MonolingualReadingData.csv'
dataset = pd.read_csv(filepath_or_buffer=PTH_DATA)
dataset.head(n=5)

Unnamed: 0,PP_NR,GROUP,LANGUAGE_RANK,LANGUAGE,PART,TRIAL,TRIAL_FIXATION_COUNT,TRIAL_TOTAL_READING_TIME,WORD_ID_WITHIN_TRIAL,WORD_ID,...,WORD_LAST_FIXATION_RUN,WORD_LAST_FIXATION_TIME,WORD_LAST_FIXATION_X,WORD_LAST_FIXATION_Y,WORD_GO_PAST_TIME,WORD_SELECTIVE_GO_PAST_TIME,WORD_TOTAL_READING_TIME,WORD_TOTAL_READING_TIME_%,WORD_SPILLOVER,WORD_SKIP
0,pp21,monolingual,L1,English,1,5,115,25429,1,1-5-1,...,2,357,115.6,104.6,95,95,381,0.015,.,0
1,pp21,monolingual,L1,English,1,5,115,25429,2,1-5-2,...,3,1392,163.8,107.0,582,296,828,0.0326,.,1
2,pp21,monolingual,L1,English,1,5,115,25429,3,1-5-3,...,2,1957,229.4,96.6,1097,565,565,0.0222,.,1
3,pp21,monolingual,L1,English,1,5,115,25429,4,1-5-4,...,2,2474,356.0,103.8,2107,428,428,0.0168,.,0
4,pp21,monolingual,L1,English,1,5,115,25429,5,1-5-5,...,1,2808,403.2,114.5,154,154,154,0.0061,.,1


In [16]:
# Load the key file describing the words and sentences of the material
# and preview its first rows.
PTH_KEY = 'C:/6120-NLP/project/GECO/EnglishMaterial.csv'
keyset = pd.read_csv(filepath_or_buffer=PTH_KEY)
keyset.head(n=5)

Unnamed: 0,WORD_ID,SENTENCE_ID,CHRON_ID,WORD,PART_OF_SPEECH,CONTENT_WORD,WORD_LENGTH,IA_AREA,IA_TOP,IA_BOTTOM,IA_LEFT,IA_RIGHT
0,1-5-1,1-1,1,The,Article,0,3,3087,93,142,66,129
1,1-5-2,1-1,2,intense,Adjective,1,7,3920,93,142,129,209
2,1-5-3,1-1,3,interest,Noun,1,8,4410,93,142,209,299
3,1-5-4,1-1,4,aroused,Verb,1,7,3969,93,142,299,380
4,1-5-5,1-1,5,in,Preposition,0,2,1421,93,142,380,409


In [17]:
"""
1. Map PART_OF_SPEECH to POS from universal tag set (Petrov et al, 2011) available at:
https://github.com/slavpetrov/universal-pos-tags

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns 
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation

"""

# Show the distribution of the corpus' own part-of-speech labels.
print("Before mapping: ")
print("*******************************************")
print(keyset['PART_OF_SPEECH'].value_counts())

# Corpus PART_OF_SPEECH label -> universal POS tag.
# Several corpus labels collapse onto one universal tag (e.g. 'Noun' and
# 'Name' both become 'NOUN'; 'Article' and 'Determiner' both become 'DET').
universal_tags = {'Verb' : 'VERB',
                  'Noun' : 'NOUN',
                  'Pronoun' : 'PRON',
                  'Preposition' : 'ADP',
                  'Adverb' : 'ADV',
                  'Article' : 'DET',
                  'Conjunction' : 'CONJ',
                  'Adjective' : 'ADJ',
                  'Determiner' : 'DET',
                  'To' : 'ADP',
                  'Name' : 'NOUN',
                  'Interjection' : 'X',
                  'Number' : 'NUM',
                  'Not' : 'PRT',
                  'Ex' : 'ADV',
                  'Letter' : 'X',
                  # NOTE(review): the universal tag set lists '.' for punctuation,
                  # but the corpus label '.' is mapped to 'X' here - confirm this
                  # is intended.
                  '.' : 'X',
                  'Unclassified' : 'X'
                 }

print("\nAfter mapping: ")
print("*******************************************")
# Assign the replaced column back instead of calling replace(inplace=True) on
# keyset["PART_OF_SPEECH"]: an in-place method on a column selection is chained
# assignment, which does not update keyset when pandas Copy-on-Write is active.
# Labels that are not keys of universal_tags are left unchanged by replace().
keyset["PART_OF_SPEECH"] = keyset["PART_OF_SPEECH"].replace(universal_tags)
print(keyset['PART_OF_SPEECH'].value_counts())

Before mapping: 
*******************************************
Verb            11963
Noun             9356
Pronoun          8117
Preposition      4553
Adverb           4207
Article          4032
Conjunction      2592
Adjective        2561
Determiner       2394
To               1337
Name             1219
Interjection      654
Number            558
Not               530
Ex                243
Letter             17
.                  16
Unclassified       12
Name: PART_OF_SPEECH, dtype: int64

After mapping: 
*******************************************
VERB    11963
NOUN    10575
PRON     8117
DET      6426
ADP      5890
ADV      4450
CONJ     2592
ADJ      2561
X         699
NUM       558
PRT       530
Name: PART_OF_SPEECH, dtype: int64


In [18]:
# preview the first rows of the key after the PART_OF_SPEECH mapping
keyset.head(n=5)

Unnamed: 0,WORD_ID,SENTENCE_ID,CHRON_ID,WORD,PART_OF_SPEECH,CONTENT_WORD,WORD_LENGTH,IA_AREA,IA_TOP,IA_BOTTOM,IA_LEFT,IA_RIGHT
0,1-5-1,1-1,1,The,DET,0,3,3087,93,142,66,129
1,1-5-2,1-1,2,intense,ADJ,1,7,3920,93,142,129,209
2,1-5-3,1-1,3,interest,NOUN,1,8,4410,93,142,209,299
3,1-5-4,1-1,4,aroused,VERB,1,7,3969,93,142,299,380
4,1-5-5,1-1,5,in,ADP,0,2,1421,93,142,380,409


In [25]:
"""
2. Create a file that we will need for clustering of the form:
 - each sentence as a line
 - each word separated by white space
 
"""

# expected number of sentences: count of distinct SENTENCE_ID values
# (dropna=False so a missing id is counted as one value, like pd.unique does)
n = keyset['SENTENCE_ID'].nunique(dropna=False)
print(n)

5285


In [24]:
# write file: one sentence per line, every word followed by a single space

file_name = "C:/6120-NLP/project/input.txt"

# A sentence is a run of consecutive rows that share the same SENTENCE_ID.
# The comparison with the previous row is True at the start of each run, so
# its cumulative sum gives every run its own number. Working on whole columns
# avoids label-based row lookups and does not rely on a 0..len-1 index.
sentence_ids = keyset['SENTENCE_ID']
run_ids = sentence_ids.ne(sentence_ids.shift()).cumsum()

# str() each cell so that non-string values (e.g. NaN) can be joined as text
words = keyset['WORD'].map(str)

# Mode "w" truncates an existing file, so no prior os.remove is needed, and
# the context manager closes the handle even if a write fails part-way.
# An empty keyset simply yields an empty file.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm this is what the consumer of input.txt expects.
with open(file_name, "w") as sentences:
    for _, sentence_words in words.groupby(run_ids, sort=False):
        # the last word is also followed by a space, then the newline
        sentences.write(' '.join(sentence_words) + ' \n')
