In [1]:
import pandas as pd
import re
import json
import os

In [2]:
# Get story text
def _load_story_words(path):
  """Read a story file and break it into per-word records.

  Each line of the file is treated as one sentence. Every sentence is split
  on single spaces and each piece has the punctuation characters
  `" , ( ) ; . ? ! :` stripped out.

  Args:
    path: Path of the text file to read.

  Returns:
    A `(sentences, words)` tuple. `sentences` is the list of lines without
    line terminators. `words` is a list of dicts with keys `sentence`
    (0-based line index), `word` (cleaned text) and `word_length`.
  """
  with open(path) as story_file:
    sentences = story_file.read().splitlines()

  words = []
  for sentence_idx, sentence in enumerate(sentences):
    # Keep split(" ") rather than split(): process_logs concatenates this list
    # positionally with the eyelink rows, so the number and order of entries
    # (including empty strings from consecutive spaces) must not change.
    for word in sentence.split(" "):
      word_cleaned = re.sub(r'[\",\(,\),\,,\;,\.,\?,\!,\:]', '', word)
      words.append({
        "sentence": sentence_idx,
        "word": word_cleaned,
        "word_length": len(word_cleaned)
      })
  return sentences, words


schoolmistress_sentences, schoolmistress_words = _load_story_words('schoolmistress.txt')
el_sentences, el_words = _load_story_words('expensivelessons.txt')


In [3]:
def convert_to_json(file_path):
  """Parse a JSON-lines file into a list of Python objects.

  Args:
    file_path: Path of a text file holding one JSON document per line.

  Returns:
    A list with one decoded object per non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even when a line fails to parse.
  with open(file_path, 'r') as f:
    # Whitespace-only lines (e.g. a trailing newline at the end of the file)
    # carry no record and would make json.loads raise, so they are skipped.
    return [json.loads(line) for line in f if line.strip()]

In [4]:
# Load the book nlp outputs for both stories: a tab-separated sentiment table
# (read with explicit column names) and a JSON-lines emotion file per story.
results_dir = '../emotional_story_arcs/data/kelsey/results'
sentiment_cols = ['negative', 'neutral', 'positive']

# The Schoolmistress
schoolmistress_sentiment = pd.read_csv(
    f'{results_dir}/schoolmistress.sentiment',
    sep='\t',
    names=sentiment_cols,
)
schoolmistress_emotion_json = convert_to_json(f'{results_dir}/schoolmistress.emotion')
schoolmistress_emotion = pd.DataFrame.from_dict(schoolmistress_emotion_json)

# Expensive Lessons
el_sentiment = pd.read_csv(
    f'{results_dir}/expensivelessons.sentiment',
    sep='\t',
    names=sentiment_cols,
)
el_emotion_json = convert_to_json(f'{results_dir}/expensivelessons.emotion')
el_emotion = pd.DataFrame.from_dict(el_emotion_json)

In [5]:
def process_logs(filename, sentiment, emotion, words):
  """Build the per-sentence feature table for one eye-tracking log and save it.

  Reads `ia_files/{filename}.txt` (tab-separated), pairs its rows positionally
  with `words`, aggregates five eye-tracking measures per sentence, and joins
  the result column-wise with the sentiment, emotion and highlight tables.
  The combined table is written to `./results/{filename}.csv`.

  Args:
    filename: Base name (no directory, no extension) shared by the eyelink
      file, the highlights CSV and the output CSV.
    sentiment: DataFrame of sentence-level sentiment columns; joined by index.
    emotion: DataFrame of sentence-level emotion columns; joined by index.
    words: List of dicts with at least a `sentence` key, one per eyelink row
      and in the same order.

  Returns:
    The combined DataFrame that was written to disk.

  Raises:
    FileNotFoundError: If the eyelink or highlights file does not exist.
    KeyError: If an expected eye-tracking or highlights column is missing.
    ValueError: If an eye-tracking column holds a non-numeric value other
      than the '.' placeholder.
  """
  import warnings

  ia_columns = ['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION', 'IA_AVERAGE_FIX_PUPIL_SIZE',
                'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT']

  # append story text to eyelink file
  # '.' is the placeholder for a missing value; letting the parser treat it as
  # NaN replaces the deprecated element-wise DataFrame.applymap pass.
  eyelink_data = pd.read_csv(f'ia_files/{filename}.txt', sep='\t', low_memory=False,
                             na_values=['.'])
  words_df = pd.DataFrame(words)
  ia_df_clean = eyelink_data[ia_columns].astype('float')
  if len(words_df) != len(ia_df_clean):
    # concat below aligns on the positional index, so a length mismatch would
    # silently pair words with the wrong rows and leave NaN-padded tails.
    warnings.warn(
        f"{filename}: {len(words_df)} words but {len(ia_df_clean)} eyelink rows; "
        "word/row alignment may be wrong")
  eyelink_with_text = pd.concat([words_df, ia_df_clean], axis=1)

  # aggregate columns to get sentence vals
  ia_sentences_df = eyelink_with_text.groupby('sentence')\
      .agg({'IA_DWELL_TIME': 'sum', 'IA_REGRESSION_PATH_DURATION': 'sum',
            'IA_AVERAGE_FIX_PUPIL_SIZE': 'mean', 'IA_REGRESSION_IN_COUNT': 'sum',
            'IA_REGRESSION_OUT_FULL_COUNT': 'sum'})
  # Get highlight categories
  highlights = pd.read_csv(f'./highlights/{filename}.csv').drop(['Unnamed: 0', 'proportion'], axis=1)
  # BookNLP features + some eye tracking features in one DataFrame by sentence
  sentences_with_features = pd.concat(
      [sentiment, emotion, ia_sentences_df, highlights], axis=1)

  # Make sure the output directory exists so a fresh checkout can run this.
  os.makedirs('./results', exist_ok=True)
  sentences_with_features.to_csv(f"./results/{filename}.csv")
  return sentences_with_features

In [6]:
IA_DIR = "./ia_files/"

# Run process_logs once per eyelink export in IA_DIR, pairing each log with the
# story it belongs to. Entries are visited in sorted order so repeated runs
# process the logs in the same sequence.
for entry in sorted(os.listdir(IA_DIR)):
    if not os.path.isfile(os.path.join(IA_DIR, entry)):
        continue
    log_name, extension = os.path.splitext(entry)
    # process_logs re-appends '.txt' to the name, so any other kind of file
    # (hidden files, editor backups, ...) cannot be loaded and is skipped.
    if extension != '.txt':
        continue
    if 'schoolmistress' in log_name:
        process_logs(log_name, schoolmistress_sentiment, schoolmistress_emotion, schoolmistress_words)
    # NOTE(review): 'el' is a plain substring match and would also hit any
    # other log whose name merely contains "el" - confirm the naming scheme.
    elif 'el' in log_name:
        process_logs(log_name, el_sentiment, el_emotion, el_words)
