In [93]:
import pandas as pd
import re
import json
import os
import numpy as np

In [94]:
# Survey columns holding the nine end-of-story scale items (Radio0 .. Radio8).
SCALE_QUESTIONS = [f'lastPageQuestionsRadio{item}' for item in range(9)]

# Wording of each item, index-aligned with SCALE_QUESTIONS.
# Items 6 and 8 are the ones compute_engagement_score subtracts (reverse-scored).
SCALE_QUESTION_TEXT = [
  'I was curious about what would happen next.',
  'The story affected me emotionally.',
  'While reading my body was in the room, but my mind was inside the world created by the story.',
  'At times while reading, I wanted to know what the writer\'s intentions were.',
  'While reading, when a main character succeeded, I felt happy, and when they suffered in some way, I felt sad.',
  'The characters were alive in my imagination.',
  'I found my mind wandering while reading the story.',  # item 6 (subtracted)
  'I could vividly imagine the scenes in the story.',
  'At points, I had a hard time making sense of what was going on in the story.',  # item 8 (subtracted)
]

# Survey answers; filtered by 'participant_id' and 'story' in compute_engagement_score.
survey_results = pd.read_csv('survey_results.csv')

# Running sum of engagement scores per story key, updated by compute_engagement_score.
engagement_total = dict.fromkeys(['el', 'schoolmistress'], 0)

In [95]:
def compute_engagement_score(id, story):
  """Compute one participant's engagement score for one story.

  Looks up the single row of the module-level `survey_results` frame whose
  'participant_id' equals `id` and whose 'story' equals `story`, shifts each
  answer in SCALE_QUESTIONS down by one (so the lowest answer contributes 0),
  adds the positively worded items and subtracts the two negatively worded
  ones (Radio6 and Radio8), then divides by 7, the number of positive items.

  Side effect: the score is also added to `engagement_total[story]`, so every
  call changes that running total.

  Args:
    id: participant identifier as stored in survey_results['participant_id'].
      (The name shadows the builtin; kept so existing callers keep working.)
    story: story key as stored in survey_results['story']; must also be a key
      of `engagement_total`.

  Returns:
    The score rounded to two decimals.

  Raises:
    ValueError: if the survey does not contain exactly one matching row.
  """
  matches = survey_results[(survey_results['participant_id'] == id) & (survey_results['story'] == story)]
  # int() on a one-element Series is deprecated in recent pandas and fails with
  # an opaque TypeError for zero or several matches, so select the row explicitly.
  if len(matches) != 1:
    raise ValueError(
      f"expected exactly one survey row for participant {id!r} and story {story!r}, "
      f"found {len(matches)}")
  answers = matches.iloc[0]

  negative_items = ('lastPageQuestionsRadio6', 'lastPageQuestionsRadio8')
  score = 0
  for question in SCALE_QUESTIONS:
    # make scale 0 - 4 so that a 1 for negative questions does not affect total
    rating = int(answers[question]) - 1
    if question in negative_items:
      score -= rating
    else:
      score += rating

  engagement_score = np.round(score / 7, 2)  # divide by number of positive questions
  engagement_total[story] += engagement_score
  return engagement_score

In [96]:
# Parsed concreteness tables keyed by CSV path, so the file is read once per
# kernel session instead of once per looked-up word.
_CONCRETENESS_DATASETS = {}


def compute_concreteness_score(word, dataset_path='../brysbaert_concreteness_scores_2013.csv'):
  """Look up the concreteness rating ('Conc.M') of a word.

  The table is loaded from `dataset_path` on first use and then reused from an
  in-memory cache; edits to the CSV during a session are therefore not picked
  up until the kernel (or this cell) is re-run.

  Args:
    word: the word to look up; compared for exact equality against the 'Word'
      column.
    dataset_path: CSV file with at least 'Word' and 'Conc.M' columns. Defaults
      to the path the notebook has always used.

  Returns:
    The 'Conc.M' value of the first matching row, or the string "." when the
    word is not in the table (the same placeholder used for missing values
    elsewhere in this notebook).
  """
  if dataset_path not in _CONCRETENESS_DATASETS:
    _CONCRETENESS_DATASETS[dataset_path] = pd.read_csv(dataset_path)
  concreteness_dataset = _CONCRETENESS_DATASETS[dataset_path]

  matching_scores = concreteness_dataset.loc[concreteness_dataset['Word'] == word, 'Conc.M']
  return "." if len(matching_scores) == 0 else matching_scores.iloc[0]

In [97]:
def get_ia_words(story):
  """Split a story text file into one record per space-separated token.

  Reads '<story>.txt', treats every line of the file as one sentence and
  splits each line on single spaces. Each token is lower-cased after removing
  the characters " , ( ) ; . ? ! and : from it.

  Args:
    story: path of the text file without the '.txt' extension.

  Returns:
    A list of dicts, in reading order, with keys "sentence" (0-based line
    index), "word" (cleaned token, possibly empty) and "word_length"
    (length of the cleaned token).
  """
  strip_punctuation = re.compile(r'[\",\(,\),\,,\;,\.,\?,\!,\:]')
  with open(f'{story}.txt') as story_file:
    lines = story_file.read().splitlines()

  return [
    {
      "sentence": line_index,
      "word": cleaned,
      "word_length": len(cleaned)
    }
    for line_index, line in enumerate(lines)
    for cleaned in (strip_punctuation.sub('', token).lower() for token in line.split(" "))
  ]

In [98]:
def get_story_sentences(story):
  """Load a story's sentences and its non-punctuation tokens.

  Sentences are the lines of '<story>.txt'. Tokens come from the
  tab-separated '<story>.tokens' file in the results directory (read with
  quoting disabled); rows whose 'POS_tag' is 'PUNCT' are skipped.

  Args:
    story: base name shared by the text file and the tokens file.

  Returns:
    A tuple (sentences, words): `sentences` is a list of lines, `words` is a
    list of dicts with keys "sentence" (the token's 'sentence_ID'), "word"
    (the lemma), "word_length" (length of the surface form in the 'word'
    column) and "concreteness" (result of compute_concreteness_score on the
    lemma).
  """
  with open(f'{story}.txt') as story_file:
    sentences = story_file.read().splitlines()

  tokens_df = pd.read_csv(f"../emotional_story_arcs/data/kelsey/results/{story}.tokens", sep= '\t', quoting=3)

  words = []
  for position in range(len(tokens_df)):
    token = tokens_df.iloc[position]
    lemma = token['lemma']
    if token['POS_tag'] == 'PUNCT':
      continue  # punctuation carries no word-level features
    record = {
      "sentence": token['sentence_ID'],
      "word": lemma,
      "word_length": len(token['word']),
      "concreteness": compute_concreteness_score(lemma)
    }
    words.append(record)

  return (sentences, words)

In [99]:
# Get story text
# schoolmistress_sentences, schoolmistress_words = get_story_sentences('schoolmistress')
# el_sentences, el_words = get_story_sentences('expensivelessons')
# schoolmistress_ia_words = get_ia_words('schoolmistress')
# el_ia_words = get_ia_words('expensivelessons')

# pd.DataFrame.from_dict(el_words).to_csv("el_words.csv")
# pd.DataFrame.from_dict(schoolmistress_words).to_csv("schoolmistress_words.csv")
# pd.DataFrame(el_sentences).to_csv("el_sentences.csv")
# pd.DataFrame(schoolmistress_sentences).to_csv("schoolmistress_sentences.csv")
# pd.DataFrame.from_dict(el_ia_words).to_csv("el_ia_words.csv")
# pd.DataFrame.from_dict(schoolmistress_ia_words).to_csv("schoolmistress_ia_words.csv")

In [100]:
# Reload the per-story tables from CSV instead of recomputing them
# (presumably written by the commented-out export cell; confirm before relying on it).
el_words, schoolmistress_words = (
  pd.read_csv(path) for path in ("el_words.csv", "schoolmistress_words.csv")
)
el_sentences, schoolmistress_sentences = (
  pd.read_csv(path) for path in ("el_sentences.csv", "schoolmistress_sentences.csv")
)
el_ia_words, schoolmistress_ia_words = (
  pd.read_csv(path) for path in ("el_ia_words.csv", "schoolmistress_ia_words.csv")
)

In [101]:
def convert_to_json(file_path):
  """Parse a JSON-lines file into a list of Python objects.

  Every line of the file must be a complete JSON document; lines are parsed
  in file order.

  Args:
    file_path: path of the file to read.

  Returns:
    A list with one parsed object per line.

  Raises:
    json.JSONDecodeError: if any line is not valid JSON (including blank lines).
  """
  # The context manager closes the handle; the previous version leaked it.
  with open(file_path, 'r') as f:
    return [json.loads(line) for line in f]

In [102]:
def get_features_dataframe(story, words, sentences):
  """Assemble the per-sentence text features of a story into one frame.

  Combines, column-wise and aligned on the index:
    * the records of '<story>.emotion' (one JSON document per line),
    * the tab-separated '<story>.sentiment' file, read as the columns
      'negative', 'neutral' and 'positive',
    * the mean 'word_length' and mean 'concreteness' per sentence computed
      from `words`,
    * the `sentences` frame itself.

  Args:
    story: base name of the .emotion / .sentiment files in the results directory.
    words: word table with 'sentence', 'word_length' and 'concreteness'
      columns; '.' entries are treated as missing values.
    sentences: frame (or Series) of sentence texts to append as-is.

  Returns:
    The combined DataFrame.
  """
  def _dot_to_none(value):
    # '.' is the placeholder used for "no value available"
    return None if value == '.' else value

  numeric_columns = ['sentence', 'word_length', 'concreteness']
  word_table = pd.DataFrame.from_dict(words)[numeric_columns].applymap(_dot_to_none)
  word_table = pd.DataFrame(word_table, dtype='float')
  sentence_means = word_table.groupby('sentence').agg({'word_length': 'mean', 'concreteness': 'mean'})

  # get book nlp features
  emotion_records = convert_to_json(f'../emotional_story_arcs/data/kelsey/results/{story}.emotion')
  emotion_df = pd.DataFrame.from_dict(emotion_records)
  sentiment_df = pd.read_csv(f'../emotional_story_arcs/data/kelsey/results/{story}.sentiment', sep='\t', names=['negative', 'neutral', 'positive'])

  combined = pd.concat([emotion_df, sentiment_df], axis=1)
  combined = pd.concat([combined, sentence_means], axis=1)
  combined = pd.concat([combined, sentences], axis=1)
  return combined

In [103]:
def process_logs(filename, features, words):
  """Build and save the per-sentence feature table for one recording.

  Reads the tab-separated interest-area report 'ia_files/<filename>.txt',
  places its eye-tracking measures next to `words` (column-wise, aligned on
  the row index), aggregates them per sentence, then appends the highlight
  categories from './highlights/<filename>.csv', the story `features` and the
  participant's engagement score. The result is written to
  './results/<filename>.csv'.

  Args:
    filename: recording name without extension. The text before the first
      underscore is passed to compute_engagement_score as the participant id
      and the second underscore-separated part as the story.
    features: per-sentence story features (see get_features_dataframe).
    words: word table with at least 'sentence' and 'word' columns.
      NOTE(review): assumes its rows line up one-to-one with the rows of the
      interest-area report; confirm against the export settings.

  Returns:
    The per-sentence DataFrame that was written to disk.
  """
  def _dot_to_none(value):
    # '.' marks a missing measurement in the report
    return None if value == '.' else value

  measure_columns = ['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION', 'IA_AVERAGE_FIX_PUPIL_SIZE',
                     'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT']
  sentence_aggregations = {'IA_DWELL_TIME': 'sum', 'IA_REGRESSION_PATH_DURATION': 'sum',
                           'IA_AVERAGE_FIX_PUPIL_SIZE': 'mean', 'IA_REGRESSION_IN_COUNT': 'sum',
                           'IA_REGRESSION_OUT_FULL_COUNT': 'sum', 'word': 'count'}

  # append story text to eyelink file
  interest_areas = pd.read_csv(f'ia_files/{filename}.txt', sep='\t', low_memory=False)
  word_table = pd.DataFrame(words)
  measures = pd.DataFrame(interest_areas[measure_columns].applymap(_dot_to_none), dtype='float')
  words_with_measures = pd.concat([word_table, measures], axis=1)

  # aggregate columns to get sentence vals
  per_sentence = words_with_measures.groupby('sentence').agg(sentence_aggregations)

  # Get highlight categories
  highlights = pd.read_csv(f'./highlights/{filename}.csv').drop(['Unnamed: 0', 'proportion'], axis=1)

  # Story features + some eye tracking features in one DataFrame by sentence
  session_labels = interest_areas[['RECORDING_SESSION_LABEL']][0:len(per_sentence)]
  combined = pd.concat([session_labels, per_sentence], axis=1)
  combined = pd.concat([highlights, combined], axis=1)
  combined = pd.concat([combined, features], axis=1)

  name_parts = filename.split('_')
  score = compute_engagement_score(name_parts[0], name_parts[1].replace('.txt', ''))
  combined['engagement_score'] = [score] * len(combined)

  combined.to_csv(f"./results/{filename}.csv", )
  return combined

In [104]:
IA_DIR = "./ia_files/"
schoolmistress_features = get_features_dataframe('schoolmistress', schoolmistress_words, schoolmistress_sentences)
el_features = get_features_dataframe('expensivelessons', el_words, el_sentences)

# process_logs adds every score to engagement_total, so start from zero;
# otherwise re-running this cell would double count and inflate the averages.
for story_key in engagement_total:
  engagement_total[story_key] = 0
# Number of recordings actually processed per story; used as the divisor below
# instead of a hard-coded participant count.
processed_counts = {'el': 0, 'schoolmistress': 0}

for filename in os.listdir(IA_DIR):
  f = os.path.join(IA_DIR, filename)
  if os.path.isfile(f):
    filename = f.replace(IA_DIR, '').replace('.txt', '')
    if 'schoolmistress' in filename:
      process_logs(filename, schoolmistress_features, schoolmistress_ia_words)
      processed_counts['schoolmistress'] += 1
    elif 'el' in filename:
      process_logs(filename, el_features, el_ia_words)
      processed_counts['el'] += 1

# Average over the recordings that were processed; NaN when a story has none.
el_avg_score = (np.round(engagement_total['el'] / processed_counts['el'], 2)
                if processed_counts['el'] else float('nan'))
schoolmistress_avg_score = (np.round(engagement_total['schoolmistress'] / processed_counts['schoolmistress'], 2)
                            if processed_counts['schoolmistress'] else float('nan'))
print(f"Expensive Lessons avg score: {el_avg_score}, schoolmistress avg score: {schoolmistress_avg_score}")


Expensive Lessons avg score: 2.09, schoolmistress avg score: 1.94


In [105]:
# Mark missing eye-tracking values in the id11_el results with the sentinel
# 'empty', then keep only the rows that contain at least one such value.
result_df = pd.read_csv("./results/id11_el.csv")
# fill_vals = {'IA_DWELL_TIME': 0, 'IA_REGRESSION_PATH_DURATION': 0,
#         'IA_AVERAGE_FIX_PUPIL_SIZE': result_df['IA_AVERAGE_FIX_PUPIL_SIZE'].mean(),
#         'IA_REGRESSION_IN_COUNT': 0, 'IA_REGRESSION_OUT_COUNT': 0}

fill_vals = dict.fromkeys(
  ['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION', 'IA_AVERAGE_FIX_PUPIL_SIZE',
   'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT'],
  'empty')
result_df = result_df.fillna(value=fill_vals)
# True for rows where any of the tracked columns holds the sentinel
result_df = result_df[result_df[list(fill_vals)].eq('empty').any(axis=1)]

In [106]:
def populate_empties(df):
  """Return a copy of `df` with missing eye-tracking values marked as 'empty'.

  Only the five interest-area measure columns are filled; missing values in
  any other column are left untouched. The input frame is not modified.

  Args:
    df: DataFrame that may contain the interest-area measure columns.

  Returns:
    A new DataFrame in which NaN values of those columns are replaced by the
    string 'empty'.
  """
  measure_columns = ['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION',
                     'IA_AVERAGE_FIX_PUPIL_SIZE',
                     'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT']
  sentinel_by_column = dict.fromkeys(measure_columns, 'empty')
  return df.fillna(value=sentinel_by_column)

In [107]:
# Collect, across all result files, the rows that contain at least one missing
# ('empty') eye-tracking value. result_df already holds those rows for
# id11_el.csv, so that file is skipped in the loop.
empty_columns = ['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION', 'IA_AVERAGE_FIX_PUPIL_SIZE',
                 'IA_REGRESSION_IN_COUNT', 'IA_REGRESSION_OUT_FULL_COUNT']
empty_frames = [result_df]
for filename in os.listdir('./results'):
    f = os.path.join('./results', filename)
    # Compare the bare file name (not the joined path, whose separator is
    # platform dependent) and ignore anything that is not a CSV result file.
    if os.path.isfile(f) and filename.endswith('.csv') and filename != 'id11_el.csv':
      curr_df = populate_empties(pd.read_csv(f))
      curr_empties = curr_df[(curr_df[empty_columns] == 'empty').any(axis=1)]
      empty_frames.append(curr_empties)

# One concat at the end instead of growing the frame inside the loop; it also
# yields a new frame, so adding 'tally' never writes into result_df.
empty_report = pd.concat(empty_frames, axis=0)
empty_report['tally'] = 1  # one per flagged row, summed per session in the next cell
empty_report.to_csv("empty_rows.csv")


In [108]:
# Number of flagged rows per recording session, saved next to the row-level report.
grouped_empty_report = empty_report.groupby('RECORDING_SESSION_LABEL')[['tally']].sum()
grouped_empty_report.to_csv("grouped_empty_report.csv")