In [52]:
import pandas as pd
import re
import json
import os
import numpy as np

In [53]:
SCALE_QUESTIONS = ['lastPageQuestionsRadio0','lastPageQuestionsRadio1','lastPageQuestionsRadio2','lastPageQuestionsRadio3','lastPageQuestionsRadio4','lastPageQuestionsRadio5','lastPageQuestionsRadio6','lastPageQuestionsRadio7','lastPageQuestionsRadio8']
SCALE_QUESTION_TEXT = ['I was curious about what would happen next.',
      'The story affected me emotionally.',
      'While reading my body was in the room, but my mind was inside the world created by the story.',
      'At times while reading, I wanted to know what the writer\'s intentions were.',
      'While reading, when a main character succeeded, I felt happy, and when they suffered in some way, I felt sad.',
      'The characters were alive in my imagination.',
      'I found my mind wandering while reading the story.',
      'I could vividly imagine the scenes in the story.',
      'At points, I had a hard time making sense of what was going on in the story.']
survey_results = pd.read_csv('survey_results.csv')
engagement_total = {'el': 0, 'schoolmistress': 0}
OUTLIER_COUNT = 0

In [54]:
def compute_engagement_score(id, story):
  score = 0
  row = survey_results[(survey_results['participant_id'] == id) & (survey_results['story'] == story)]
  for i in SCALE_QUESTIONS:
    if i == 'lastPageQuestionsRadio6' or i == 'lastPageQuestionsRadio8':
      score -= (int(row[i]) - 1) # make scale 0 - 4 so that a 1 for negative quesitons does not affect total
    else:
      score += (int(row[i]) - 1)
  engagement_score = (np.round(score / 7, 2)) # divide by number of positive questions
  engagement_total[story] += engagement_score
  return engagement_score

In [55]:
def compute_concreteness_score(word):
  concreteness_dataset = pd.read_csv('../brysbaert_concreteness_scores_2013.csv')
  word_indexes = concreteness_dataset.loc[concreteness_dataset['Word'] == word, 'Conc.M']
  return "." if len(word_indexes) == 0 else word_indexes.iloc[0]

In [56]:
def compute_valence_scores(word):
  valence_dataset = pd.read_csv('./NRC-VAD-Lexicon.csv', sep='\t')
  word_indexes_valence = valence_dataset.loc[valence_dataset['word'] == word.lower(), 'valence']
  word_indexes_arousal = valence_dataset.loc[valence_dataset['word'] == word.lower(), 'arousal']
  return (".",".") if len(word_indexes_valence) == 0 else (word_indexes_valence.iloc[0], word_indexes_arousal.iloc[0])

In [57]:
def get_word_freq(word):
    freq_df = pd.read_csv('subtlex.csv')
    zipf_scores = freq_df.loc[freq_df['Word'] == word.lower(), 'Zipf-value']
    return zipf_scores.iloc[0] if len(zipf_scores > 0) else 1.5 ## TODO: get better fallback freq

In [70]:
def get_ia_words(story):
  with open(f'{story}.txt') as f:
    sentences = f.read().splitlines()

  words = []
  for i in range(len(sentences)):
    for word in sentences[i].split(" "):
      word_cleaned = re.sub(r'[\",\(,\),\,,\;,\.,\?,\!,\:]', '', word).lower()
      words.append({
        "sentence": i,
        "word": word_cleaned,
        "word_length": len(word_cleaned),
        "word_freq": get_word_freq(word_cleaned),
      })
  return words

In [59]:
def get_story_sentences(story):
  with open(f'{story}.txt') as f:
    sentences = f.read().splitlines()

  tokens_df = pd.read_csv(f"../emotional_story_arcs/data/kelsey/results/{story}.tokens", sep= '\t', quoting=3)
  # filter out punctuation
  tokens_df = tokens_df[tokens_df['POS_tag'] != 'PUNCT']
  tokens_df['sentence'] = tokens_df['sentence_ID']
  tokens_df['word'] = tokens_df['lemma']
  tokens_df['word_length'] = tokens_df['lemma'].apply(lambda x: len(x))
  tokens_df['concreteness'] = tokens_df['lemma'].apply(lambda x: compute_concreteness_score(x))
  tokens_df['valence'] = tokens_df['lemma'].apply(lambda x: compute_valence_scores(x)[0])
  tokens_df['arousal'] = tokens_df['lemma'].apply(lambda x: compute_valence_scores(x)[1])
  tokens_df['word_freq'] = tokens_df['lemma'].apply(lambda x: get_word_freq(x))

  return (sentences, tokens_df)

In [71]:
# Get story text
# schoolmistress_sentences, schoolmistress_words = get_story_sentences('schoolmistress')
# el_sentences, el_words = get_story_sentences('expensivelessons')
# schoolmistress_ia_words = get_ia_words('schoolmistress')
# el_ia_words = get_ia_words('expensivelessons')

# el_words.to_csv("el_words.csv")
# schoolmistress_words.to_csv("schoolmistress_words.csv")
# pd.DataFrame.from_dict(el_sentences).to_csv("el_sentences.csv")
# pd.DataFrame.from_dict(schoolmistress_sentences).to_csv("schoolmistress_sentences.csv")
# pd.DataFrame.from_dict(el_ia_words).to_csv("el_ia_words.csv")
# pd.DataFrame.from_dict(schoolmistress_ia_words).to_csv("schoolmistress_ia_words.csv")

In [61]:
el_words = pd.read_csv("el_words.csv")
schoolmistress_words = pd.read_csv("schoolmistress_words.csv")
el_sentences = pd.read_csv("el_sentences.csv")
schoolmistress_sentences = pd.read_csv("schoolmistress_sentences.csv")
el_ia_words= pd.read_csv("el_ia_words.csv")
schoolmistress_ia_words= pd.read_csv("schoolmistress_ia_words.csv")

In [62]:
schoolmistress_valence = []
schoolmistress_arousal = []
for index, row in schoolmistress_words.iterrows():
    val_ar = compute_valence_scores(row['word'])
    schoolmistress_valence.append(val_ar[0])
    schoolmistress_arousal.append(val_ar[1])


In [63]:
el_valence = []
el_arousal = []
for index, row in el_words.iterrows():
    val_ar = compute_valence_scores(row['word'])
    el_valence.append(val_ar[0])
    el_arousal.append(val_ar[1])


In [64]:
schoolmistress_words['valence'] = schoolmistress_valence
schoolmistress_words['arousal'] = schoolmistress_arousal
el_words['valence'] = el_valence
el_words['arousal'] = el_arousal

In [65]:
def convert_to_json(file_path):
  f = open(file_path, 'r')
  dict_list = f.readlines()
  return list(map(lambda x: json.loads(x), dict_list))

In [82]:
def get_features_dataframe(story, words, sentences):
  word_features = pd.DataFrame.from_dict(words)
  word_features = word_features[['sentence', 'word_length', 'concreteness', 'valence', 'arousal', 'word_freq']].applymap(lambda x: None if x == '.' else x)
  word_features_clean = pd.DataFrame(word_features, dtype='float')
  sentence_features = word_features_clean.groupby('sentence').agg(word_length=('word_length', 'mean'), concreteness=('concreteness', 'mean'), valence_avg=('valence', 'mean'), valence_max=('valence', 'max'), valence_min=('valence', 'min'), arousal_avg=('arousal', 'mean'), arousal_max=('arousal', 'max'), arousal_min=('arousal', 'min'), word_freq=('word_freq', 'sum'))
  # get book nlp features
  emotion_json = convert_to_json(f'../emotional_story_arcs/data/kelsey/results/{story}.emotion')
  features_df = pd.concat([pd.DataFrame.from_dict(emotion_json), 
              pd.read_csv(f'../emotional_story_arcs/data/kelsey/results/{story}.sentiment', sep='\t', names=['negative', 'neutral', 'positive'])], axis=1)
  features_df = pd.concat([features_df, sentence_features], axis=1)
  return (pd.concat([features_df, sentences], axis=1))

In [67]:
def resolve_nulls(arr, k, word_counts):
    new_arr = np.zeros(arr.size)
    for i in range(arr.size):
        if arr[i] > 0 and np.isnan(arr[i]) == False:
            new_arr[i] = arr[i]
        else:
            lower = i-k if i>k else 0
            upper = i+k if i+k < arr.size else arr.size
            new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
            if(new_arr[i] == 0):
                new_arr[i] = np.nanmean(arr)*(word_counts[i]/np.nanmean(word_counts))
    return new_arr

In [80]:
def process_logs(filename, features, words, OUTLIER_COUNT):
  # append story text to eyelink file
  eyelink_data = pd.read_csv(f'ia_files/{filename}.txt', sep='\t', low_memory=False)
  eyelink_data2 = eyelink_data
  words_df = pd.DataFrame(words)

  ia_df_subset = eyelink_data[['IA_DWELL_TIME', 'IA_REGRESSION_PATH_DURATION', 'IA_AVERAGE_FIX_PUPIL_SIZE', 
        'IA_REGRESSION_IN_COUNT','IA_REGRESSION_OUT_FULL_COUNT']] \
        .applymap(lambda x: None if x == '.' else x)


  ia_df_clean = pd.DataFrame(ia_df_subset, dtype='float')
  eyelink_with_text = pd.concat([words_df, ia_df_clean], axis=1)
#   lower_array, upper_array = get_iqr(eyelink_with_text, 'IA_DWELL_TIME')
  Q1 = np.percentile(eyelink_with_text['IA_DWELL_TIME'], 25,
                       method = 'midpoint')
  Q3 = np.percentile(eyelink_with_text['IA_DWELL_TIME'], 80,
                       method = 'midpoint')
  IQR = Q3 - Q1
    # Upper bound
  upper=Q3+1.5*IQR
  upper_array=eyelink_with_text[eyelink_with_text['IA_DWELL_TIME'] >= upper].index
  total_words=eyelink_with_text.shape[0]  
  OUTLIER_COUNT += upper_array.size
# Removing the outliers
  eyelink_with_text.drop(upper_array,inplace=True)
  # aggregate columns to get sentence vals
  ia_sentences_df = eyelink_with_text.groupby('sentence')\
      .agg({ 'IA_DWELL_TIME': 'sum', 'IA_REGRESSION_PATH_DURATION': 'sum',
                    'IA_AVERAGE_FIX_PUPIL_SIZE': 'mean', 'IA_REGRESSION_IN_COUNT': 'sum',
                    'IA_REGRESSION_OUT_FULL_COUNT': 'sum', 'word': 'count'}, dtype='float')

  # set null values to the average of the surrounding values
  k = 5
  dwell_time = resolve_nulls(ia_sentences_df['IA_DWELL_TIME'].to_numpy(dtype='float'), k, ia_sentences_df['word'].to_numpy(dtype='float'))
  regression_path_duration = resolve_nulls(ia_sentences_df['IA_REGRESSION_PATH_DURATION'].to_numpy(dtype='float'), k, ia_sentences_df['word'].to_numpy(dtype='float'))
  avg_pupil_size = resolve_nulls(ia_sentences_df['IA_AVERAGE_FIX_PUPIL_SIZE'].to_numpy(dtype='float'), k, ia_sentences_df['word'].to_numpy(dtype='float'))
  regression_in_count = resolve_nulls(ia_sentences_df['IA_REGRESSION_IN_COUNT'].to_numpy(dtype='float'), k, ia_sentences_df['word'].to_numpy(dtype='float'))
  regression_out_count = resolve_nulls(ia_sentences_df['IA_REGRESSION_OUT_FULL_COUNT'].to_numpy(dtype='float'), k, ia_sentences_df['word'].to_numpy(dtype='float'))
  ia_resolved_mat = np.array([dwell_time, regression_path_duration, avg_pupil_size, regression_in_count, regression_out_count]).transpose()
  eyelink_data_resolved = pd.DataFrame(ia_resolved_mat, columns=['IA_DWELL_TIME_SMOOTHED', 'IA_REGRESSION_PATH_DURATION_SMOOTHED', 'IA_AVERAGE_FIX_PUPIL_SIZE_SMOOTHED', 
        'IA_REGRESSION_IN_COUNT_SMOOTHED','IA_REGRESSION_OUT_FULL_COUNT_SMOOTHED'])

  # Get highlight categories
  highlights = pd.read_csv(f'./highlights/{filename}.csv').drop(['Unnamed: 0', 'percent_highlighted'], axis=1)
  # Story features + some eye tracking features in one DataFrame by sentence
  sentences_with_features = pd.concat([eyelink_data2[['RECORDING_SESSION_LABEL']][0:len(ia_sentences_df)], ia_sentences_df], axis=1)
  sentences_with_features['story'] = 'SM' if filename.split('_')[1].split('.')[0] == 'schoolmistress' else 'EL'
  sentences_with_features = pd.concat([highlights, sentences_with_features], axis=1)
  sentences_with_features = pd.concat([sentences_with_features, features], axis=1)
  sentences_with_features = pd.concat([sentences_with_features, eyelink_data_resolved], axis=1)
  sentences_with_features['engagement_score'] = [compute_engagement_score(filename.split('_')[0], filename.split('_')[1].replace('.txt', ''))] * len(sentences_with_features)
  sentences_with_features.to_csv(f"./results/{filename}.csv", )
  return (sentences_with_features, OUTLIER_COUNT, total_words)

In [83]:
IA_DIR = "./ia_files/"
total_words=0
schoolmistress_features = get_features_dataframe('schoolmistress', schoolmistress_words, schoolmistress_sentences)
el_features = get_features_dataframe('expensivelessons', el_words, el_sentences)
for filename in os.listdir(IA_DIR):
  f = os.path.join(IA_DIR,filename)
  if os.path.isfile(f):
      filename = f.replace(IA_DIR, '').replace('.txt', '')
      if 'schoolmistress' in filename:
        sm_feat, OUTLIER_COUNT, word_count = process_logs(filename, schoolmistress_features, schoolmistress_ia_words, OUTLIER_COUNT)
      elif 'el' in filename:
        sm_feat, OUTLIER_COUNT, word_count = process_logs(filename, el_features, el_ia_words, OUTLIER_COUNT)
      total_words+=word_count


  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))
  new_arr[i] = np.nanmean(arr[lower:upper])*(word_counts[i]/np.nanmean(word_counts[lower:upper]))


In [84]:
print(f"Expensive Lessons avg score: {np.round(engagement_total['el'] / 23, 2)}, schoolmistress avg score: {np.round(engagement_total['schoolmistress'] / 23, 2)}")


Expensive Lessons avg score: 4.19, schoolmistress avg score: 3.87


In [85]:
# fill NaN values
result_df = pd.read_csv("./results/id11_el.csv")
# fill_vals = {'IA_DWELL_TIME': 0, 'IA_REGRESSION_PATH_DURATION': 0,
#         'IA_AVERAGE_FIX_PUPIL_SIZE': result_df['IA_AVERAGE_FIX_PUPIL_SIZE'].mean(),
#         'IA_REGRESSION_IN_COUNT': 0, 'IA_REGRESSION_OUT_COUNT': 0}
        
fill_vals = {'IA_DWELL_TIME': 'empty', 'IA_REGRESSION_PATH_DURATION': 'empty',
        'IA_AVERAGE_FIX_PUPIL_SIZE': 'empty',
        'IA_REGRESSION_IN_COUNT': 'empty', 'IA_REGRESSION_OUT_FULL_COUNT': 'empty'}
result_df = result_df.fillna(value=fill_vals)
result_df = result_df[(result_df['IA_DWELL_TIME'] == 'empty') | \
         (result_df['IA_REGRESSION_PATH_DURATION'] == 'empty') | \
          (result_df['IA_AVERAGE_FIX_PUPIL_SIZE'] == 'empty') | \
            (result_df['IA_REGRESSION_IN_COUNT'] == 'empty') | \
            (result_df['IA_REGRESSION_OUT_FULL_COUNT'] == 'empty')
        ]

In [86]:
def populate_empties(df):
  fill_vals = {'IA_DWELL_TIME': 'empty', 'IA_REGRESSION_PATH_DURATION': 'empty',
        'IA_AVERAGE_FIX_PUPIL_SIZE': 'empty',
        'IA_REGRESSION_IN_COUNT': 'empty', 'IA_REGRESSION_OUT_FULL_COUNT': 'empty'}
  return df.fillna(value=fill_vals)

In [87]:
empty_report = result_df
for filename in os.listdir('./results'):
    f = os.path.join('./results',filename)
    if os.path.isfile(f) and f != './results/id11_el.csv':
      curr_df = populate_empties(pd.read_csv(f))
      curr_empties = curr_df[(curr_df['IA_DWELL_TIME'] == 'empty') | \
         (curr_df['IA_REGRESSION_PATH_DURATION'] == 'empty') | \
          (curr_df['IA_AVERAGE_FIX_PUPIL_SIZE'] == 'empty') | \
            (curr_df['IA_REGRESSION_IN_COUNT'] == 'empty') | \
            (curr_df['IA_REGRESSION_OUT_FULL_COUNT'] == 'empty')
        ]
      empty_report = pd.concat([empty_report, curr_empties], axis=0)

empty_report['tally'] = [1]*len(empty_report)
empty_report.to_csv("empty_rows.csv")


In [88]:
grouped_empty_report = empty_report.groupby('RECORDING_SESSION_LABEL').agg({"tally": 'sum'})
grouped_empty_report.to_csv("grouped_empty_report.csv")

In [89]:
# Resolve null values
