## Environment Setup and Import

In [2]:
#@markdown Please change `JUPYTER_DIR` if it is located differently in your drive. This cell will error if `JUPYTER_DIR` is incorrect.
import ipywidgets as widgets
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
from io import BytesIO
from math import pi
from math import ceil
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from zipfile import ZipFile
# Mount Google Drive so the notebook can read and write project files under /content/drive
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the project folder. The relative 'Data/...'
# paths used by later cells are resolved against it, and os.chdir raises
# FileNotFoundError when JUPYTER_DIR does not exist.
import os
JUPYTER_DIR = '/content/drive/My Drive/Field Day/Research, Writing and PResenting/2022 GLS Jo Wilder Snark/Jupyter' #@param {type:"string"}
os.chdir(JUPYTER_DIR)
print(f'---\nCWD: {os.getcwd()}')

# Load the rpy2 IPython extension, which provides the %%R cell magic used by the R cells below
%load_ext rpy2.ipython

#@markdown Change pandas `max_rows` and `max_columns`
pd.options.display.max_columns = 4000 #@param {type:"integer"}
pd.options.display.max_rows = 60 #@param {type:"integer"}

#@markdown *Note: There may be other variables to manually change. Look at the "Set Variables" section.*

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: ignored

Enjoyment (index 2, 7, 12, 17):
- The game grabs my attention.
- Time flies while I'm playing the game.
- I forget what's around me while playing the game.
- I feel emotionally involved in the game.

History (index 3, 8, 13, 14):
- I like watching TV shows about history.
- I like reading about history.
- I like learning history very much.
- I think learning history is fun.

Likability (index 4, 9, 10, 15):
- Jo is friendly.
- I like Jo.
- Jo is kind.
- I can relate to Jo.

Humor (index 5, 6, 11, 16):
- I think the characters are funny.
- The characters say things that make me laugh.
- The characters say funny things.
- The characters are entertaining.

(see https://docs.google.com/document/d/1lavYFQzQm99sGVd1TiymTh3JJVFsCcSlByg1oNPGqRM/edit)

# Filtering and File Export for R

**Feature Definitions:**
- **script_type:** a number representing the script that was used for a given session (0 = Dry, 1 = No Humor, 2 = No Snark, 3 = Normal; see **Script Types** below)
- **sess_time_active:** time spent actively playing in the session
- **max_level:** the max level a player reached in the session
- **sa_index:** a number representing the response to a given survey question
- **sa_text:** the actual text representation of the response (agree, somewhat agree, neutral, somewhat disagree, disagree)

**Script Types:**
- **Dry** (no humor or snark)
- **No Humor** (includes snark)
- **No Snark** (includes humor, can be described as "obedient")
- **Normal** (base script, includes snark and humor)

In [None]:
# Load the raw dataset, recode survey answers, filter sessions, and build the
# populations (main, stronger readers, completed) that later cells analyse.

def data_fpath(fname):
  """Return the path of `fname` inside the project folder JUPYTER_DIR."""
  return os.path.join(JUPYTER_DIR, fname)

#survey_path = 'survey_fixed.csv'
survey_path = 'Data/raw_dataset.tsv'
df = pd.read_csv(data_fpath(survey_path), sep='\t')

# Thresholds used by the session filters below
MIN_ACTIVE_TIME = 60         # sessions must have more active time than this
MAX_ACTIVE_TIME = 4000       # sessions at or above this active time are treated as outliers
FIRST_CHAPTER_MAX_LEVEL = 3  # sessions must reach a level beyond this one
FINAL_LEVEL = 23             # max_level of a session that completed the game
GAME_VERSION = 10            # only sessions on this game version are kept

# Survey question text, keyed by the N in the SA{N}_sa_index column names
questions = {
    0: 'What grade are you in?',
    1: 'How well do you read in English?',
    2: 'The game grabs my attention.',
    3: 'I like watching TV shows about history.',
    4: 'Jo is friendly.',
    5: 'I think the characters are funny.',
    6: 'The characters say things that make me laugh.',
    7: 'Time flies while I\'m playing the game.',
    8: 'I like reading about history.',
    9: 'I like Jo.',
    10: 'Jo is kind.',
    11: 'The characters say funny things.',
    12: 'I forget what\'s around me while playing the game.',
    13: 'I like learning history very much.',
    14: 'I think learning history is fun.',
    15: 'I can relate to Jo.',
    16: 'The characters are entertaining.',
    17: 'I feel emotionally involved in the game.'
}

# Inverts the 0-4 answer scale (0 <-> 4, 1 <-> 3, 2 unchanged)
answer_map = {0.0: 4.0, 1.0: 3.0, 2.0: 2.0, 3.0: 1.0, 4.0: 0.0}

# Invert every odd-numbered item from 3 upward (3, 5, ..., 17).
# NOTE(review): presumably these items were recorded with the opposite scale
# direction - confirm against the survey definition.
for i in range(3, len(questions), 2):
  df[f'SA{i}_sa_index'] = df[f'SA{i}_sa_index'].map(answer_map)

print(f'Sessions Before Filtering: {df.shape[0]}')

# Filter out sessions with very high or low play time
time_filter = df['sess_time_active'] > MIN_ACTIVE_TIME
df = df[time_filter]

outlier_filter = df['sess_time_active'] < MAX_ACTIVE_TIME
df = df[outlier_filter]

# Filter out sessions that only played the first chapter
level_filter = df['max_level'] > FIRST_CHAPTER_MAX_LEVEL
df = df[level_filter]

# Filter out sessions that continued or used a save code.
# Save codes are compared as text so the check also matches rows that pandas
# parsed as the integer 0 instead of the string '0'.
save_filter = df['save_code'].astype(str) == '0'
df = df[save_filter]

continue_filter = df['continue'] == 0
df = df[continue_filter]

# Filter out sessions not using the newest version of the game.
# Copy the result so the column assignment below writes to an independent
# frame instead of a slice of the unfiltered data (avoids SettingWithCopyWarning).
version_filter = df['version'] == GAME_VERSION
df = df[version_filter].copy()

# Map script type values to more descriptive labels
types = {0:'Dry', 1:'No Humor', 2:'No Snark', 3:'Normal'}
df['script_type'] = df['script_type'].map(types)

main_population = df

# Create a more filtered df with only stronger readers
reading_filter = main_population['SA1_sa_index'] >= 1.0 # only normal or strong
stronger_reader_population = main_population[reading_filter]

grade_filter = stronger_reader_population['SA0_sa_index'] > 0.0 # no 3rd graders
stronger_reader_population = stronger_reader_population[grade_filter]

# Filter only sessions that completed game
completed_filter = main_population['max_level'] == FINAL_LEVEL
main_completed_population = main_population[completed_filter]
main_completed_population_groups = main_completed_population.groupby('script_type')

print(f'Base Population Count After Filtering: {main_population.shape[0]}')
print(f'Base Population who completed game Count After Filtering: {main_completed_population.shape[0]}')
print(f'Stronger Reader Population Count After Filtering: {stronger_reader_population.shape[0]}')


# Group data for comparing across script types in the main population.
# get_group raises KeyError if a script type has no sessions left after filtering.
main_groups = main_population.groupby('script_type')

main_dry = main_groups.get_group('Dry')
main_no_humor = main_groups.get_group('No Humor')
main_no_snark = main_groups.get_group('No Snark')
main_normal = main_groups.get_group('Normal')

print(f'Main Population Dry Script Sessions: {main_dry.shape[0]}')
print(f'Main Population No Humor Script Sessions: {main_no_humor.shape[0]}')
print(f'Main Population No Snark Script Sessions: {main_no_snark.shape[0]}')
print(f'Main Population Normal Script Sessions: {main_normal.shape[0]}')

# Group data for comparing across script types in the stronger_reader population
stronger_reader_groups = stronger_reader_population.groupby('script_type')

stronger_reader_dry = stronger_reader_groups.get_group('Dry')
stronger_reader_no_humor = stronger_reader_groups.get_group('No Humor')
stronger_reader_no_snark = stronger_reader_groups.get_group('No Snark')
stronger_reader_normal = stronger_reader_groups.get_group('Normal')

print(f'Stronger Reader Population Dry Script Sessions: {stronger_reader_dry.shape[0]}')
print(f'Stronger Reader Population No Humor Script Sessions: {stronger_reader_no_humor.shape[0]}')
print(f'Stronger Reader Population No Snark Script Sessions: {stronger_reader_no_snark.shape[0]}')
print(f'Stronger Reader Population Normal Script Sessions: {stronger_reader_normal.shape[0]}')


In [None]:
# Groom and Export to Files for use in R

# Survey item numbers (the N in SA{N}_sa_index) that make up each subscale.
# These are the same item sets the R cells select for the reliability analysis.
SUBSCALE_ITEMS = {
    'enjoyment_score': [2, 7, 12, 17],
    'history_score': [3, 8, 13, 14],
    'likability_score': [4, 9, 10, 15],
    'humor_score': [5, 6, 11, 16],
}


def _score_population(population, final_level=23):
  """Return a copy of `population` with subscale scores and a completion flag added.

  Rows without an answer to item 5 are dropped. Each subscale score is the
  mean of all items listed in SUBSCALE_ITEMS, shifted by -2 so the midpoint of
  the 0-4 answer scale becomes 0. `completion` is 1.0 when `max_level` equals
  `final_level` and 0.0 otherwise.
  """
  # .copy() makes the filtered rows an independent frame, so the column
  # assignments below do not trigger SettingWithCopyWarning.
  scored = population[population['SA5_sa_index'].notnull()].copy()
  for score_name, items in SUBSCALE_ITEMS.items():
    item_columns = [f'SA{item}_sa_index' for item in items]
    # mean(axis=1) skips missing answers, so a partially answered subscale
    # is averaged over the items that were answered.
    scored[score_name] = scored[item_columns].mean(axis=1) - 2
  scored['completion'] = np.where(scored['max_level'] == final_level, 1.0, 0.0)
  return scored


main_population = _score_population(main_population)
main_population.to_csv('Data/main_population.csv')

stronger_reader_population = _score_population(stronger_reader_population)
stronger_reader_population.to_csv('Data/stronger_reader_population.csv')


In [None]:
%%R
# Load the exported populations and pull out the four survey items of each
# subscale for the reliability analysis.
library(tidyverse)
# Install psych only when it is missing, so re-running the cell does not reinstall it.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)

main_population = read_csv('Data/main_population.csv')
main_population_enjoyment = main_population %>% select(SA2_sa_index, SA7_sa_index, SA12_sa_index, SA17_sa_index)
main_population_history = main_population %>% select(SA3_sa_index, SA8_sa_index, SA13_sa_index, SA14_sa_index)
main_population_likability = main_population %>% select(SA4_sa_index, SA9_sa_index, SA10_sa_index, SA15_sa_index)
main_population_humor = main_population %>% select(SA5_sa_index, SA6_sa_index, SA11_sa_index, SA16_sa_index)

stronger_readers = read_csv('Data/stronger_reader_population.csv')
stronger_readers_enjoyment = stronger_readers %>% select(SA2_sa_index, SA7_sa_index, SA12_sa_index, SA17_sa_index)
stronger_readers_history = stronger_readers %>% select(SA3_sa_index, SA8_sa_index, SA13_sa_index, SA14_sa_index)
stronger_readers_likability = stronger_readers %>% select(SA4_sa_index, SA9_sa_index, SA10_sa_index, SA15_sa_index)
stronger_readers_humor = stronger_readers %>% select(SA5_sa_index, SA6_sa_index, SA11_sa_index, SA16_sa_index)

# Testing Subscale Reliability

In [None]:
%%R
# Cronbach's alpha (internal consistency) of the four enjoyment items, main population
psych::alpha(main_population_enjoyment)

In [None]:
%%R
# Cronbach's alpha (internal consistency) of the four history items, main population
psych::alpha(main_population_history)

In [None]:
%%R
# Cronbach's alpha (internal consistency) of the four likability items, main population
psych::alpha(main_population_likability)

In [None]:
%%R
# Cronbach's alpha (internal consistency) of the four humor items, main population
psych::alpha(main_population_humor)

# Q1: Script Effect on Player Experience

In [None]:
%%R
library(tidyverse)

# Read an exported population CSV and return the modelling columns, with the
# categorical predictors converted to labelled factors.
load_population_factors = function(csv_path) {
  read_csv(csv_path) %>%
    rename(
      grade_level = SA0_sa_index,
      reading_level = SA1_sa_index
    ) %>%
    select(max_level, completion, script_type, grade_level, reading_level,
           enjoyment_score, history_score, likability_score, humor_score) %>%
    mutate(
      # "Normal" is listed first, which makes it the reference level of the factor.
      script_type = factor(script_type, levels = c("Normal", "No Snark", "No Humor", "Dry")),
      grade_level = factor(grade_level, levels = c(0, 1, 2, 3, 4),
                           labels = c("3rd", "4th", "5th", "6th", "Other")),
      # "Average" is made the reference level for reading_level.
      reading_level = relevel(
        factor(reading_level, levels = c(0, 1, 2),
               labels = c("Below Average", "Average", "Above Average")),
        ref = "Average"
      )
    )
}

main_population_factors = load_population_factors('Data/main_population.csv')
# The %%R magic only auto-prints the value of the last expression in a cell,
# so each summary is printed explicitly.
print(summary(main_population_factors))

stronger_reader_population_factors = load_population_factors('Data/stronger_reader_population.csv')
print(summary(stronger_reader_population_factors))

In [None]:
def report_anova_and_tukey(population, score_column, heading):
  """Print a one-way ANOVA and a Tukey HSD comparison of a score across script types.

  Args:
    population: DataFrame with a 'script_type' column and `score_column`.
    score_column: name of the numeric column to compare between script types.
    heading: text printed in front of the ANOVA result.
  """
  # Drop sessions without a value for this score; scipy's f_oneway returns NaN
  # when any input contains NaN.
  scored = population.dropna(subset=[score_column])
  score_by_script = [
      scored.loc[scored['script_type'] == script, score_column]
      for script in ('Dry', 'No Humor', 'No Snark', 'Normal')
  ]
  print(heading, stats.f_oneway(*score_by_script), '\n')
  print('Tukey: ', pairwise_tukeyhsd(endog=scored[score_column], groups=scored['script_type'], alpha=0.05))


# Main population: one report per subscale score
for heading, score_column in (
    ('Main Population - Enjoyment: ', 'enjoyment_score'),
    ('\nMain Population - Humor', 'humor_score'),
    ('\nMain Population - Likeability', 'likability_score'),
    ('\nMain Population - History', 'history_score'),
):
  report_anova_and_tukey(main_population, score_column, heading)

# Stronger-reader population: same reports. The last heading names the
# population actually analysed; it previously read 'Main Population - History'.
for heading, score_column in (
    ('Strong Reader Population - Enjoyment: ', 'enjoyment_score'),
    ('\nStrong Reader Population - Humor', 'humor_score'),
    ('\nStrong Reader Population - Likeability', 'likability_score'),
    ('\nStrong Reader Population - History', 'history_score'),
):
  report_anova_and_tukey(stronger_reader_population, score_column, heading)

In [None]:
# Tukey's HSD test of each survey response in stronger readers
def tukey_test(df, index):
  """Run Tukey's HSD across script types on the answers to survey item `index`.

  Rows of `df` with no answer for the item are excluded before the test.
  Returns the result object produced by pairwise_tukeyhsd (alpha = 0.05).
  """
  answer_column = f'SA{index}_sa_index'
  answered = df.dropna(subset=[answer_column])
  return pairwise_tukeyhsd(
      endog=answered[answer_column],
      groups=answered['script_type'],
      alpha=0.05,
  )


# The questions dict is keyed 0..17 in order, so iterating its keys visits every item once.
for i in questions:
  print(f'Question {i+1}: {questions[i]}\n', tukey_test(stronger_reader_population, i), '\n')

# Q2: Script Effect on Game Time and Completion

In [None]:
# Main population: compare active session time between the four script types
print('ANOVA: ', stats.f_oneway(*(
    main_population.loc[main_population['script_type'] == script, 'sess_time_active']
    for script in ('Dry', 'No Humor', 'No Snark', 'Normal')
)), '\n')

print('Tukey: ', pairwise_tukeyhsd(
    endog=main_population['sess_time_active'],
    groups=main_population['script_type'],
    alpha=0.05,
))

In [None]:
# Stronger readers: compare active session time between the four script types
print('ANOVA: ', stats.f_oneway(*(
    stronger_reader_population.loc[stronger_reader_population['script_type'] == script, 'sess_time_active']
    for script in ('Dry', 'No Humor', 'No Snark', 'Normal')
)), '\n')

print('Tukey: ', pairwise_tukeyhsd(
    endog=stronger_reader_population['sess_time_active'],
    groups=stronger_reader_population['script_type'],
    alpha=0.05,
))

In [None]:
# Main population: compare the highest level reached between the four script types
print('ANOVA: ', stats.f_oneway(*(
    main_population.loc[main_population['script_type'] == script, 'max_level']
    for script in ('Dry', 'No Humor', 'No Snark', 'Normal')
)), '\n')

print('Tukey: ', pairwise_tukeyhsd(
    endog=main_population['max_level'],
    groups=main_population['script_type'],
    alpha=0.05,
))

In [None]:
# Stronger readers: compare the highest level reached between the four script types
print('ANOVA: ', stats.f_oneway(*(
    stronger_reader_population.loc[stronger_reader_population['script_type'] == script, 'max_level']
    for script in ('Dry', 'No Humor', 'No Snark', 'Normal')
)), '\n')

print('Tukey: ', pairwise_tukeyhsd(
    endog=stronger_reader_population['max_level'],
    groups=stronger_reader_population['script_type'],
    alpha=0.05,
))

# Q3 Factors that influence progress

In [None]:
%%R
# Main population: Poisson regression (log link) of the highest level reached
# on script type, grade, reading level and the four subscale scores.
pois <- glm(
  max_level ~ script_type + grade_level + reading_level +
    enjoyment_score + history_score + likability_score + humor_score,
  family = poisson(link = "log"),
  data = main_population_factors
)
summary(pois)

In [None]:
%%R
# Stronger readers: Poisson regression (log link) of the highest level reached
# on script type, grade, reading level and the four subscale scores.
pois <- glm(
  max_level ~ script_type + grade_level + reading_level +
    enjoyment_score + history_score + likability_score + humor_score,
  family = poisson(link = "log"),
  data = stronger_reader_population_factors
)
summary(pois)

# Q4 Factors that influence Enjoyment

In [None]:
%%R
# Main population: linear regression of the enjoyment score on script type,
# grade, reading level and the other three subscale scores.
enjoyment <- lm(
  enjoyment_score ~ script_type + grade_level + reading_level +
    history_score + likability_score + humor_score,
  data = main_population_factors
)
summary(enjoyment)

In [None]:
%%R
# Stronger readers: linear regression of the enjoyment score on script type,
# grade, reading level and the other three subscale scores.
enjoyment <- lm(
  enjoyment_score ~ script_type + grade_level + reading_level +
    history_score + likability_score + humor_score,
  data = stronger_reader_population_factors
)
summary(enjoyment)