<a href="https://colab.research.google.com/github/massivetexts/llm_aut_study/blob/main/notebook/Process_AUT_GT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook collects, normalizes and processes human-judged AUT ground truth from various sources. This is the data used in [Beyond Semantic Distance: Automated Scoring of Divergent Thinking Greatly Improves with Large Language Models](https://www.researchgate.net/publication/363456838_Beyond_Semantic_Distance_Automated_Scoring_of_Divergent_Thinking_Greatly_Improves_with_Large_Language_Models/stats).

In [None]:
#@title import and setup defs
import pandas as pd
import os
import numpy as np
import hashlib
from pathlib import Path
from sklearn.model_selection import train_test_split
import json

!pip -qq install pingouin
import pingouin as pg

def simple_stats(df, rater_cols=False):
    """Print summary counts for a dataset and, optionally, rater agreement.

    Always prints the number of unique prompts, the number of unique
    participants, the number of rows and the unique prompt values.

    When `rater_cols` is a non-empty list of column names, the rating
    columns are additionally melted to long format (one row per id/rater)
    and the pingouin intraclass-correlation table is displayed, rounded to
    two decimals. Missing ratings are handled with nan_policy='omit'.
    """
    unique_prompts = df['prompt'].unique()
    print("# of prompts", len(unique_prompts))
    print("# of participants", len(df['participant'].unique()))
    print("# of data points", len(df))
    print("Prompts", unique_prompts)

    # Nothing more to report without rater columns.
    if not rater_cols:
        return

    print("# of raters", len(rater_cols))
    print("Intraclass correlation coefficients (report ICC2k)")
    long_ratings = df[['id'] + rater_cols].melt(
        id_vars='id', var_name='rater', value_name='rating'
    )
    icc_table = pg.intraclass_corr(
        data=long_ratings, targets='id', raters='rater',
        ratings='rating', nan_policy='omit'
    )
    display(icc_table.round(2))

include_rater_std = True #@param {type:'boolean'}
# Columns kept for every normalized dataset; 'rating_std' is appended only
# when the form option above is enabled.
returncols = ['src', 'question', 'prompt', 'response', 'id', 'target', 'participant', 'response_num'] + (
    ['rating_std'] if include_rater_std else []
)

def prep_general(data, src, rater_cols=None, return_full=False,
                 drop_noresponse=True, include_stats=True, round_adjust=False,
                 include_rater_std=False, overwrite_q=False):
    ''' General cleaning that repeats for multiple datasets.

    The input frame is never modified: all work happens on a copy.

    Parameters
    ----------
    data : DataFrame with at least `response`, `prompt`, `participant` and
        the rater columns. `response_num` is also required unless
        `return_full` is True. An optional `task` column is used in place
        of `prompt` when building ids.
    src : short string identifying the source dataset. It is written to the
        `src` column and prefixed to participant ids and item ids.
    rater_cols : list of rating column names. When empty/None, every column
        whose lower-cased name contains 'rater' is used.
    return_full : return every column instead of the standard subset.
    drop_noresponse : drop rows whose `response` is missing.
    include_stats : call `simple_stats` on the cleaned data.
    round_adjust : nudge the average rating 1/1000 of the way toward the
        row median, so later rounding breaks ties in the median's direction.
    include_rater_std : add `rating_std`, the per-row standard deviation of
        the rater columns.
    overwrite_q : regenerate `question` even when the column already exists.

    Returns
    -------
    DataFrame with the columns src, question, prompt, response, id, target,
    participant, response_num (plus rating_std when requested), or the full
    frame when `return_full` is True.
    '''
    if not rater_cols:
        # assume columns that say 'rater'
        rater_cols = [col for col in data.columns if 'rater' in col.lower()]

    print("Rater cols:", rater_cols)

    if drop_noresponse:
        data = data[~data.response.isna()]

    # Work on an explicit copy: assigning columns to a filtered slice raises
    # SettingWithCopyWarning and, when no filtering happened, would silently
    # modify the caller's DataFrame.
    data = data.copy()

    data['src'] = src
    data['avg_rating'] = data[rater_cols].mean(axis=1)
    if round_adjust:
        # add a tiny bit to avg rater to round in direction of median. Only need if there are tiebreakers
        data['median_rating'] = data[rater_cols].median(axis=1)
        data['avg_rating'] = data.avg_rating + (data.median_rating - data.avg_rating).div(10**3)
    if include_rater_std:
        data['rating_std'] = data[rater_cols].std(axis=1)

    # Rescale the averaged ratings onto the shared target scale.
    data['target'] = normalize_values(data.avg_rating)

    missing = data.target.isna()
    if missing.sum():
        print(f'Dropping {missing.sum()} unrated items')
        data = data[~missing].copy()

    # Prefix participant ids with the source so they stay distinct across datasets.
    data['participant'] = src + data['participant'].astype(str)
    if ('question' not in data.columns) or overwrite_q:
        data['question'] = data.prompt.apply(lambda x: f"What is a surprising use for a {x.upper()}?")

    # Short hash of participant+response. The response is cast to str so that
    # non-string cells (e.g. numbers read from Excel) do not break .encode().
    # NOTE(review): only 4 hex digits of the md5 are kept, so ids are not
    # guaranteed to be unique within a source/prompt.
    hash_input = data.participant + data.response.astype(str)
    idhash = hash_input.apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest()[:4])
    # task can be a custom string that is used instead of prompt (e.g. 'g2_red' instead of 'red')
    id_source_col = 'task' if 'task' in data.columns else 'prompt'
    data['id'] = f'{src}_' + data[id_source_col].astype(str) + '-' + idhash

    if include_stats:
        simple_stats(data, rater_cols)

    if return_full:
        return data

    cols = ['src', 'question', 'prompt', 'response', 'id', 'target', 'participant', 'response_num']
    if include_rater_std:
        cols += ['rating_std']
    return data[cols]

# Registry of normalized datasets, keyed by short source name.
datasets = {}

[K     |████████████████████████████████| 185 kB 8.3 MB/s 
[K     |████████████████████████████████| 9.8 MB 32.3 MB/s 
[?25h  Building wheel for pingouin (setup.py) ... [?25l[?25hdone
  Building wheel for littleutils (setup.py) ... [?25l[?25hdone


In [None]:
# Folder holding the not-yet-public inputs and receiving the exported archives.
# The form field supplies a string; it is then rebound to a pathlib.Path so
# later cells can call .glob() on it.
base_dir = 'drive/MyDrive/Grants/MOTES/Data/aut_ground_truth' #@param { type: 'string' }
base_dir = Path(base_dir)

In [None]:
#@title Normalization Settings
#@markdown Ensure that all the datasets have the same scale
scale_min = 1 #@param{type:'number'}
scale_max = 5 #@param{type:'number'}
#@markdown Round to nearest 0.1 by default.
round_func = lambda x: x.round(1) #@param ["lambda x: x.multiply(2).round(0).div(2)", "None", "lambda x: x.round(1)"] {type:"raw"}

def normalize_values(series, outrange=(scale_min, scale_max), oldrange=None,
                     round_func=round_func):
    """Linearly rescale `series` onto `outrange` and optionally round it.

    Parameters
    ----------
    series : pandas Series (or array-like supporting min/max and arithmetic)
        of values to rescale.
    outrange : (low, high) of the output scale. Defaults to the
        `scale_min`/`scale_max` form settings above. Like `round_func`, the
        default is bound when this cell runs, so re-run the cell after
        changing the form.
    oldrange : (low, high) of the input scale. When omitted, the observed
        minimum and maximum of `series` are used.
    round_func : callable applied to the rescaled values, or None to skip
        rounding.

    Returns
    -------
    The rescaled (and possibly rounded) values, same shape as `series`.
    A constant input with no `oldrange` has zero width and yields NaN.
    """
    new_min, new_max = outrange
    if oldrange:
        old_min, old_max = oldrange
    else:
        old_min, old_max = series.min(), series.max()

    # Map onto [0, 1] first, then stretch to the requested output range.
    unit = (series - old_min) / (old_max - old_min)
    scaled = new_min + (new_max - new_min) * unit
    if round_func:
        # none of our data has even numbers of raters, so there shouldn't be need for rounding tiebreakers
        # but for future proofing, add or subtract a tiny number randomly
        jitter = (2 * np.random.randint(0, 2, size=scaled.shape) - 1) / 10**5
        scaled = round_func(scaled + jitter)
    return scaled



## Normalize Dumas et al 2020

Download from OSF.

In [None]:
# Download the Dumas et al. (2020) file from OSF and save it as dod20.csv.
!wget -q -O dod20.csv https://osf.io/download/u3yv4/

src = 'dod20'
# The first CSV column is used as the index.
paca = pd.read_csv('dod20.csv', index_col=0)
# Drop rows whose response text contains '!!!'.
paca = paca[~paca.response.str.contains('!!!')]
# Run the shared cleaning and register the result under the source key.
datasets[src] = prep_general(paca, src, include_rater_std=include_rater_std)

Rater cols: ['rater1', 'rater2', 'rater3', 'rater4']
# of prompts 10
# of participants 92
# of data points 5490
Prompts ['book' 'bottle' 'brick' 'fork' 'pants' 'rope' 'shoe' 'shovel' 'table'
 'tire']
# of raters 4
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.57,6.25,5349,16050,0.0,"[0.55, 0.58]"
1,ICC2,Single random raters,0.58,7.75,5349,16047,0.0,"[0.49, 0.65]"
2,ICC3,Single fixed raters,0.63,7.75,5349,16047,0.0,"[0.62, 0.64]"
3,ICC1k,Average raters absolute,0.84,6.25,5349,16050,0.0,"[0.83, 0.85]"
4,ICC2k,Average random raters,0.84,7.75,5349,16047,0.0,"[0.79, 0.88]"
5,ICC3k,Average fixed raters,0.87,7.75,5349,16047,0.0,"[0.87, 0.88]"


## Process Silvia et al. 2009

Silvia, P. J., Nusbaum, E. C., Berg, C., Martin, C., & O'Connor, A. (2009). Openness to experience, plasticity, and creativity: Exploring lower-order, high-order, and interactive effects. Journal of Research in Personality, 43(6), 1087–1090. https://doi.org/10.1016/j.jrp.2009.04.015

In [None]:
# Download the Silvia et al. (2009) file from OSF and save it as silvia09.csv.
!wget -q -O silvia09.csv https://osf.io/download/qdrv8/
src = 'snbmo09'
# Rename to the column names that prep_general expects.
data = pd.read_csv('silvia09.csv').rename(columns={'subject':'participant','response_order':'response_num'})
# The prompt is taken to be the last underscore-separated piece of `task`.
data['prompt'] = data.task.apply(lambda x: x.split('_')[-1])
# Run the shared cleaning and register the result under the source key.
datasets[src] = prep_general(data, src, include_rater_std=include_rater_std)

Rater cols: ['rater_1', 'rater_2', 'rater_3', 'rater_4']
Dropping 10 unrated items
# of prompts 3
# of participants 202
# of data points 4099
Prompts ['brick' 'knife' 'box']
# of raters 4
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.33,2.97,4059,12180,0.0,"[0.31, 0.35]"
1,ICC2,Single random raters,0.36,4.1,4059,12177,0.0,"[0.25, 0.45]"
2,ICC3,Single fixed raters,0.44,4.1,4059,12177,0.0,"[0.42, 0.45]"
3,ICC1k,Average raters absolute,0.66,2.97,4059,12180,0.0,"[0.65, 0.68]"
4,ICC2k,Average random raters,0.69,4.1,4059,12177,0.0,"[0.57, 0.77]"
5,ICC3k,Average fixed raters,0.76,4.1,4059,12177,0.0,"[0.74, 0.77]"


## Process Silvia et al Data 2008

From https://osf.io/dh7ey, this was the order of creativity tasks:

1. Please list all of the creative, unusual uses for a brick that you can think of.
2. Please list all of the creative, unusual instances of things that are round that you can think of.
3. Imagine that people no longer needed to sleep. Please list creative, unusual consequences that would follow.
4. Please list all of the creative, unusual uses for a knife that you can think of.
5. Please list all of the creative, unusual instances of things that will make a noise that you can think of.
6. Imagine that everyone shrank to 12 inches tall. Please list creative, unusual consequences that would follow.

Numbers 1 and 4 are AUT.



In [None]:
# pyreadstat is needed to read the SPSS .sav files contained in the archive.
!pip -q install pyreadstat
# Download the OSF storage folder as a zip and unpack it into the working directory.
# NOTE(review): the URL is unquoted although it contains `?`, a shell glob character.
!wget -q -O SilvaStudy2.zip https://files.osf.io/v1/resources/4ketx/providers/osfstorage/5dd70d1f83135e000ec3c242/?zip=
# -o: overwrite existing files without prompting, so the cell can be re-run.
!unzip -o SilvaStudy2.zip
import pyreadstat

[K     |████████████████████████████████| 2.5 MB 8.2 MB/s 
[?25hArchive:  SilvaStudy2.zip
  inflating: DT_Responses_PACA_2008_Study_2.sav  
  inflating: Summary_Data_Average_T2_and_Snapshot_PACA_2008_Study_2.sav  
  inflating: PACA_2008_Input.inp     
  inflating: DT.dat                  


In [None]:
src = 'setal08'
# Task number (1-based) -> prompt keyword, in the order listed in the markdown above.
promptref = ['brick', 'round', 'no sleep', 'knife', 'noise', 'shrank']
qref = ["What is a surprising use for a BRICK?",
        "What are surprising things that are ROUND?",
        "What are surprising consequences if people needed NO SLEEP?",
        "What is a surprising use for a KNIFE?",
        "What are surprising things that make a NOISE?",
        "What are surprising consequences if everyone SHRANK to 12 inches tall?"]
qref_dict = dict(zip(promptref, qref))

# The archive was unzipped into the working directory by the previous cell,
# so read the file relative to it rather than from a hard-coded /content path.
df, meta = pyreadstat.read_sav('DT_Responses_PACA_2008_Study_2.sav')
df = df.rename(columns={'subject':'participant', 'order':'response_num'})
df['prompt'] = df.task.apply(lambda x: promptref[int(x)-1])
# Focus only on AUT task; copy so the column assignment below does not
# write into a slice of the unfiltered frame.
df = df[df.prompt.isin(['brick', 'knife'])].copy()
# Cast to int so participant ids do not carry a trailing '.0' once stringified.
df['participant'] = df['participant'].astype(int)
# doublecheck - burczak reported ICC2k as 0.48
df = prep_general(df, src, include_stats=True, return_full=True,
                  include_rater_std=include_rater_std)
# Replace the generic question text with the task-specific wording.
df = df.assign(question=df.prompt.apply(lambda x: qref_dict[x]))
datasets[src] = df[returncols]
datasets[src].sample()

Rater cols: ['rater1', 'rater2', 'rater3']
Dropping 7 unrated items
# of prompts 2
# of participants 241
# of data points 3425
Prompts ['brick' 'knife']
# of raters 3
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.12,1.42,3359,6720,0.0,"[0.1, 0.14]"
1,ICC2,Single random raters,0.23,2.76,3359,6718,0.0,"[0.06, 0.39]"
2,ICC3,Single fixed raters,0.37,2.76,3359,6718,0.0,"[0.35, 0.39]"
3,ICC1k,Average raters absolute,0.3,1.42,3359,6720,0.0,"[0.25, 0.34]"
4,ICC2k,Average random raters,0.48,2.76,3359,6718,0.0,"[0.15, 0.66]"
5,ICC3k,Average fixed raters,0.64,2.76,3359,6718,0.0,"[0.62, 0.66]"


Unnamed: 0,src,question,prompt,response,id,target,participant,response_num,rating_std
128,setal08,What is a surprising use for a BRICK?,brick,use as a weapon,setal08_1.0-4906,1.7,setal084,2.0,0.57735


## Process Hofelich Mohr, Sell, and Lindsay

Doublecheck ICC2k - burczak paper had icc2k=0.67

In [None]:
!wget -qO hofelich.zip https://conservancy.umn.edu/bitstream/handle/11299/172116/HMSL_CSV%20Data%20Files.zip?sequence=28&isAllowed=y
!unzip -qo hofelich.zip

In [None]:
src = 'hmsl'
df = pd.read_csv('HMSL_Originality_scores_all.csv')
df = df.rename(columns={'Item': 'prompt', 'QLogin_1':'participant'})
rater_cols = ['J1_Rating','J2_Rating','J3_Rating','J4_Rating']
# remove three ratings that are 11
# np.nan (rather than pd.NA) keeps the rating columns numeric; pd.NA would
# turn them into object dtype before the mean/std are computed.
df[rater_cols] = df[rater_cols].replace(11, np.nan)
datasets[src] = prep_general(df, src, rater_cols=rater_cols, include_rater_std=include_rater_std)
datasets[src].sample(2)

Rater cols: ['J1_Rating', 'J2_Rating', 'J3_Rating', 'J4_Rating']
Dropping 23 unrated items
# of prompts 2
# of participants 638
# of data points 3843
Prompts ['paperclip' 'brick']
# of raters 4
Intraclass correlation coefficients (report ICC2k)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.3,2.7,3658,10977,0.0,"[0.28, 0.32]"
1,ICC2,Single random raters,0.33,3.84,3658,10974,0.0,"[0.22, 0.43]"
2,ICC3,Single fixed raters,0.42,3.84,3658,10974,0.0,"[0.4, 0.43]"
3,ICC1k,Average raters absolute,0.63,2.7,3658,10977,0.0,"[0.61, 0.65]"
4,ICC2k,Average random raters,0.67,3.84,3658,10974,0.0,"[0.53, 0.75]"
5,ICC3k,Average fixed raters,0.74,3.84,3658,10974,0.0,"[0.73, 0.75]"


Unnamed: 0,src,question,prompt,response,id,target,participant,response_num,rating_std
534,hmsl,What is a surprising use for a PAPERCLIP?,paperclip,push reset button,hmsl_paperclip-8967,3.3,hmsl0vNR3LcA,6.0,1.5
1103,hmsl,What is a surprising use for a BRICK?,brick,Doorstop,hmsl_brick-51d4,1.2,hmslrcjJj04f,7.0,0.5


## Process dataset used by Beaty and Johnson

From SemDis paper:

- Study 1 was re-analysis of AUT responses from Beaty et al., 2018 to see if ensemble approaches work better. Two tests: `box` and `rope`
   - According to their paper, additive composition yielded a slightly negative correlation, while for multiplicative composition 'results revealed a large correlation between latent semantic distance and human ratings:$r=.91$, p<.001'. This uses a model that weighs the factors, but is (I think) tailored to the dataset without held out data.

- Study 2 was re-analysis of results from Silvia et al. 2017, also on box and rope 
- Study 3 was brick - yet again - via Beaty and Silvia 2012
- Study 4 and 5- Heinen and Johnson (2018) - were noun matching, not relevant here

In [None]:
# Download the OSF storage folder as a zip and unpack it into the working directory.
# NOTE(review): the URL is unquoted although it contains `?`, a shell glob character.
!wget -qO beaty.zip https://files.osf.io/v1/resources/gz4fc/providers/osfstorage/5e45b6c73e86a800be6e662e/?zip=
!unzip -qo beaty.zip
# Long-format data files, one per Study* folder, sorted by path so that the
# order is stable; the next cell pairs the first three with source names.
files = sorted(list(Path('.').glob('Study*/*data_long.xlsx')))

In [None]:
# Source keys for the first three study files, in sorted-path order.
beaty_sources = ['betal18', 'snb17', 'bs12']
for fname, src in zip(files[:3], beaty_sources):
    # Banner so the stats printed for each study are easy to tell apart.
    print(src.upper().center(30, '-'))
    df = (
        pd.read_excel(fname)
        .rename(columns={'id':'participant', 'item':'prompt'})
    )
    # No response order is set for these files; the column is still needed
    # because prep_general selects it.
    df['response_num'] = None
    datasets[src] = prep_general(df, src, include_rater_std=include_rater_std)

-----------BETAL18------------
Rater cols: ['rater1', 'rater2', 'rater3', 'rater4']
# of prompts 2
# of participants 171
# of data points 2918
Prompts ['box' 'rope']
# of raters 4
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.52,5.25,2864,8595,0.0,"[0.5, 0.53]"
1,ICC2,Single random raters,0.52,6.17,2864,8592,0.0,"[0.46, 0.58]"
2,ICC3,Single fixed raters,0.56,6.17,2864,8592,0.0,"[0.55, 0.58]"
3,ICC1k,Average raters absolute,0.81,5.25,2864,8595,0.0,"[0.8, 0.82]"
4,ICC2k,Average random raters,0.81,6.17,2864,8592,0.0,"[0.77, 0.85]"
5,ICC3k,Average fixed raters,0.84,6.17,2864,8592,0.0,"[0.83, 0.85]"


------------SNB17-------------
Rater cols: ['rater1', 'rater2', 'rater3']
# of prompts 2
# of participants 142
# of data points 2372
Prompts ['box' 'rope']
# of raters 3
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.38,2.82,2330,4662,0.0,"[0.35, 0.4]"
1,ICC2,Single random raters,0.4,3.57,2330,4660,0.0,"[0.29, 0.5]"
2,ICC3,Single fixed raters,0.46,3.57,2330,4660,0.0,"[0.44, 0.49]"
3,ICC1k,Average raters absolute,0.64,2.82,2330,4662,0.0,"[0.62, 0.67]"
4,ICC2k,Average random raters,0.67,3.57,2330,4660,0.0,"[0.55, 0.75]"
5,ICC3k,Average fixed raters,0.72,3.57,2330,4660,0.0,"[0.7, 0.74]"


-------------BS12-------------
Rater cols: ['br_rater1', 'br_rater2', 'br_rater3']
# of prompts 1
# of participants 133
# of data points 1807
Prompts ['brick']
# of raters 3
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.43,3.24,1775,3552,0.0,"[0.4, 0.46]"
1,ICC2,Single random raters,0.46,4.47,1775,3550,0.0,"[0.3, 0.57]"
2,ICC3,Single fixed raters,0.54,4.47,1775,3550,0.0,"[0.51, 0.56]"
3,ICC1k,Average raters absolute,0.69,3.24,1775,3552,0.0,"[0.67, 0.72]"
4,ICC2k,Average random raters,0.72,4.47,1775,3550,0.0,"[0.56, 0.8]"
5,ICC3k,Average fixed raters,0.78,4.47,1775,3550,0.0,"[0.76, 0.79]"


### Doublecheck Beaty data
Confirm that this data is what's seen in table 1 of paper - this is correct.

In [None]:
# Read relative to the working directory, where beaty.zip was unpacked,
# instead of assuming the notebook runs from /content.
test = pd.read_excel('Study 1/s1_data_agg.xlsx')
# Keep the 'b_' and 'r_' prefixed columns, except those ending in '_a',
# and show their pairwise correlations.
check_cols = [col for col in test
              if col.startswith(('b_', 'r_')) and not col.endswith('_a')]
test[check_cols].corr().round(2)

Unnamed: 0,b_rater1,b_rater2,b_rater3,b_rater4,b_cbowu,b_cbows,b_cboww,b_tasa,b_glov,r_rater1,r_rater2,r_rater3,r_rater4,r_cbowu,r_cbows,r_cboww,r_tasa,r_glov
b_rater1,1.0,0.65,0.54,0.73,0.46,0.32,0.43,0.3,0.26,0.29,0.27,0.36,0.37,0.37,0.32,0.36,0.25,0.37
b_rater2,0.65,1.0,0.59,0.74,0.48,0.37,0.45,0.27,0.24,0.38,0.38,0.48,0.43,0.4,0.42,0.43,0.35,0.42
b_rater3,0.54,0.59,1.0,0.73,0.38,0.45,0.36,0.22,0.21,0.37,0.39,0.51,0.48,0.36,0.39,0.37,0.29,0.35
b_rater4,0.73,0.74,0.73,1.0,0.45,0.38,0.39,0.22,0.26,0.43,0.41,0.54,0.51,0.49,0.45,0.5,0.37,0.45
b_cbowu,0.46,0.48,0.38,0.45,1.0,0.76,0.84,0.36,0.8,0.17,0.17,0.31,0.31,0.42,0.39,0.48,0.28,0.43
b_cbows,0.32,0.37,0.45,0.38,0.76,1.0,0.69,0.32,0.57,0.1,0.13,0.23,0.21,0.35,0.34,0.44,0.19,0.4
b_cboww,0.43,0.45,0.36,0.39,0.84,0.69,1.0,0.41,0.71,0.09,0.1,0.2,0.21,0.41,0.4,0.46,0.17,0.45
b_tasa,0.3,0.27,0.22,0.22,0.36,0.32,0.41,1.0,0.31,-0.05,0.06,0.08,0.1,0.1,0.13,0.15,0.02,0.12
b_glov,0.26,0.24,0.21,0.26,0.8,0.57,0.71,0.31,1.0,0.07,0.08,0.19,0.22,0.37,0.31,0.39,0.22,0.33
r_rater1,0.29,0.38,0.37,0.43,0.17,0.1,0.09,-0.05,0.07,1.0,0.65,0.74,0.74,0.32,0.33,0.32,0.33,0.29


## Normalize MOTES Pilot Data

Score on multi-point scale.

MOTES is related to the "Measuring Original Thinking in Elementary Students: A Text-Mining Approach" (IES #R305A200519). This dataset is very, very new and not yet ready to share. Hopefully it is open access by the time our first LLM paper is published, until then I think we can share with researchers by request (contact <peter.organisciak@du.edu> and/or <selcuk.acar@unt.edu>)

In [None]:
src = 'motesp'
pilot_path = os.path.join(base_dir, 'motes_pilot_gt_scores.csv')
# Rename the rating columns and the participant id to the names prep_general expects.
pilot_renames = dict(D='rater1', K='rater2', T='rater3', ID='participant')
motes_pilot = pd.read_csv(pilot_path).rename(columns=pilot_renames)
motes_pilot['response_num'] = None
motes_pilot = prep_general(motes_pilot, src, include_stats=False, return_full=True, include_rater_std=include_rater_std)

# Normalize question styles: apply the regex rewrites one after another.
question_rewrites = [
    ("What would be a surprising", "What is a surprising"),
    ("Think of an example of something that is (.*)", "What is a surprising example of something \\1?"),
    ("(When .*)", "\\1..."),
]
normalized_question = motes_pilot.question
for pattern, replacement in question_rewrites:
    normalized_question = normalized_question.str.replace(pattern, replacement, regex=True)
motes_pilot.question = normalized_question
datasets[src] = motes_pilot[returncols]

# Report stats only for rows whose id contains '_g1_'.
g1_mask = motes_pilot.id.str.contains('_g1_')
simple_stats(motes_pilot[g1_mask], ['rater1', 'rater2', 'rater3'])

Rater cols: ['rater1', 'rater2', 'rater3']
# of prompts 10
# of participants 35
# of data points 339
Prompts ['backpack' 'ball' 'bottle' 'hat' 'lightbulb' 'pencil' 'shoe' 'sock'
 'spoon' 'toothbrush']
# of raters 3
Intraclass correlation coefficients (report ICC2k)


Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.58,5.15,338,678,0.0,"[0.52, 0.63]"
1,ICC2,Single random raters,0.58,5.28,338,676,0.0,"[0.52, 0.64]"
2,ICC3,Single fixed raters,0.59,5.28,338,676,0.0,"[0.53, 0.64]"
3,ICC1k,Average raters absolute,0.81,5.15,338,678,0.0,"[0.77, 0.84]"
4,ICC2k,Average random raters,0.81,5.28,338,676,0.0,"[0.77, 0.84]"
5,ICC3k,Average fixed raters,0.81,5.28,338,676,0.0,"[0.77, 0.84]"


## Process MOTES Main Data

This is the post-pilot data. Only including Game 1 here, which is AUT - future work will absolutely expand to other divergent thinking tasks: AUT is the focus right now simply for research clarity - there are many initial LLM questions to study so a narrower scope is needed to be able to give those questions proper attention.

As with the pilot data, this dataset is not yet released, but please reach out.

In [None]:
# Load the combined sheet; -999 is replaced with a missing value everywhere.
df = pd.read_excel('MOTES 2021-2022 Data FINAL.xlsx', sheet_name='Combined Data FINAL').replace(-999, pd.NA)
# first thing first - drop sensitive columns because they're not needed here
df = df.drop(columns=['Campus', 'Grade', 'Classroom'])
# Participant id = first 4 hex digits of the md5 of the `Order` value.
# NOTE(review): 4 hex digits allow only 65,536 distinct ids, so two
# participants can end up sharing one - confirm this is acceptable.
df['participant'] = df.Order.astype(str).apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest()[:4])

In [None]:
src = 'motesf'
# Item codes: every column starting with 'G' and ending in '_prompt', minus that suffix.
items = [col.replace('_prompt', '') for col in df.columns if col.startswith('G') and col.endswith('_prompt')]

collector = []
# RESHAPE TO long
for item in items:
    # All columns for this item, plus the participant id.
    subset = df[['participant'] + [col for col in df.columns if col.startswith(item)]].copy()
    # Keep only the part after the last underscore, so every item shares the
    # same column names and the pieces can be stacked.
    subset.columns = [col.split('_')[-1] for col in subset.columns]
    # The game code is the part of the item code before the first underscore.
    subset['game'] = item.split('_')[0]
    subset['prompt_code'] = item
    collector.append(subset)
reshaped = pd.concat(collector)
# remove non-responses
reshaped = reshaped[~reshaped.raw.isna()]
# restore original wording in the test
reshaped.prompt =reshaped.prompt.str.replace('light bulbs', 'lightbulb').str.replace('hat cap', 'hat').str.replace('soccer ball', 'ball').str.replace('lead pencil', 'pencil').str.replace('spoons', 'spoon')

# add display order
# Columns whose name contains 'DO' are melted to one row per participant/column;
# presumably each cell holds the item code shown in that position - TODO confirm.
displayorder = df[['participant'] + [col for col in df.columns if 'DO' in col]]
displayorder = displayorder.melt(id_vars='participant', value_name='prompt_code')
# response_num is the last character of the melted column name (kept as a string).
displayorder['response_num'] = displayorder.variable.apply(lambda x:x[-1])
# Default inner merge on the shared columns (participant, prompt_code).
reshaped = reshaped.merge(displayorder[['participant', 'prompt_code', 'response_num']])
# use spelling corrected response
reshaped = reshaped.rename(columns={'corrected':'response'})
# FOCUS JUST ON GAME 1
reshaped = reshaped[reshaped.game == 'G1']

# IF EXPANDING PAST G1, NEED PROPER QUESTION FORMATS
# Rater columns here are those whose lower-cased name contains 'score'.
datasets[src] = prep_general(reshaped, src,
                             rater_cols=[col for col in reshaped if 'score' in col.lower()],
                             include_rater_std=include_rater_std)
datasets[src].sample()

## Combine data and basic stats

Check correlation among ratings for responses which have been submitted more than once. Here, I sample one rating vs. the mean of all the ratings for a duplicate response. This contextualizes the upper bound on what a model might be able to do - if humans can't agree (sometimes with themselves!), then it would be impossible for a model to do so.

In [None]:
#@markdown Average rating-to-rating correlation on duplicates
# run multiple times with different samples
from tqdm import trange

# The combined frame does not change between iterations, so build it once
# instead of re-concatenating on each of the 1000 passes.
all_items = pd.concat(datasets.values())
corrs = []
for i in trange(1000):
    # Seeded shuffle: which duplicate ends up 'first' or 'last' depends on row order.
    check_dupe_corr = all_items.sample(frac=1, random_state=i**2)
    # Rows whose (prompt, response) pair occurs more than once.
    just_duped = check_dupe_corr[check_dupe_corr[['prompt', 'response']].duplicated(keep=False)]
    first = just_duped.drop_duplicates(['prompt', 'response'], keep='first')
    last = just_duped.drop_duplicates(['prompt', 'response'], keep='last')
    # Pair the first and last occurrence of each duplicate and correlate their targets.
    merged = first.merge(last[['prompt', 'response', 'target']], how='inner', on=['prompt', 'response'])
    corr = merged[['target_x', 'target_y']].corr().values[0,1]
    corrs.append(corr)
print("\nAverage correlation among duplicates", sum(corrs)/len(corrs))

100%|██████████| 1000/1000 [00:55<00:00, 17.86it/s]


Average correlation among duplicates 0.8284398822991509





In [None]:
#@markdown Average rating-to-mean(other ratings) correlation on duplicates
# The combined frame does not change between iterations, so build it once.
all_items = pd.concat(datasets.values())
corrs = []
for i in trange(1000):
    # Seeded like the previous cell so the reported average is reproducible.
    check_dupe_corr = all_items.sample(frac=1, random_state=i**2)
    # duplicated() flags every occurrence except the first, so these are the
    # mean targets of the 'other' instances of each repeated response.
    later_dupes = check_dupe_corr[check_dupe_corr[['prompt', 'response']].duplicated()]
    means_of_dupes = later_dupes.groupby(['prompt','response'], as_index=False).target.mean().round(2)
    merged = check_dupe_corr.merge(means_of_dupes[['prompt', 'response', 'target']], on=['prompt', 'response'])
    # Correlate only the two target columns: calling .corr() on the whole
    # frame raises on non-numeric columns in pandas >= 2.0.
    corr = merged[['target_x', 'target_y']].corr().loc['target_x', 'target_y']
    corrs.append(corr)
print("\nAverage correlation among duplicates", sum(corrs)/len(corrs))

100%|██████████| 1000/1000 [01:09<00:00, 14.39it/s]


Average correlation among duplicates 0.8800950237339823





Merge ratings for items with duplicates, so that a response that has been rated multiple times has the average of all instances as its ground truth.

In [None]:
# Stack every registered dataset and shuffle the rows with a fixed seed.
data = pd.concat(datasets.values()).sample(frac=1, random_state=1234)
# Fix prompt grammar and spelling
# 'a PANTS' -> 'PANTS' (regex with a back-reference); 'LIGHTBULB' -> 'LIGHT BULB'.
data.question = data.question.str.replace('a (PANTS)', r'\1', regex=True).str.replace('LIGHTBULB', 'LIGHT BULB')
#data.prompt = data.prompt.str.replace('lightbulb', 'light bulb')

aut_only = True #@param {type:'boolean'}
if aut_only:
    # Drop every row whose id contains one of these snippets.
    # NOTE(review): prep_general builds ids as '<src>_<task or prompt>-<hash>',
    # so the 'setal08-...' snippets may never match - confirm.
    exclude_from_ids = ['motesp_g2', 'motesp_g3', 'motesp_g4', 'setal08-noise', 'setal08-round', 'setal08-no sleep', 'setal08-shrank']
    for snippet in exclude_from_ids:
        data = data[~data.id.str.contains(snippet)]

drop_missing = True #@param {type:'boolean'}
if drop_missing:
    data = data[~data.target.isna()]
print(f"Pre-dedupe data size is {len(data)} items")
print(f'# of unique participants is {len(data.participant.unique())}')
print("# of unique prompts", len(data.prompt.unique()))

display(data['src'].value_counts())

combine_gt_for_dupes = True #@param {type:'boolean'}
if combine_gt_for_dupes:
    # Per (prompt, response): number of instances, mean and std of target, rounded to 0.1.
    dupe_means = data.groupby(['prompt', 'response'], as_index=False).target.aggregate(['count', 'mean', 'std']).reset_index().round(1)
    # Only pairs that occur more than once get a combined target.
    dupe_means = dupe_means[dupe_means['count'] > 1]
    # Left merge on the shared columns, so non-duplicated rows get NaN statistics.
    data = data.merge(dupe_means, how='left')
    data.loc[~data['mean'].isna(), 'target'] = data['mean'].dropna() # set target to be mean
    # Only 'mean' and 'std' are removed; the 'count' column stays in `data`.
    data = data.drop(columns=['mean', 'std'])

ensure_no_dupes = True #@param {type: 'boolean'}
og_size = len(data)
if ensure_no_dupes:
        # Keep the first row of each (prompt, response) pair in the shuffled order.
        data = data.drop_duplicates(['prompt', 'response'])
        print(f"Dropped {og_size-len(data)} duplicate items. {100*(og_size-len(data))/og_size:.2f}%")
        print(f"Final data size is {len(data)} items")

print('\nproportion of representation by each prompt')
print((data.prompt.value_counts() / len(data)).multiply(100).round(2))

print("\nNumber of datasets with each prompt")
# Count distinct (prompt, src) pairs per prompt.
print(data[['prompt', 'src']].drop_duplicates().prompt.value_counts())

Pre-dedupe data size is 27217 items
# of unique participants is 2039
# of unique prompts 21


dod20      5490
snbmo09    4099
hmsl       3843
setal08    3425
motesf     2924
betal18    2918
snb17      2372
bs12       1807
motesp      339
Name: src, dtype: int64

Dropped 7015 duplicate items. 25.77%
Final data size is 20202 items

proportion of representation by each prompt
brick         25.68
box           14.14
knife         10.78
rope          10.35
paperclip      6.88
bottle         4.15
book           2.42
table          2.29
pants          2.19
tire           2.04
fork           2.01
ball           1.95
spoon          1.91
lightbulb      1.91
pencil         1.91
shoe           1.90
hat            1.89
sock           1.88
toothbrush     1.88
shovel         1.68
backpack       0.17
Name: prompt, dtype: float64

Number of datasets with each prompt
brick         5
rope          3
box           3
bottle        3
spoon         2
lightbulb     2
ball          2
toothbrush    2
shoe          2
sock          2
pencil        2
knife         2
hat           2
shovel        1
fork          1
table         1
pants         1
tire          1
book          1
paperclip     1
backpack      1
Name: prompt, dtype: int64


In [None]:
from scipy.stats import entropy

# scipy's entropy() is applied to the raw target values of each prompt;
# results are sorted ascending, rounded to 3 decimals and shown as a dict.
per_prompt_targets = data.groupby('prompt')['target']
per_prompt_targets.apply(entropy).sort_values().round(3).to_dict()

## Save Dataset

In [None]:
name = 'gt_main_std' #@param {type:'string'}
random_seed = 987 #@param {type:'number'}

# ensure subfolder to avoid rm'ing something dumb
data_dir = os.path.join('data', name)
!rm -rf $data_dir

print(f'####{name}')
print(f"- Data size is {len(data)} items")
types = ['train', 'val', 'test']
os.makedirs(data_dir, exist_ok=True)
splits = {'train': 80, 'val': 5, 'test': 15} #@param {type: 'raw'}
print('seed', random_seed)
print('targetsplits', splits)
#@markdown ### Data to Include

#@markdown ### Parameters
#@markdown If splitting data by participant, individuals will be entirely in a single split.
#@markdown If paired with ensure_no_dupes, the test is split out, then the train data is deduped
#@markdown so no test data is seen. This will imbalance the split sizes from expectation.
split_by_participant = False #@param {type: 'boolean'}
#@markdown Split where test data has different prompts than train.
split_by_prompt = False #@param {type: 'boolean'}
try:
    assert not (split_by_participant and split_by_prompt)
except:
    raise AssertionError("Can't split by participant *and* prompt")

print(f'split_by_part: {split_by_participant}; split_by_prompt: {split_by_prompt}')

#@markdown `dry_run`: don't save outputs
dry_run = False #@param {type: 'boolean'}

examples = dict()

def fingerprint(x):
    """Return `x` lower-cased with every non-alphabetic character removed.

    Used to compare prompt+response strings while ignoring case, spacing,
    digits and punctuation.
    """
    return "".join(filter(str.isalpha, x.lower()))

if split_by_participant or split_by_prompt:

    sample = dict()
    splitcol = 'participant' if split_by_participant else 'prompt'
    if split_by_participant:
        unique_vals = data[splitcol].unique().tolist()

        sample['train'], sample['test'] = train_test_split(unique_vals, train_size=splits['train']/sum(splits.values()), shuffle=True, random_state=random_seed)
        sample['val'], sample['test']  = train_test_split(sample['test'], train_size=splits['val']/(splits['val']+splits['test']), shuffle=True, random_state=random_seed)
    elif split_by_prompt:
        # most common prompts
        promptn = data.prompt.value_counts() / len(data)
        # put aside the n most plentiful prompts, and add them to train later
        sample['train'] = promptn[:4].index.tolist()
        # recalc, to use splits based on the rest of the data
        #remaining = data[~data.prompt.isin(sample['train'])]
        #promptn = remaining.prompt.value_counts() / len(remaining)
        promptn = promptn[~promptn.index.isin(sample['train'])]
        
        # fill val and test, then add remainder to train
        pool = promptn.sample(frac=1, random_state=random_seed).cumsum()
        sample['val'] = pool[(pool-promptn.mean()/2) < splits['val']/100].index.tolist()
        promptn = promptn[~promptn.index.isin(sample['val'])]

        pool = promptn.sample(frac=1, random_state=random_seed).cumsum()
        sample['test'] = pool[(pool-promptn.mean()/2) < splits['test']/100].index.tolist()
        promptn = promptn[~promptn.index.isin(sample['test'])]

        sample['train'] += promptn.index.tolist()
        print('train:', sample['train'])
        print('val:', sample['val'])
        print('test:', sample['test'])

    for splittype in types:
        examples[splittype] = data[data[splitcol].isin(sample[splittype])].to_dict(orient='records')

    if ensure_no_dupes and split_by_participant: #split by prompt is already deduped
        exclude_list = [fingerprint(x['prompt']+x['response']) for x in examples['test']+examples['val']]
        examples['train_og'] = examples['train']
        examples['train'] = [x for x in examples['train'] if fingerprint(x['prompt']+x['response']) not in exclude_list]

else:
    examples['train'], examples['test'] = train_test_split(data.to_dict(orient='records'), train_size=splits['train']/sum(splits.values()), shuffle=True, random_state=random_seed)
    examples['val'], examples['test']  = train_test_split(examples['test'],
                                                        train_size=splits['val']/(splits['val']+splits['test']), shuffle=True, random_state=random_seed)

sizes = [len(examples[splittype]) for splittype in types]
print("Final split sizes:", [np.round(100*x/sum(sizes), 1) for x in sizes])

if not dry_run:
    for split_type, split in splits.items():
        os.makedirs(os.path.join(data_dir, split_type), exist_ok=True)

        for item in examples[split_type]:
            with open(os.path.join(data_dir, split_type, f"{item['id']}.json"), mode='w') as f:
                json.dump(item, f)

    !tar -czf "{name}.tar.gz" data/${name}
    !mv "{name}.tar.gz" "{base_dir}"

####gt_main_std
- Data size is 20202 items
seed 987
targetsplits {'train': 80, 'val': 5, 'test': 15}
split_by_part: False; split_by_prompt: False
Final split sizes: [80.0, 5.0, 15.0]


In [None]:
# Number of training items contributed by each source dataset.
pd.DataFrame(examples['train'])['src'].value_counts()

dod20      3329
snbmo09    2322
motesf     2244
hmsl       2194
setal08    1944
betal18    1388
snb17      1360
bs12       1102
motesp      278
Name: src, dtype: int64

In [None]:
# Number of test items contributed by each source dataset.
pd.DataFrame(examples['test'])['src'].value_counts()

dod20      578
hmsl       435
snbmo09    420
motesf     416
setal08    401
betal18    272
snb17      237
bs12       226
motesp      46
Name: src, dtype: int64

# Dataset Reference

####gt_main2
- Data size is 20202 items
- seed 987
- targetsplits {'train': 80, 'val': 5, 'test': 15}
- split_by_part: False; split_by_prompt: False
- Final split sizes: [80.0, 5.0, 15.0]
- (gt_main_std *should* be identical, with stdev included, but I haven't doublechecked)

####gt_byparticipant
- Data size is 20202 items
- seed 987
- targetsplits {'train': 80, 'val': 5, 'test': 15}
- split_by_part: True; split_by_prompt: False
- Final split sizes: [80.7, 4.7, 14.6]

####gt_byprompt
- Data size is 20202 items
- seed 987
- targetsplits {'train': 79, 'val': 4, 'test': 17}
- split_by_part: False; split_by_prompt: True
- train: ['brick', 'box', 'knife', 'rope', 'book', 'table', 'tire', 'fork', 'ball', 'pencil', 'lightbulb', 'shoe', 'hat', 'sock', 'toothbrush', 'backpack']
- val: []
- test: ['paperclip', 'spoon', 'bottle', 'shovel', 'pants']
- Final split sizes: [83.2, 0.0, 16.8]

####all (*not* deduped - meant for final model to share)
- Data size is 27217 items
seed 987
targetsplits {'train': 94, 'val': 1, 'test': 5}
split_by_part: False; split_by_prompt: False
Final split sizes: [94.0, 1.0, 5.0]

In [None]:
print("All gt options")
# Archive names in base_dir, cut at the first dot ('gt_main.tar.gz' -> 'gt_main').
archive_names = []
for archive in base_dir.glob('*tar.gz'):
    archive_names.append(archive.stem.split('.')[0])
print(archive_names)

All gt options
['gt_nosplit1', 'gt_bypart2', 'gt_byprompt2', 'gt_bypart3', 'gt_main', 'gt_byprompt3']
