In [1]:
import open_scoring as ocs
import pandas as pd

Here's how the original and reshaped data look:

In [3]:
# Spreadsheet with one row per participant and one column per prompt;
# each cell holds that participant's newline-separated responses (see output below).
aut_xls_path = '../Measurement study/Participant level data/AlternateUses.xls'
data = ocs.file.WideData(aut_xls_path)

# Peek at a single, randomly chosen row of the untouched input table.
# sample() is unseeded, so a different row is shown on every run.
data._original.sample(1)

Unnamed: 0,participant,book,rope,fork,table,pants,bottle,brick,tire,shovel,shoe
86,87.0,weight\npress to flatten things\nfuel for a fi...,macrame\nswing\nunwind into smaller fibers for...,comb hair\nturn into jewelry\nstick in ground ...,"take off legs, use top as a panel, door, etc\n...",cut apart to use for sewing projects\ntie legs...,"fill with water, turn upside down in soil for ...",paperweight\ndoorstop\nuse with boards for boo...,use as a planter\nuse as a tire swing\nshred r...,use as a writing surface\nuse as a post\nuse h...,planter\nuse soles for impact bumpers\nuse sol...


In [4]:
# data.df is the reshaped table: one row per (participant, prompt, response_num)
# with the response text in its own column (see output below).
# sample() is unseeded, so the rows shown change on every run.
data.df.sample(2)

Unnamed: 0,participant,prompt,response_num,response
10075,48.0,shoe,10,paint canvas
2013,82.0,rope,2,to tie a knot


## Scoring

Scoring is done with the AUT_Scorer class.

You need to tell the scorer where your vector space model files are located. TASA and EN_100k are available to download at [http://www.lingexp.uni-tuebingen.de/z2/LSAspaces/](http://www.lingexp.uni-tuebingen.de/z2/LSAspaces/). GloVe is available at [https://nlp.stanford.edu/projects/glove/](https://nlp.stanford.edu/projects/glove/). Citations for both are listed below.


- Günther, F., Dudschig, C., & Kaup, B. (2015). LSAfun - An R package for computations based on Latent Semantic Analysis. *Behavior Research Methods*, 47, 930-944.
- Jeffrey Pennington, Richard Socher, and Christopher D. Manning. (2014). GloVe: Global Vectors for Word Representation.

In [5]:
scorer = ocs.scoring.AUT_Scorer()

# Short model name (used later when scoring) -> location of the model file on disk.
# Adjust these paths to wherever the downloaded models live on your machine.
model_paths = {
    'EN_100_lsa': '/data/tasa/EN_100k.wv',
    'TASA_lsa': '/data/tasa/TASA.wv',
}
for model_name, model_path in model_paths.items():
    scorer.load_model(model_name, model_path)

In [6]:
# Score the same three candidate uses of a shoe under each loaded model.
# Each pair is (heading to print, name the model was registered under).
for heading, model_name in (("EN_100", 'EN_100_lsa'), ("TASA", 'TASA_lsa')):
    print(heading)
    for response in ('wear', 'planter', 'hit'):
        print(response, scorer.originality('shoe', response, model_name))

EN_100
wear 0.3179017901420593
planter 0.6076052784919739
hit 0.6233652234077454
TASA
wear 0.695365846157074
planter 0.8889747187495232
hit 0.9890701612457633


## Scoring at the Dataset Level

The WideData class takes a scorer and a model name, and adds a column of scores to its internal DataFrame (`data.df`):

In [11]:
# Score the responses in data.df with the 'EN_100_lsa' model. The result lands in a
# new column named after the model and the enabled options
# (EN_100_lsa_stop_idf in the output below).
# NOTE(review): stop/idf presumably toggle stopword removal and IDF weighting — confirm against AUT_Scorer.
data.score(scorer, 'EN_100_lsa', stop=True, idf=True)
# Unseeded random sample, so the rows shown change on every run.
# Some responses have no score (empty cell in the output) —
# presumably none of their words are in the model's vocabulary; TODO confirm.
data.df.sample(3)

Unnamed: 0,participant,prompt,response_num,response,EN_100_lsa_stop_idf
2012,81.0,rope,2,macrame,
7423,64.0,book,8,decor,0.82144
280,5.0,table,0,use as bed,0.422178


You can also have it automatically score all the models that the scorer has loaded.

In [12]:
# Score with every model loaded into the scorer (here EN_100_lsa and TASA_lsa).
# As the log below shows, a score column that already exists is recomputed and overwritten.
data.score_all(scorer)

Scoring EN_100_lsa
Column EN_100_lsa_stop_idf already exists. Re-crunching and re-writing.
Scoring TASA_lsa


In [13]:
# Show the long-format table, which now has one score column per loaded model.
# This displays the whole frame; the notebook truncates the middle rows.
data.df

Unnamed: 0,participant,prompt,response_num,response,EN_100_lsa_stop_idf,TASA_lsa_stop_idf
0,1.0,book,0,read,0.246462,0.727653
920,1.0,book,1,paperweight,0.774843,0.998924
1840,1.0,book,2,weapon,0.774123,0.950826
2760,1.0,book,3,to balance out a table,0.731356,0.971571
3680,1.0,book,4,as a foot rest,0.689794,1.005364
...,...,...,...,...,...,...
1655,92.0,tire,1,use as a garden bed for a plant,0.706723,0.997229
2575,92.0,tire,2,run with it for exercise,0.659096,0.775762
3495,92.0,tire,3,make a fire in the middle,0.619846,1.053179
4415,92.0,tire,4,pore concrete in middle to hold a basketball goal,0.669180,1.017493


## Export back to CSV

In [14]:
# Reshape the scores back to one row per participant, with one column per
# (score column, prompt) pair. Multiple responses to the same prompt are
# combined with the default aggregation, 'mean'.
temp = data.to_wide()
temp.head(2)

Unnamed: 0_level_0,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1.0,0.643316,0.689446,0.674672,0.567717,0.464501,0.454914,0.608675,0.577801,0.531981,0.540985,0.930867,0.999952,1.029613,0.99245,0.809373,0.91534,1.010622,0.98853,0.914342,0.923325
2.0,0.596396,0.638455,0.58952,0.656154,0.528627,0.565885,0.582964,0.546887,0.563058,0.555761,0.945535,0.927185,0.875027,0.996902,0.791361,0.876969,0.926443,0.861939,0.967638,0.861613


You can save this with `data.to_wide().to_csv('/path/to/file')`

You can also change how multiple responses to the same prompt are aggregated. The default is 'mean', but 'max', 'min', and custom functions can also be passed, e.g.:

In [15]:
# Same reshape as the default to_wide(), but each participant/prompt cell holds
# the highest score among that participant's responses instead of the mean.
# Note: this rebinds `temp`, replacing the mean-aggregated table from the previous cell.
temp = data.to_wide(aggfunc='max')
temp.head(2)

Unnamed: 0_level_0,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1.0,0.774843,0.727939,0.772552,0.663219,0.708636,0.651324,0.701918,0.692951,0.692537,0.661725,1.005364,1.032692,1.062845,1.037111,0.980114,1.007629,1.089868,1.011251,1.061688,1.030699
2.0,0.782883,0.763815,0.824,0.761792,0.706993,0.774488,0.754903,0.74788,0.669426,0.816525,1.022673,1.037401,1.029825,1.088801,0.92079,1.071285,1.015326,1.062023,1.066731,1.025842


## Fluency Data

In [16]:
# Long-format fluency table: one row per participant and prompt with a `count` column.
# NOTE(review): `count` is presumably the number of responses given to that prompt — confirm.
f = data.fluency()
# Unseeded random sample, so the rows shown change on every run.
f.sample(3)

Unnamed: 0,participant,prompt,count
51,6.0,bottle,4
431,44.0,tire,10
83,9.0,fork,5


In [17]:
# wide=True returns the same counts as a participant-by-prompt table
# (one row per participant, one column per prompt).
f = data.fluency(wide=True)
# Unseeded random sample, so the rows shown change on every run.
f.sample(2)

prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
79.0,2,2,0,2,2,2,0,0,2,0
38.0,9,8,7,8,6,6,7,7,7,6


## Elaboration Data

By default, elaboration is measured by splitting each response on whitespace and counting the resulting words.

In [18]:
# Long-format elaboration table with an `elaboration` value per participant and prompt row.
# NOTE(review): the index values in the output suggest one row per response — confirm.
# Note: this rebinds `f`, replacing the fluency table from the previous section.
f = data.elaboration()
# Unseeded random sample, so the rows shown change on every run.
f.sample(3)

Unnamed: 0,participant,prompt,elaboration
7522,71.0,rope,4
808,73.0,shovel,2
14399,48.0,brick,3


In [19]:
# wide=True returns elaboration as a participant-by-prompt table.
# The values are fractional, so multiple responses to a prompt are aggregated —
# presumably averaged; TODO confirm the default.
# Empty cells (NaN) presumably mean the participant gave no response to that prompt.
f = data.elaboration(wide=True)
# Unseeded random sample, so the rows shown change on every run.
f.sample(2)

Unnamed: 0_level_0,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
39.0,4.8,2.25,3.333333,7.4,4.333333,4.75,7.5,,5.666667,4.0
83.0,1.833333,1.666667,1.5,2.666667,2.5,,2.0,,1.5,1.666667
