In [1]:
import file
import scoring

In [2]:
import pandas as pd

Here's how the original and reshaped data look:

In [3]:
# Load the Alternate Uses spreadsheet through the `file` module's WideData wrapper.
data = file.WideData('Measurement study/Participant level data/AlternateUses.xls')
# Peek at one row of the original (pre-reshape) data: one column per prompt.
# `_original` is a private attribute. No random_state is passed, so the row shown
# presumably differs between runs.
data._original.sample(1)

Unnamed: 0,participant,book,rope,fork,table,pants,bottle,brick,tire,shovel,shoe
85,86.0,Door Stopper\nPaper weight\nWeapon\nStand\nHea...,Swing\nWeapon \nLasso\n,Utensil\nWeapon\n,Stand\nWeapon\nDoor stopper\nTo eat food on,Clothes\nBlinds\nComforters,Weapon\nPaper weight\nUrine holder\n,Door Stopper\nPaper Weight\nWeapon,Stand\nSwing\n,Weapon\ndigging utensil\npoop scoop\n,Door stopper\nSlipper\nThrowing device\nWeapon


In [4]:
# Two random rows of the reshaped frame: one row per participant / prompt / response.
data.df.sample(2)

Unnamed: 0,participant,prompt,response_num,response
2526,43.0,tire,2,hold down tarp
2272,65.0,pants,2,create art projects like a wreath


## Scoring

Scoring is done with the AUT_Scorer class.

In [5]:
# Create the scorer and register two models under the names used when scoring below.
scorer = scoring.AUT_Scorer()
# NOTE(review): the name 'EN_100_lsa' is bound to a GloVe word2vec file — confirm the label is intended.
# NOTE(review): both paths are absolute and machine-specific — adjust for your environment.
scorer.load_model('EN_100_lsa', '/data/glove/glove.840B-300d.word2vec.bin')
scorer.load_model('TASA_lsa', '/data/tasa/TASA.word2vec.bin')

In [6]:
# Compare the originality of three candidate uses for 'shoe' under each loaded model.
# Each pair is (heading to print, model name registered with the scorer).
for heading, model_name in (("EN_100", 'EN_100_lsa'), ("TASA", 'TASA_lsa')):
    print(heading)
    for response in ('wear', 'planter', 'hit'):
        print(response, scorer.originality('shoe', response, model_name))

EN_100
wear 0.4520634412765503
planter 0.7680058628320694
hit 0.8015732020139694
TASA
wear 0.695365846157074
planter 0.8889747262001038
hit 0.9890701556578279


## Scoring at the Dataset Level

The `WideData.score` method takes a scorer and a model name and adds a column of scores to the internal DataFrame (`data.df`):

In [7]:
# Score every response with the 'EN_100_lsa' model; per the output, the scores land in
# a column named 'EN_100_lsa_stop_idf'.
# NOTE(review): stop/idf presumably toggle stopword removal and IDF weighting — confirm in WideData.score.
data.score(scorer, 'EN_100_lsa', stop=True, idf=True)
# Spot-check three random scored rows.
data.df.sample(3)

Unnamed: 0,participant,prompt,response_num,response,EN_100_lsa_stop_idf
612,61.0,brick,0,garden bed edging,0.629958
3941,78.0,fork,4,stab vegetables,0.714109
2085,62.0,fork,2,picking stuff up off the floor,0.76569


You can also have it automatically score all the models that the scorer has loaded.

In [8]:
# Score with every model loaded into the scorer. As the printed log shows, a column that
# already exists (here EN_100_lsa_stop_idf) is recomputed and overwritten.
data.score_all(scorer)

Scoring EN_100_lsa
Column EN_100_lsa_stop_idf already exists. Re-crunching and re-writing.
Scoring TASA_lsa


In [9]:
# Show only the first ten rows; displaying the whole frame is unnecessary for a spot check
# of the two score columns.
data.df.head(10)

Unnamed: 0,participant,prompt,response_num,response,EN_100_lsa_stop_idf,TASA_lsa_stop_idf
0,1.0,book,0,read,0.344389,0.727653
920,1.0,book,1,paperweight,0.939968,0.998924
1840,1.0,book,2,weapon,0.876172,0.950826
2760,1.0,book,3,to balance out a table,0.782391,0.971571
3680,1.0,book,4,as a foot rest,0.786366,1.005364
460,1.0,bottle,0,planter,0.767967,1.003884
1380,1.0,bottle,1,rain catcher,0.813707,1.031110
2300,1.0,bottle,2,weapon,0.822796,1.032692
3220,1.0,bottle,3,target for shooting,0.870302,0.985451
4140,1.0,bottle,4,book end paperweight,0.751551,0.946625


## Export back to CSV

In [12]:
# Pivot the scored data back to one row per participant, with one column per
# (score column, prompt) pair. No aggfunc is passed, so the default aggregation applies.
temp = data.to_wide()
temp.head(2)

Unnamed: 0_level_0,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1.0,0.745857,0.805265,0.755974,0.805843,0.603382,0.602738,0.806814,0.743534,0.649084,0.721637,0.930867,0.999952,1.029613,0.99245,0.809373,0.91534,1.010622,0.98853,0.914342,0.923325
2.0,0.685645,0.741775,0.659536,0.806637,0.603331,0.648324,0.709257,0.672089,0.681022,0.682143,0.945535,0.927185,0.875027,0.996902,0.791361,0.876969,0.926443,0.861939,0.967638,0.861613


You can save this with `data.to_wide().to_csv('/path/to/file')`

You can also change how a participant's multiple responses to the same prompt are aggregated into a single score. The default is 'mean', but 'max', 'min', and custom functions can be passed via `aggfunc`, e.g.:

In [13]:
# Same wide export, but keep the highest score per participant/prompt instead of the default.
temp = data.to_wide(aggfunc='max')
temp.head(2)

Unnamed: 0_level_0,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,EN_100_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf,TASA_lsa_stop_idf
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1.0,0.939968,0.870302,0.853384,0.878103,0.772427,0.810896,0.902467,0.796727,0.820382,0.773925,1.005364,1.032692,1.062845,1.037111,0.980114,1.007629,1.089868,1.011251,1.061688,1.030699
2.0,0.839752,0.862154,0.921722,0.916474,0.757681,0.796459,0.892439,0.845521,0.811484,0.842676,1.022673,1.037401,1.029825,1.088801,0.92079,1.071286,1.015326,1.062023,1.066731,1.025842


## Fluency Data

In [15]:
# Long-format fluency: per the output, one `count` per participant/prompt pair.
# NOTE(review): presumably the number of responses given — confirm in WideData.fluency.
f = data.fluency()
f.sample(3)

Unnamed: 0,participant,prompt,count
637,65.0,rope,6
67,7.0,shovel,1
319,33.0,shoe,4


In [16]:
# Fluency counts pivoted wide: participants as rows, prompts as columns.
f = data.fluency(wide=True)
f.sample(2)

prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
83.0,6,3,2,3,2,0,1,0,4,3
37.0,14,7,7,8,8,13,7,6,5,7


## Elaboration Data

By default, elaboration is measured by splitting each response on whitespace and counting the resulting words.

In [14]:
# Long-format elaboration values, using the default measure (no arguments passed).
f = data.elaboration()
f.sample(3)

Unnamed: 0,participant,prompt,elaboration
7784,57.0,pants,5
3668,81.0,shoe,6
7400,41.0,book,2


In [15]:
# Wide elaboration table: participants as rows, prompts as columns.
# NOTE(review): the fractional values in the output suggest per-response values are
# averaged per participant/prompt — confirm in WideData.elaboration.
f = data.elaboration(wide=True)
f.sample(2)

Unnamed: 0_level_0,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration,elaboration
prompt,book,bottle,brick,fork,pants,rope,shoe,shovel,table,tire
participant,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
44.0,3.428571,3.333333,3.0,5.8,6.166667,2.571429,1.666667,2.25,4.2,1.9
6.0,1.2,2.25,2.333333,1.375,1.5,1.6,1.333333,2.4,2.4,1.0
